import cv2
import numpy as np
from ultralytics import YOLO
import json

IMAGE_PATH = 'data/test.jpg'
OUTPUT_METADATA = 'data/overlays_metadata.json'
OUTPUT_IMAGE = 'data/overlays_visualized.jpg'
OUTPUT_MASK = 'data/mask.png'

# Load the image
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")

# Get the image dimensions
h, w = image.shape[:2]

# Create an empty mask the same size as the image
mask = np.zeros((h, w), dtype=np.uint8)

# Load the YOLOv8 segmentation model
model = YOLO('model/yolo8_seg-speed-bubble.pt')

# Run detection
results = model(IMAGE_PATH)

overlays = []

for result in results:
    # Detected speech-bubble bounding boxes
    boxes = result.boxes
    for box in boxes:
        x1, y1, x2, y2 = [int(coordinate) for coordinate in box.xyxy[0]]
        conf = float(box.conf[0])

        # Save metadata for this detection
        overlays.append({
            "type": "textbox",
            "position": [x1, y1],
            "size": [x2 - x1, y2 - y1],
            "confidence": conf
        })

        # Draw the bounding box (BGR: blue)
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)

    # Merge segmentation masks into the combined mask
    if hasattr(result, 'masks') and result.masks is not None:
        for m in result.masks.data:
            m = m.cpu().numpy().astype(np.uint8) * 255
            # Resize the mask to the original image size
            m_resized = cv2.resize(m, (w, h), interpolation=cv2.INTER_NEAREST)
            mask = cv2.bitwise_or(mask, m_resized)

# Write the detection metadata
with open(OUTPUT_METADATA, "w") as f:
    json.dump(overlays, f, indent=2)

# Save the visualization and the combined mask
cv2.imwrite(OUTPUT_IMAGE, image)
cv2.imwrite(OUTPUT_MASK, mask)