# profive/detect_overlays.py

import cv2
import numpy as np
from ultralytics import YOLO
import json

# Input picture that the detector is run on.
IMAGE_PATH = "data/test.jpg"

# Files produced by this script: the JSON list of detected overlays, the
# input picture with bounding boxes drawn on it, and the combined mask.
OUTPUT_METADATA = "data/overlays_metadata.json"
OUTPUT_IMAGE = "data/overlays_visualized.jpg"
OUTPUT_MASK = "data/mask.png"

# Read the input picture; a None result is treated as "file not found".
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Không tìm thấy ảnh: {IMAGE_PATH}")

# Height and width are the first two axes of the loaded array.
h, w = image.shape[0], image.shape[1]

# Single-channel, all-zero canvas with the same height and width as the
# picture; detected masks are OR-ed into it later in the script.
mask = np.zeros(image.shape[:2], dtype=np.uint8)

# Build the YOLOv8 model from its weights file.
weights_path = 'model/yolo8_seg-speed-bubble.pt'
model = YOLO(weights_path)

# Inference is given the file path, not the array loaded above.
results = model(IMAGE_PATH)

def _strip_letterbox(raw_mask, target_h, target_w):
    """Crop centred letterbox padding off a mask before it is resized.

    By default Ultralytics returns ``masks.data`` at the network's padded
    (letterboxed) input resolution rather than at the original image size
    (NOTE(review): confirm against the installed Ultralytics version).
    Resizing such a mask directly to the image size stretches the padding
    into the picture and shifts the mask slightly. The arithmetic mirrors
    ``ultralytics.utils.ops.scale_image``.

    Args:
        raw_mask: 2-D array holding one mask.
        target_h: Height of the original image in pixels.
        target_w: Width of the original image in pixels.

    Returns:
        The mask with padding rows/columns removed. The input is returned
        unchanged when it already has the target size or when the crop
        would be empty.
    """
    mask_h, mask_w = raw_mask.shape[:2]
    if (mask_h, mask_w) == (target_h, target_w):
        return raw_mask

    # Scale factor that was applied to the image to fit it inside the mask.
    gain = min(mask_h / target_h, mask_w / target_w)
    pad_h = (mask_h - target_h * gain) / 2
    pad_w = (mask_w - target_w * gain) / 2

    # The +/-0.1 rounding matches how the letterbox splits an odd padding
    # amount between the two sides.
    top = max(int(round(pad_h - 0.1)), 0)
    left = max(int(round(pad_w - 0.1)), 0)
    bottom = mask_h - max(int(round(pad_h + 0.1)), 0)
    right = mask_w - max(int(round(pad_w + 0.1)), 0)

    cropped = raw_mask[top:bottom, left:right]
    if cropped.size == 0:
        return raw_mask
    return np.ascontiguousarray(cropped)


# One metadata entry per detected box, in detection order.
overlays = []

for result in results:
    # Detected speech bubbles: record each box and draw it on the image.
    for box in result.boxes:
        x1, y1, x2, y2 = [int(coordinate) for coordinate in box.xyxy[0]]
        conf = float(box.conf[0])
        # Position is the top-left corner; size is width and height.
        overlays.append({
            "type": "textbox",
            "position": [x1, y1],
            "size": [x2 - x1, y2 - y1],
            "confidence": conf
        })
        # draw bounding box (BGR: 255,0,0)
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)

    # Results without segmentation output contribute nothing to the mask.
    if getattr(result, 'masks', None) is None:
        continue

    # Merge every instance mask into the single image-sized mask.
    for m in result.masks.data:
        m = m.cpu().numpy().astype(np.uint8) * 255
        # Drop letterbox padding so the resize maps mask content onto the
        # image without distortion.
        m = _strip_letterbox(m, h, w)
        # Nearest-neighbour keeps the mask strictly 0/255.
        m_resized = cv2.resize(m, (w, h), interpolation=cv2.INTER_NEAREST)
        mask = cv2.bitwise_or(mask, m_resized)


# Write the detection metadata as JSON; the explicit encoding keeps the
# output independent of the platform's default.
with open(OUTPUT_METADATA, "w", encoding="utf-8") as f:
    json.dump(overlays, f, indent=2)

# cv2.imwrite reports failure through a False return value, so check it.
# Both writes are attempted before any failure is raised.
failed_outputs = []
for out_path, out_array in ((OUTPUT_IMAGE, image), (OUTPUT_MASK, mask)):
    if not cv2.imwrite(out_path, out_array):
        failed_outputs.append(out_path)

if failed_outputs:
    raise OSError(f"Failed to write image file(s): {', '.join(failed_outputs)}")