# profive/detect_overlays.py
# 2025-07-23 00:09:02 +07:00
#
# 62 lines
# 1.6 KiB
# Python

import cv2
import numpy as np
from ultralytics import YOLO
import json
# Detect overlays (speech bubbles) in a single image with a YOLO segmentation
# model, then write three artifacts: a JSON list of the detected boxes, a copy
# of the image with the boxes drawn on it, and a binary mask of the segments.

# Input image and output locations.
IMAGE_PATH = 'data/test.jpg'
OUTPUT_METADATA = 'data/overlays_metadata.json'
OUTPUT_IMAGE = 'data/overlays_visualized.jpg'
OUTPUT_MASK = 'data/mask.png'

# Load the image. cv2.imread signals failure by returning None rather than
# raising, so the result has to be checked explicitly.
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Không tìm thấy ảnh: {IMAGE_PATH}")

# Image height and width, used to size the mask.
h, w = image.shape[:2]

# Single-channel mask, initially all zero; detected segments are OR-ed into it.
mask = np.zeros((h, w), dtype=np.uint8)

# Load the YOLO segmentation weights and run inference on the image file.
model = YOLO('model/yolo8_seg-speed-bubble.pt')
results = model(IMAGE_PATH)

overlays = []
for result in results:
    # One metadata entry and one drawn rectangle per detected box.
    boxes = result.boxes
    for box in boxes:
        x1, y1, x2, y2 = [int(coordinate) for coordinate in box.xyxy[0]]
        conf = float(box.conf[0])

        # Stored as top-left position plus width/height.
        overlays.append({
            "type": "textbox",
            "position": [x1, y1],
            "size": [x2 - x1, y2 - y1],
            "confidence": conf
        })

        # Draw the bounding box; OpenCV colors are BGR, so (255, 0, 0) is blue.
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)

    # Merge every instance mask of this result into the combined mask.
    if hasattr(result, 'masks') and result.masks is not None:
        for m in result.masks.data:
            # Convert to uint8 and scale by 255 so it can be OR-ed into `mask`.
            m = m.cpu().numpy().astype(np.uint8) * 255
            # Resize to the image size; nearest-neighbour keeps hard edges.
            # NOTE(review): this assumes masks.data covers the same field of
            # view as the original image (no letterbox padding) - confirm.
            m_resized = cv2.resize(m, (w, h), interpolation=cv2.INTER_NEAREST)
            mask = cv2.bitwise_or(mask, m_resized)

# Write the box metadata as JSON; the encoding is pinned so the output does
# not depend on the platform default.
with open(OUTPUT_METADATA, "w", encoding="utf-8") as f:
    json.dump(overlays, f, indent=2)

# cv2.imwrite reports failure (missing directory, unsupported extension, ...)
# through its boolean return value instead of raising, so check it explicitly
# rather than silently producing no output.
if not cv2.imwrite(OUTPUT_IMAGE, image):
    raise OSError(f"Failed to write image: {OUTPUT_IMAGE}")
if not cv2.imwrite(OUTPUT_MASK, mask):
    raise OSError(f"Failed to write mask: {OUTPUT_MASK}")