first commit
commit 9b12364aed
10  .gitignore  vendored  Normal file
@@ -0,0 +1,10 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv
1  .python-version  Normal file
@@ -0,0 +1 @@
3.13
69  .specstory/.what-is-this.md  Normal file
@@ -0,0 +1,69 @@
# SpecStory Artifacts Directory

This directory is automatically created and maintained by the SpecStory extension to preserve your AI chat history.

## What's Here?

- `.specstory/history`: Contains auto-saved markdown files of your AI coding sessions
    - Each file represents a separate AI chat session
    - If you enable auto-save, files are automatically updated as you work
    - You can enable/disable the auto-save feature in the SpecStory settings; it is disabled by default
- `.specstory/.project.json`: Contains the persistent project identity for the current workspace
    - This file is only present if you enable AI rules derivation
    - This is used to provide a consistent identity for your project, even as the workspace is moved or renamed
- `.specstory/ai_rules_backups`: Contains backups of the `.cursor/rules/derived-cursor-rules.mdc` or the `.github/copilot-instructions.md` file
    - Backups are automatically created each time the `.cursor/rules/derived-cursor-rules.mdc` or the `.github/copilot-instructions.md` file is updated
    - You can enable/disable the AI rules derivation feature in the SpecStory settings; it is disabled by default
- `.specstory/.gitignore`: Contains directives to exclude non-essential contents of the `.specstory` directory from version control
    - Add `/history` to exclude the auto-saved chat history from version control

## Valuable Uses

- Capture: Keep your context window up-to-date when starting new Chat/Composer sessions via @ references
- Search: Find previous prompts and code snippets
- Learn: Meta-analyze your patterns and learn from your past experiences
- Derive: Keep the AI on course with your past decisions by automatically deriving rules from your AI interactions

## Version Control

We recommend keeping this directory under version control to maintain a history of your AI interactions. However, if you prefer not to version these files, you can exclude them by adding this to your `.gitignore`:

```
.specstory/**
```

We recommend __not__ keeping the `.specstory/ai_rules_backups` directory under version control if you are already using git to version your AI rules and committing regularly. You can exclude it by adding this to your `.gitignore`:

```
.specstory/ai_rules_backups
```

## Searching Your Codebase

When searching your codebase, search results may include your previous AI coding interactions. To focus solely on your actual code files, you can exclude the AI interaction history from search results.

To exclude AI interaction history:

1. Open the "Find in Files" search in Cursor or VSCode (Cmd/Ctrl + Shift + F)
2. Navigate to the "files to exclude" section
3. Add the following pattern:

```
.specstory/*
```

This will ensure your searches only return results from your working codebase files.

## Notes

- Auto-save only works when Cursor or VSCode flushes sqlite database data to disk. This results in a small delay after the AI response is complete before SpecStory can save the history.

## Settings

You can control auto-saving behavior in Cursor or VSCode:

1. Open Cursor/Code → Settings → VS Code Settings (Cmd/Ctrl + ,)
2. Search for "SpecStory"
3. Find the "Auto Save" setting to enable/disable it

Auto-save occurs when changes are detected in the sqlite database, or every 2 minutes as a safety net.
File diff suppressed because it is too large
@@ -0,0 +1,83 @@
<!-- Generated by SpecStory -->

# Creating smooth-edged speech bubble layers (2025-07-22 14:21Z)

_**User**_

I have created a mask that highlights only the white speech bubbles, but the bubble edges are still rough. Is there a way, starting from this mask file, to generate the set of speech bubble layers (with smoothed edges)?

---

_**Assistant**_

You want to take the existing mask file (mask.png), split it into separate layers (one per speech bubble), and smooth the contour of each bubble at the same time. This is a fairly common image-processing problem that can be solved with OpenCV in the following steps:

### 1. Read the mask and preprocess
- Read the mask image (mask.png) and convert it to binary (if it is not already).
- Smooth the edges with morphological operations (opening, closing) or a Gaussian blur followed by re-thresholding.

### 2. Find connected regions (connected components)
- Use `cv2.connectedComponents` or `cv2.findContours` to find each region (each region is one speech bubble).

### 3. Smooth the contour of each region
- For each contour, use `cv2.approxPolyDP` to smooth it, or apply `cv2.GaussianBlur`/`cv2.morphologyEx` before finding contours.

### 4. Create a layer for each bubble
- For each contour, create a separate mask (layer) that contains only that region.

### 5. Save each layer to a file (or return a list of numpy arrays)

---

## Example code (Python, OpenCV)

```python
import cv2
import numpy as np
import os

# Read the mask
mask = cv2.imread('data/mask.png', cv2.IMREAD_GRAYSCALE)

# Smooth the edges of the whole mask
blurred = cv2.GaussianBlur(mask, (7, 7), 0)
_, binary = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY)

# Find connected regions (contours)
contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

layers = []
output_dir = 'data/speech_bubble_layers'
os.makedirs(output_dir, exist_ok=True)

for i, cnt in enumerate(contours):
    # Smooth the contour
    epsilon = 0.01 * cv2.arcLength(cnt, True)
    smoothed_cnt = cv2.approxPolyDP(cnt, epsilon, True)

    # Create a mask for this bubble
    layer = np.zeros_like(mask)
    cv2.drawContours(layer, [smoothed_cnt], -1, 255, -1)
    layers.append(layer)

    # Save to file
    cv2.imwrite(f'{output_dir}/bubble_{i+1}.png', layer)

print(f'Extracted {len(layers)} speech bubble layers, saved to {output_dir}')
```

---

## Explanation:
- **Edge smoothing:** GaussianBlur + re-thresholding removes small noise, so the edges come out smoother. `cv2.approxPolyDP` smooths the contour.
- **Layer extraction:** Each contour is one bubble, drawn onto a new black-and-white mask.
- **File output:** Each layer is a separate PNG file.
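
As an alternative to GaussianBlur, the morphological smoothing mentioned in steps 1 and 3 would look like this (a minimal sketch; the (5, 5) elliptical kernel size is an assumption to tune per image):

```python
import cv2

mask = cv2.imread('data/mask.png', cv2.IMREAD_GRAYSCALE)

# Elliptical kernel; (5, 5) is a starting size, not a tuned value
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))

# Opening removes small specks; closing fills small gaps along the edges
opened = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
smoothed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, kernel)

_, binary = cv2.threshold(smoothed, 127, 255, cv2.THRESH_BINARY)
```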

---

If you want the output as numpy arrays, or overlaid on the original image, the script can be extended.

Would you like me to prepare a complete ready-to-run script for you? Or would you like output in a format other than PNG?

---
64  README.md  Normal file
@@ -0,0 +1,64 @@
# Recommended Tech Stack for Apple M4 (ARM) + 16GB RAM with Python & uv

## 1. Hardware-Specific Considerations

- **Apple M4 (ARM, Apple Silicon):** Prioritize native ARM support and pre-built wheels for Python packages.
- **16GB RAM:** Comfortable for moderate-sized AI inference (text/image) and routine vision tasks, but be mindful with large downloadable models.

---

## 2. Python Environment Setup

- **Continue using uv** for environment and dependency management, for isolation and reproducible builds.
- **Python Version:** 3.11 or higher, with a native ARM build (recommended: managed via pyenv or uv's Python installer).

---

## 3. Core Packages (Optimized for Apple Silicon)

| Function                | Recommended Packages                                   | Notes & Tips                                                              |
|-------------------------|--------------------------------------------------------|---------------------------------------------------------------------------|
| Image Processing        | pillow, opencv-python                                  | These support ARM64; avoid opencv-contrib-python (heavier) unless needed.  |
| OCR                     | easyocr                                                | PyTesseract is possible, but EasyOCR runs better on CPU and ARM.           |
| AI Inpainting           | diffusers, torch (Apple Silicon support), transformers | Use PyTorch 2.1+ with Metal support for Apple Silicon.                     |
| GUI / Interactive Layer | streamlit, gradio, dearpygui, tkinter                  | All run natively on macOS; choose per GUI style.                           |
| Numeric, utility        | numpy, requests                                        | Well-supported, native builds.                                             |

---

## 4. Special Notes for Apple M4 (ARM, Apple Silicon)

- **PyTorch:** As of 2025, install via pip pre-built wheels for Apple Silicon, which leverage Apple's Metal backend for GPU acceleration.
    - Example: `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/metal.html`
- **diffusers:** Works well on CPU, but with torch+Metal, certain inpainting models will benefit from hardware acceleration.
- **opencv-python:** Install via pip (`uv add opencv-python`); all core features work on ARM/macOS.
- **easyocr:** Uses PyTorch, so it installs and runs fine on Apple Silicon.
- **No CUDA:** Apple M4 does not support CUDA; all GPU acceleration uses Metal, not NVIDIA CUDA.
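
To confirm that the installed torch build can actually use Metal before running any inpainting, a quick check like this helps (a minimal sketch; `mps` is PyTorch's standard name for the Metal backend):

```python
import torch

# MPS is PyTorch's Metal backend on Apple Silicon
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Metal (MPS) acceleration")
else:
    device = torch.device("cpu")
    print("Falling back to CPU")

x = torch.rand(3, 3, device=device)  # small sanity-check tensor
print(x.device)
```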
---

## 5. uv Commands Example

`uv add pillow opencv-python easyocr diffusers torch torchvision transformers streamlit gradio dearpygui numpy requests`

> If you want a lighter GUI, you can skip DearPyGui and use only streamlit or gradio.

---

## 6. Tesseract OCR (Optional)

If using **pytesseract**:
- You must install the Tesseract binary for macOS first (e.g., via Homebrew: `brew install tesseract`).
- Not as highly optimized as EasyOCR for Apple Silicon; use it for simple OCR needs.
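
If you do go the pytesseract route, usage is minimal once the binary is installed (a sketch; assumes the Tesseract binary is on PATH and a test image exists at `data/test.jpg`):

```python
import pytesseract
from PIL import Image

# Requires the Tesseract binary (brew install tesseract) to be installed
text = pytesseract.image_to_string(Image.open("data/test.jpg"))
print(text)
```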
---

## 7. Additional Optimization Tips

- **Large Models:** Download models (e.g., Stable Diffusion weights) with care; 16GB RAM is sufficient for single-image inference, but avoid very large batch inference.
- **Environment Isolation:** Manage the project strictly in your uv environment to prevent C dependency conflicts.
- **Metal Acceleration:** Make sure torch and all AI dependencies are ARM64 builds; avoid x86-64 wheels/emulation.

---

**Summary:**
This tech stack is optimized for Apple M4 and macOS, ensures all AI/vision code runs natively, and lets you use uv for clean, reproducible Python development.
For best performance and compatibility, use EasyOCR over PyTesseract and leverage PyTorch's Metal backend for AI computations.
75  WORKFLOW.md  Normal file
@@ -0,0 +1,75 @@
# Step-by-Step Document: Achieving Seamless Textbox Movement with Inpainted Background

## Overview

This guide details the five essential steps for replicating Prodifi-style smooth textbox movement, where textboxes or shape overlays can be freely moved and the underlying image always appears natural, even if the original was flattened. Each step is modular and can be developed and tested independently.

---

## Step 1: Detect and Extract Overlay Objects

- **Goal:** Identify all textboxes, balloon shapes, and overlay elements in the input image.
- **Actions:**
    - Use OCR (EasyOCR, PyTesseract) to detect text regions and their bounding boxes (see the sketch after this list).
    - Use shape detection (e.g., via OpenCV) to localize speech balloons or decorative overlays.
    - Save metadata (position, size, text content, shape type) for each detected overlay.
- **Result:** A list of overlay objects (textboxes, shapes), each with precise position and attributes.
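
A minimal sketch of the OCR half of this step (the language list and image path are assumptions; the shape-detection half can reuse OpenCV contours as in the other scripts in this repo):

```python
import easyocr

# readtext returns (bounding_box, text, confidence) tuples;
# bounding_box is four (x, y) corner points
reader = easyocr.Reader(['en'])  # language list is an assumption
detections = reader.readtext('data/test.jpg')

overlays = []
for bbox, text, conf in detections:
    xs = [int(p[0]) for p in bbox]
    ys = [int(p[1]) for p in bbox]
    overlays.append({
        "type": "textbox",
        "position": [min(xs), min(ys)],
        "size": [max(xs) - min(xs), max(ys) - min(ys)],
        "text": text,
        "confidence": float(conf),
    })
```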
---

## Step 2: Generate a Clean Background Image

- **Goal:** Remove all detected overlays from the original image to produce a natural-looking, text-free background.
- **Actions:**
    - Create a composite mask covering all overlay object regions.
    - Apply an AI inpainting model (e.g., Stable Diffusion Inpainting, LaMa) using this mask on the original image (a sketch follows this list).
    - Fine-tune mask boundaries for the best inpainting results.
- **Result:** A "clean" background image, as if the textboxes/shapes never existed.
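
A minimal sketch of the inpainting call with diffusers (the checkpoint name, prompt, and 512×512 resize are assumptions; LaMa would require a different library):

```python
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image

# Checkpoint name is an assumption; any SD inpainting checkpoint works alike
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting"
)
pipe = pipe.to("mps" if torch.backends.mps.is_available() else "cpu")

image = Image.open("data/test.jpg").convert("RGB").resize((512, 512))
mask = Image.open("data/mask.png").convert("RGB").resize((512, 512))

# White mask pixels are repainted; the prompt steers the fill
clean = pipe(prompt="clean comic panel background",
             image=image, mask_image=mask).images[0]
clean.save("data/clean_image.png")
```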
---

## Step 3: Manage Overlays as Independent Objects

- **Goal:** Store all extracted overlays as standalone interactive objects.
- **Actions:**
    - For each overlay, maintain its data: position, size, text, font/style, visibility, z-order, etc. (a sketch of one such structure follows this list).
    - Ensure overlays are rendered above the clean background.
    - Allow overlays to be moved, resized, and modified in memory or via a GUI.
- **Result:** True multi-layer editing; overlays behave like stickers or Photoshop layers.
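
One way to hold this per-overlay state (a sketch; the field names mirror data/overlays_metadata.json plus the attributes listed above):

```python
from dataclasses import dataclass

@dataclass
class Overlay:
    """One movable overlay object; the exact fields are illustrative."""
    kind: str                  # e.g. "textbox" or "balloon"
    position: tuple[int, int]  # top-left (x, y)
    size: tuple[int, int]      # (width, height)
    text: str = ""
    visible: bool = True
    z_order: int = 0

    def move_to(self, x: int, y: int) -> None:
        # Moving is just a property update; nothing is repainted
        self.position = (x, y)
```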
---

## Step 4: Real-Time Interactive Editing

- **Goal:** Provide a user interface for moving, resizing, and editing overlays smoothly.
- **Actions:**
    - Use a GUI toolkit (DearPyGui, PyQt, Streamlit, Gradio, etc.) to display the image and overlay objects.
    - Allow users to drag, resize, and edit overlays in real time.
    - Moving an overlay simply updates its properties; the clean background naturally appears beneath it (see the compositing sketch after this list).
    - No AI inpainting is required during live interactions.
- **Result:** A fast, intuitive, and non-destructive user experience where any area revealed after moving overlays always looks natural.
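
The redraw at the heart of this step can stay this simple (a sketch; `patch` is a hypothetical pre-cut RGBA crop attached to each Overlay from Step 3):

```python
from PIL import Image

def render(background_path: str, overlays: list) -> Image.Image:
    """Composite overlay patches onto the clean background."""
    canvas = Image.open(background_path).convert("RGBA")
    for ov in sorted(overlays, key=lambda o: o.z_order):
        if ov.visible:
            # Alpha-aware paste: the patch's own transparency is the mask
            canvas.paste(ov.patch, ov.position, ov.patch)
    return canvas
```

Because moving an overlay only changes its `position`, each frame is just a fresh composite; no inpainting runs in the loop.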
---

## Step 5: Commit Changes as Needed

- **Goal:** Permanently merge or remove overlays from the project when required.
- **Actions:**
    - On user command (e.g., delete/flatten overlay), generate a precise mask for the affected region (a sketch follows this list).
    - Apply AI inpainting to fill the area on the clean background image if needed.
    - Update the background only for committed destructive changes; retain reversibility for overlays.
- **Result:** The background remains artifact-free after overlay deletions or merges, and the project state stays consistent and professional.
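
Generating that mask can start from the overlay's stored geometry (a sketch reusing the Overlay fields from Step 3; the 4-pixel padding margin is an assumption to tune):

```python
import cv2
import numpy as np

def region_mask(overlay, image_shape, margin: int = 4) -> np.ndarray:
    """White-on-black mask covering one overlay's footprint."""
    h, w = image_shape[:2]
    mask = np.zeros((h, w), dtype=np.uint8)
    x, y = overlay.position
    bw, bh = overlay.size
    # Pad slightly so the inpainting blends past the overlay's edge
    cv2.rectangle(mask,
                  (max(x - margin, 0), max(y - margin, 0)),
                  (min(x + bw + margin, w - 1), min(y + bh + margin, h - 1)),
                  255, -1)
    return mask
```

The result feeds the same inpainting call as Step 2.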
---

## Summary Table

| Step | Purpose                     | Core Tools/Actions               | Outcome                                             |
|------|-----------------------------|----------------------------------|-----------------------------------------------------|
| 1    | Detect overlays             | OCR, shape detection             | Overlay object list w/ metadata                     |
| 2    | Clean background image      | AI inpainting, mask creation     | Text- and shape-free background image               |
| 3    | Manage overlay objects      | Data structure for overlays      | Interactive, independent overlay layers             |
| 4    | Real-time editing           | GUI toolkit, canvas              | Live, smooth movement and modification of overlays  |
| 5    | Commit destructive changes  | Mask, AI inpainting (if needed)  | Background updates only on command/merge/delete     |

---
BIN  data/clean_image.png  Normal file
Binary file not shown. After: Size: 452 KiB
BIN  data/mask.png  Normal file
Binary file not shown. After: Size: 2.8 KiB
74  data/overlays_metadata.json  Normal file
@@ -0,0 +1,74 @@
[
  {
    "type": "textbox",
    "position": [
      100,
      5
    ],
    "size": [
      145,
      104
    ],
    "confidence": 0.9653024673461914
  },
  {
    "type": "textbox",
    "position": [
      62,
      343
    ],
    "size": [
      129,
      97
    ],
    "confidence": 0.9590029716491699
  },
  {
    "type": "textbox",
    "position": [
      434,
      8
    ],
    "size": [
      169,
      86
    ],
    "confidence": 0.9560245871543884
  },
  {
    "type": "textbox",
    "position": [
      393,
      323
    ],
    "size": [
      212,
      98
    ],
    "confidence": 0.8496555685997009
  },
  {
    "type": "textbox",
    "position": [
      391,
      322
    ],
    "size": [
      231,
      158
    ],
    "confidence": 0.8047636151313782
  },
  {
    "type": "textbox",
    "position": [
      467,
      323
    ],
    "size": [
      165,
      157
    ],
    "confidence": 0.4736592769622803
  }
]
BIN  data/overlays_visualized.jpg  Normal file
Binary file not shown. After: Size: 118 KiB
BIN  data/test.jpg  Normal file
Binary file not shown. After: Size: 56 KiB
61  detect_overlays.py  Normal file
@@ -0,0 +1,61 @@
import cv2
import numpy as np
from ultralytics import YOLO
import json

IMAGE_PATH = 'data/test.jpg'
OUTPUT_METADATA = 'data/overlays_metadata.json'
OUTPUT_IMAGE = 'data/overlays_visualized.jpg'
OUTPUT_MASK = 'data/mask.png'

# Load image
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")

# Get the size of the image
h, w = image.shape[:2]

# Create an empty mask
mask = np.zeros((h, w), dtype=np.uint8)

# Load YOLOv8 segmentation model
model = YOLO('model/yolo8_seg-speed-bubble.pt')

# Run detection
results = model(IMAGE_PATH)

overlays = []

for result in results:
    # Detected speech bubbles (bounding boxes)
    boxes = result.boxes
    for box in boxes:
        x1, y1, x2, y2 = [int(coordinate) for coordinate in box.xyxy[0]]
        conf = float(box.conf[0])
        # Save metadata
        overlays.append({
            "type": "textbox",
            "position": [x1, y1],
            "size": [x2 - x1, y2 - y1],
            "confidence": conf
        })
        # Draw bounding box (BGR: 255, 0, 0)
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)

    # Build the combined segmentation mask
    if hasattr(result, 'masks') and result.masks is not None:
        for m in result.masks.data:
            m = m.cpu().numpy().astype(np.uint8) * 255
            # Resize mask to image size
            m_resized = cv2.resize(m, (w, h), interpolation=cv2.INTER_NEAREST)
            mask = cv2.bitwise_or(mask, m_resized)

# Write metadata
with open(OUTPUT_METADATA, "w") as f:
    json.dump(overlays, f, indent=2)

# Save visualization and mask
cv2.imwrite(OUTPUT_IMAGE, image)
cv2.imwrite(OUTPUT_MASK, mask)
6  main.py  Normal file
@@ -0,0 +1,6 @@
def main():
    print("Hello from profive!")


if __name__ == "__main__":
    main()
BIN  model/best.pt  Normal file
Binary file not shown.
BIN  model/last.pt  Normal file
Binary file not shown.
BIN  model/model.pt  Normal file
Binary file not shown.
BIN  model/yolo8_seg-speed-bubble.pt  Normal file
Binary file not shown.
25  pyproject.toml  Normal file
@@ -0,0 +1,25 @@
[project]
name = "profive"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "accelerate>=1.9.0",
    "diffusers>=0.34.0",
    "easyocr>=1.7.2",
    "gradio>=5.38.0",
    "numpy>=2.3.1",
    "opencv-python>=4.11.0.86",
    "pillow>=11.3.0",
    "pyaml>=25.7.0",
    "requests>=2.32.4",
    "scikit-image>=0.25.2",
    "scikit-learn>=1.7.1",
    "scipy>=1.16.0",
    "streamlit>=1.47.0",
    "torch>=2.7.1",
    "torchvision>=0.22.1",
    "transformers>=4.53.2",
    "ultralytics>=8.3.169",
]
24  test.py  Normal file
@@ -0,0 +1,24 @@
@ -0,0 +1,24 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from ultralytics import YOLO
|
||||
|
||||
IMAGE_PATH = 'data/test.jpg'
|
||||
OUTPUT_MASK = 'data/bubble_mask.png'
|
||||
|
||||
image = cv2.imread(IMAGE_PATH)
|
||||
h, w = image.shape[:2]
|
||||
mask = np.zeros((h, w), dtype=np.uint8)
|
||||
|
||||
model = YOLO('model/yolo8_seg-speed-bubble.pt')
|
||||
results = model(IMAGE_PATH)
|
||||
|
||||
for result in results:
|
||||
if hasattr(result, 'masks') and result.masks is not None:
|
||||
for m in result.masks.data:
|
||||
m = m.cpu().numpy().astype(np.uint8) * 255
|
||||
# resize mask to image size
|
||||
m_resized = cv2.resize(m, (w, h), interpolation=cv2.INTER_NEAREST)
|
||||
mask = cv2.bitwise_or(mask, m_resized)
|
||||
|
||||
cv2.imwrite(OUTPUT_MASK, mask)
|
||||
print(f"Đã lưu mask tại {OUTPUT_MASK}")
|
||||