Merge pull request 'dev/viettran' (#2) from dev/viettran into main

Reviewed-on: #2
2025-08-21 08:48:57 +00:00 · 2025-08-21 08:48:57 +00:00 · c946d3c871
commit c946d3c871
parent e5771c6e8a f561d702d1
6 changed files with 359 additions and 169 deletions
--- a/data/Compare
+++ b/data/Compare
--- a/data_comparator.py
+++ b/data_comparator.py
@ -3,7 +3,25 @@ import numpy as np
 from typing import Dict, List, Tuple, Any
 from dataclasses import dataclass
-@dataclass
def normalize_episode(episode: str) -> str:
    """Normalize an episode label so that '54' and '54.0' compare equal.

    Only plain decimal numbers (ASCII digits with an optional sign and
    decimal point) are rewritten: whole values lose their fractional part
    ('54.0' -> '54') and other values use Python's float text
    ('54.50' -> '54.5').  Every other label is returned with surrounding
    whitespace stripped and is otherwise left untouched.

    Args:
        episode: Episode label.  Non-string values are converted with
            ``str()`` before normalization instead of raising.

    Returns:
        The normalized label.  ``None`` and empty / whitespace-only strings
        are returned unchanged.
    """
    if episode is None:
        return episode
    text = episode if isinstance(episode, str) else str(episode)
    text = text.strip()
    if not text:
        return episode
    # float() understands much more than plain decimals: "53_1" parses as
    # 531.0, "1e3" as 1000.0, and "inf" / "nan" are valid too.  Treating
    # those as numbers would silently rewrite (and possibly merge) distinct
    # labels, so only attempt the conversion for plain decimal text.
    if not set(text) <= set("0123456789.+-"):
        return text
    try:
        number = float(text)
    except ValueError:
        # Right characters, wrong shape (e.g. "53-1" or "1.2.3").
        return text
    if number.is_integer():
        return str(int(number))
    if number in (float("inf"), float("-inf")):
        # An absurdly long digit string overflows to infinity; keep the text.
        return text
    return str(number)
 class ComparisonItem:
    """Represents a single item for comparison"""
    title: str
@ -11,6 +29,12 @@ class ComparisonItem:
    source_sheet: str
    row_index: int
    def __init__(self, title: str, episode: str, source_sheet: str, row_index: int):
        self.title = title
        self.episode = normalize_episode(episode)  # Normalize episode on creation
        self.source_sheet = source_sheet
        self.row_index = row_index
    def __hash__(self):
        return hash((self.title, self.episode))
@ -167,15 +191,23 @@ class KSTCoordiComparator:
        kst_all_items = sheet_data['kst_all_items']
        coordi_all_items = sheet_data['coordi_all_items']
-        # Find overlaps and differences
+        # Find duplicates within each dataset first
        matched_items = kst_items.intersection(coordi_items)
        kst_only_items = kst_items - coordi_items
        coordi_only_items = coordi_items - kst_items
        # Find duplicates within each dataset - FIXED LOGIC
        kst_duplicates = self._find_duplicates_in_list(kst_all_items)
        coordi_duplicates = self._find_duplicates_in_list(coordi_all_items)
        # Create sets of items that have duplicates (to exclude from "only" lists)
        kst_duplicate_keys = {(item.title, item.episode) for item in kst_duplicates}
        coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_duplicates}
        # Find overlaps and differences - exclude items that have duplicates
        matched_items = kst_items.intersection(coordi_items)
        # For "only" items: exclude those that have duplicates within their own dataset
        kst_only_items = {item for item in kst_items - coordi_items 
                         if (item.title, item.episode) not in kst_duplicate_keys}
        coordi_only_items = {item for item in coordi_items - kst_items 
                            if (item.title, item.episode) not in coordi_duplicate_keys}
        categorization = {
            'matched_items': list(matched_items),
            'kst_only_items': list(kst_only_items),
@ -245,10 +277,16 @@ class KSTCoordiComparator:
        kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates}
        coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates}
        # Count actual instances for each item
        from collections import Counter
        kst_counts = Counter((item.title, item.episode) for item in kst_sheet_items)
        coordi_counts = Counter((item.title, item.episode) for item in coordi_sheet_items)
        # Find matched items that also have duplicates within the same sheet
        for title, episode in matched_in_sheet:
            # Check if this matched item has duplicates in KST within this sheet
            if (title, episode) in kst_duplicate_keys:
                kst_count = kst_counts[(title, episode)]
                mixed_duplicates.append({
                    'title': title,
                    'episode': episode,
@ -256,11 +294,13 @@ class KSTCoordiComparator:
                    'row_index': None,  # Could get from items if needed
                    'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}',
                    'mismatch_type': 'MIXED_DUPLICATE_KST',
-                    'duplicate_side': 'KST'
+                    'duplicate_side': 'KST',
                    'duplicate_count': kst_count
                })
            # Check if this matched item has duplicates in Coordi within this sheet
            if (title, episode) in coordi_duplicate_keys:
                coordi_count = coordi_counts[(title, episode)]
                mixed_duplicates.append({
                    'title': title,
                    'episode': episode,
@ -268,7 +308,8 @@ class KSTCoordiComparator:
                    'row_index': None,  # Could get from items if needed
                    'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}',
                    'mismatch_type': 'MIXED_DUPLICATE_COORDI',
-                    'duplicate_side': 'COORDI'
+                    'duplicate_side': 'COORDI',
                    'duplicate_count': coordi_count
                })
        return mixed_duplicates
@ -340,7 +381,7 @@ class KSTCoordiComparator:
        return mismatch_details
-    def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]:
+    def get_comparison_summary(self, sheet_filter: str | None = None) -> Dict[str, Any]:
        """Get a comprehensive summary of the comparison for a specific sheet only"""
        # Get sheet names for filtering options
        sheet_names = list(self.data.keys()) if self.data else []
@ -467,9 +508,146 @@ class KSTCoordiComparator:
        return grouped
    def generate_visualize_data(self, sheet_filter: str | None = None) -> List[Dict[str, Any]]:
        """Generate data structure for Excel-like visualization"""
        # Get comparison data for the specified sheet
        summary = self.get_comparison_summary(sheet_filter)
        mismatch_details = summary['mismatch_details']
        visualize_rows = []
        # Helper function to create a row
        def create_row(coordi_title="", coordi_chapter="", kst_title="", kst_chapter="", 
                      row_type="matched", reason="", title_for_sort=""):
            return {
                'coordi_title': coordi_title,
                'coordi_chapter': coordi_chapter,
                'kst_title': kst_title,
                'kst_chapter': kst_chapter,
                'row_type': row_type,
                'reason': reason,
                'title_for_sort': title_for_sort or coordi_title or kst_title,
                'priority': 1 if row_type != 'matched' else 2  # Mismatches first
            }
        # 1. Handle Coordi-only items
        for item in mismatch_details['coordi_only']:
            visualize_rows.append(create_row(
                coordi_title=item['title'],
                coordi_chapter=item['episode'],
                row_type='coordi_only',
                reason='Only in Coordi'
            ))
        # 2. Handle KST-only items  
        for item in mismatch_details['kst_only']:
            visualize_rows.append(create_row(
                kst_title=item['title'],
                kst_chapter=item['episode'],
                row_type='kst_only',
                reason='Only in KST'
            ))
        # 3. Handle Mixed duplicates (exists in both but duplicated on one side)
        mixed_items = {}  # Group by title+episode
        for item in mismatch_details['mixed_duplicates']:
            key = f"{item['title']}_{item['episode']}"
            if key not in mixed_items:
                mixed_items[key] = {
                    'title': item['title'],
                    'episode': item['episode'],
                    'kst_duplicate_count': 0,
                    'coordi_duplicate_count': 0
                }
            # Count the actual duplicates for each side
            if item['duplicate_side'] == 'KST':
                mixed_items[key]['kst_duplicate_count'] = item.get('duplicate_count', 1)
            elif item['duplicate_side'] == 'COORDI':
                mixed_items[key]['coordi_duplicate_count'] = item.get('duplicate_count', 1)
        for key, item in mixed_items.items():
            # First row: show it exists in both
            visualize_rows.append(create_row(
                coordi_title=item['title'],
                coordi_chapter=item['episode'],
                kst_title=item['title'],
                kst_chapter=item['episode'],
                row_type='mixed_duplicate',
                reason='Mixed duplicate'
            ))
            # Additional rows for KST duplicates (count - 1 since first is already shown)
            for i in range(max(0, item['kst_duplicate_count'] - 1)):
                visualize_rows.append(create_row(
                    kst_title=item['title'],
                    kst_chapter=item['episode'],
                    row_type='mixed_duplicate',
                    reason='Duplicate in KST',
                    title_for_sort=item['title']
                ))
            # Additional rows for Coordi duplicates (count - 1 since first is already shown)
            for i in range(max(0, item['coordi_duplicate_count'] - 1)):
                visualize_rows.append(create_row(
                    coordi_title=item['title'],
                    coordi_chapter=item['episode'],
                    row_type='mixed_duplicate', 
                    reason='Duplicate in Coordi',
                    title_for_sort=item['title']
                ))
        # 4. Handle Pure duplicates
        for item in mismatch_details['kst_duplicates']:
            visualize_rows.append(create_row(
                kst_title=item['title'],
                kst_chapter=item['episode'],
                row_type='pure_duplicate',
                reason='Duplicate in KST'
            ))
        for item in mismatch_details['coordi_duplicates']:
            visualize_rows.append(create_row(
                coordi_title=item['title'],
                coordi_chapter=item['episode'],
                row_type='pure_duplicate',
                reason='Duplicate in Coordi'
            ))
        # 5. Handle Matched items (perfect matches)
        matched_by_title = summary['grouped_by_title']['matched_by_title']
        for title, items in matched_by_title.items():
            for item in items:
                visualize_rows.append(create_row(
                    coordi_title=item['title'],
                    coordi_chapter=item['episode'],
                    kst_title=item['title'],
                    kst_chapter=item['episode'],
                    row_type='matched',
                    reason='Perfect match'
                ))
        # Sort: Mismatches first (priority 1), then matches (priority 2), then by Korean title + chapter
        def sort_key(x):
            # Extract episode number for proper numeric sorting
            coordi_episode = x.get('coordi_chapter', '') or ''
            kst_episode = x.get('kst_chapter', '') or ''
            episode = coordi_episode or kst_episode
            # Try to convert episode to number for proper sorting, fallback to string
            try:
                episode_num = float(episode) if episode else 0
            except (ValueError, TypeError):
                episode_num = 0
            return (x['priority'], x['title_for_sort'], episode_num)
        visualize_rows.sort(key=sort_key)
        return visualize_rows
-    def print_comparison_summary(self, sheet_filter: str = None):
+    def print_comparison_summary(self, sheet_filter: str | None = None):
        """Print a formatted summary of the comparison for a specific sheet"""
        summary = self.get_comparison_summary(sheet_filter)
--- a/templates/index.html
+++ b/templates/index.html
@ -173,6 +173,32 @@
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        /* Vibrant color styles for Visualize tab */
        .coordi-only-row {
            background-color: #ff4444 !important; /* Bright red */
            color: white;
        }
        .kst-only-row {
            background-color: #4488ff !important; /* Bright blue */
            color: white;
        }
        .mixed-duplicate-row {
            background-color: #ff8800 !important; /* Bright orange */
            color: white;
        }
        .pure-duplicate-row {
            background-color: #8844ff !important; /* Bright purple */
            color: white;
        }
        .matched-row {
            background-color: white !important; /* White background */
            color: black;
        }
    </style>
 </head>
 <body>
@ -203,6 +229,7 @@
            <div class="tabs">
                <div class="tab active" onclick="showTab('summary')">Summary</div>
                <div class="tab" onclick="showTab('different')">Different</div>
                <div class="tab" onclick="showTab('visualize')">Visualize</div>
            </div>
            <div id="summary" class="tab-content active">
@ -255,6 +282,25 @@
                    </table>
                </div>
            </div>
            <div id="visualize" class="tab-content">
                <h3>Data </h3>
                <div class="table-container">
                    <table id="visualize-table">
                        <thead>
                            <tr>
                                <th>Coordi Title</th>
                                <th>Coordi Chapter</th>
                                <th>KST Title</th>
                                <th>KST Chapter</th>
                                <th>Status</th>
                            </tr>
                        </thead>
                        <tbody id="visualize-table-body">
                        </tbody>
                    </table>
                </div>
            </div>
        </div>
    </div>
@ -468,6 +514,9 @@
            // Update Different tab
            updateDifferentTable(results.mismatch_details);
            // Update Visualize tab
            updateVisualizeTable(results.visualize_data);
        }
        function updateSummaryTable(matchedData) {
@ -587,6 +636,40 @@
            });
        }
        function updateVisualizeTable(visualizeData) {
            // Rebuild the Visualize tab table body from the rows sent by the backend.
            const tbody = document.getElementById('visualize-table-body');
            tbody.innerHTML = '';
            // A response without visualize_data (undefined/null) would make
            // forEach throw a TypeError; show an empty table instead.
            if (!Array.isArray(visualizeData)) {
                return;
            }
            // Data is already sorted by the backend (mismatches first, then matches, all by Korean title)
            visualizeData.forEach(row => {
                const tr = tbody.insertRow();
                tr.insertCell(0).textContent = row.coordi_title || '';
                tr.insertCell(1).textContent = row.coordi_chapter || '';
                tr.insertCell(2).textContent = row.kst_title || '';
                tr.insertCell(3).textContent = row.kst_chapter || '';
                tr.insertCell(4).textContent = row.reason || '';
                // Apply vibrant color highlighting based on row type;
                // unknown row types keep the default table styling.
                switch (row.row_type) {
                    case 'coordi_only':
                        tr.className = 'coordi-only-row';
                        break;
                    case 'kst_only':
                        tr.className = 'kst-only-row';
                        break;
                    case 'mixed_duplicate':
                        tr.className = 'mixed-duplicate-row';
                        break;
                    case 'pure_duplicate':
                        tr.className = 'pure-duplicate-row';
                        break;
                    case 'matched':
                        tr.className = 'matched-row';
                        break;
                }
            });
        }
        // Auto-analyze on page load with default file
        window.onload = function() {
            // Initialize sheet filter with loading state
--- a/test_ba_confirmed_cases.py
+++ b/test_ba_confirmed_cases.py
@ -1,101 +0,0 @@
 #!/usr/bin/env python3
 from data_comparator import KSTCoordiComparator
 def test_ba_confirmed_cases():
    """Print a report checking the BA-confirmed duplicate cases per sheet.

    Loads the sample workbook, then for the 'US URGENT' and 'TH URGENT'
    sheets prints the duplicates that were found and whether each expected
    case is present.  Nothing is asserted; results are only printed.
    """
    print("Testing BA confirmed duplicate cases...")
    # Build the comparator; stop early when the workbook cannot be loaded
    comparator = KSTCoordiComparator("data/sample-data.xlsx")
    loaded = comparator.load_data()
    if not loaded:
        print("Failed to load data!")
        return
    print("\n=== US URGENT Sheet - BA Confirmed Cases ===")
    us_details = comparator.get_comparison_summary('US URGENT')['mismatch_details']
    # Duplicate lists reported for US URGENT
    us_coordi_dups = us_details['coordi_duplicates']
    us_mixed_dups = us_details['mixed_duplicates']
    expected_coordi_duplicates = [('금수의 영역', '17'), ('신결', '23')]
    expected_mixed_duplicates = [('트윈 가이드', '31')]
    print("Coordi duplicates found:")
    found_coordi = []
    for entry in us_coordi_dups:
        found_coordi.append((entry['title'], entry['episode']))
        print(f"  - {entry['title']} - Episode {entry['episode']}")
    print("\nMixed duplicates found:")
    found_mixed = []
    for entry in us_mixed_dups:
        found_mixed.append((entry['title'], entry['episode']))
        print(f"  - {entry['title']} - Episode {entry['episode']} ({entry['reason']})")
    # Report each expected case as found or missing
    print("\n✓ Verification:")
    for title, episode in expected_coordi_duplicates:
        if (title, episode) not in found_coordi:
            print(f"  ✗ Missing expected Coordi duplicate: {title} - Episode {episode}")
        else:
            print(f"  ✓ Found expected Coordi duplicate: {title} - Episode {episode}")
    for title, episode in expected_mixed_duplicates:
        if (title, episode) not in found_mixed:
            print(f"  ✗ Missing expected mixed duplicate: {title} - Episode {episode}")
        else:
            print(f"  ✓ Found expected mixed duplicate: {title} - Episode {episode}")
    print("\n=== TH URGENT Sheet - BA Confirmed Cases ===")
    th_details = comparator.get_comparison_summary('TH URGENT')['mismatch_details']
    # Lists reported for TH URGENT
    th_kst_dups = th_details['kst_duplicates']
    th_coordi_only = th_details['coordi_only']
    expected_kst_duplicates = [('백라이트', '53-1x(휴재)')]
    print("KST duplicates found:")
    found_kst = []
    for entry in th_kst_dups:
        found_kst.append((entry['title'], entry['episode']))
        print(f"  - {entry['title']} - Episode {entry['episode']}")
    # The KST-duplicated episode must not be listed among Coordi-only items
    print("\nChecking that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi:")
    found_in_coordi = any(
        entry['title'] == '백라이트' and entry['episode'] == '53-1x(휴재)'
        for entry in th_coordi_only
    )
    if found_in_coordi:
        print("  ✗ 백라이트 - Episode 53-1x(휴재) incorrectly appears in Coordi data")
    else:
        print("  ✓ 백라이트 - Episode 53-1x(휴재) correctly does NOT appear in Coordi data")
    # Report each expected case as found or missing
    print("\n✓ Verification:")
    for title, episode in expected_kst_duplicates:
        if (title, episode) not in found_kst:
            print(f"  ✗ Missing expected KST duplicate: {title} - Episode {episode}")
        else:
            print(f"  ✓ Found expected KST duplicate: {title} - Episode {episode}")
    print("\n✓ All BA confirmed cases tested!")
 # Run the BA-confirmed-case report when this file is executed as a script.
 if __name__ == "__main__":
    test_ba_confirmed_cases()
--- a/test_sheet_filtering.py
+++ b/test_sheet_filtering.py
@ -1,57 +0,0 @@
 #!/usr/bin/env python3
 from data_comparator import KSTCoordiComparator
 def test_sheet_filtering():
    """Print a report exercising per-sheet filtering of the comparison.

    Covers three scenarios: no sheet filter, an explicit second sheet (when
    more than one sheet exists), and the duplicate counts of every sheet.
    Failures in the first two scenarios are caught and printed.
    """
    print("Testing sheet filtering functionality...")
    # Build the comparator; stop early when the workbook cannot be loaded
    comparator = KSTCoordiComparator("data/sample-data.xlsx")
    loaded = comparator.load_data()
    if not loaded:
        print("Failed to load data!")
        return
    print(f"Available sheets: {list(comparator.data.keys())}")
    # Scenario 1: call without a sheet filter
    print("\n=== TEST 1: No sheet filter (should default to first sheet) ===")
    try:
        default_summary = comparator.get_comparison_summary()
        print(f"Default sheet selected: {default_summary['current_sheet_filter']}")
        default_counts = default_summary['original_counts']
        print(f"KST total: {default_counts['kst_total']}")
        print(f"Coordi total: {default_counts['coordi_total']}")
        print(f"Matched: {default_summary['matched_items_count']}")
        print("✓ Test 1 passed")
    except Exception as e:
        print(f"✗ Test 1 failed: {e}")
    # Scenario 2: explicit filter on the second sheet, when there is one
    sheet_names = list(comparator.data.keys())
    if len(sheet_names) <= 1:
        print("\n=== TEST 2: Skipped (only one sheet available) ===")
    else:
        second_sheet = sheet_names[1]
        print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===")
        try:
            second_summary = comparator.get_comparison_summary(second_sheet)
            print(f"Selected sheet: {second_summary['current_sheet_filter']}")
            second_counts = second_summary['original_counts']
            print(f"KST total: {second_counts['kst_total']}")
            print(f"Coordi total: {second_counts['coordi_total']}")
            print(f"Matched: {second_summary['matched_items_count']}")
            print("✓ Test 2 passed")
        except Exception as e:
            print(f"✗ Test 2 failed: {e}")
    # Scenario 3: duplicate counts reported sheet by sheet
    print("\n=== TEST 3: Verify duplicate detection within single sheets only ===")
    for name in sheet_names:
        sheet_summary = comparator.get_comparison_summary(name)
        print(f"Sheet '{name}':")
        sheet_mismatches = sheet_summary['mismatches']
        print(f"  KST duplicates: {sheet_mismatches['kst_duplicates_count']}")
        print(f"  Coordi duplicates: {sheet_mismatches['coordi_duplicates_count']}")
    print("\n✓ All tests completed!")
 # Run the sheet-filtering report when this file is executed as a script.
 if __name__ == "__main__":
    test_sheet_filtering()
--- a/web_gui.py
+++ b/web_gui.py
@ -61,6 +61,10 @@ def analyze_data():
        # Add matched data to results
        comparison_results['matched_data'] = matched_items_data
        # Generate visualize data
        visualize_data = comparator_instance.generate_visualize_data(sheet_filter)
        comparison_results['visualize_data'] = visualize_data
        return jsonify({
            'success': True,
            'results': comparison_results
@ -307,6 +311,32 @@ def create_templates_dir():
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        /* Vibrant color styles for Visualize tab */
        .coordi-only-row {
            background-color: #ff4444 !important; /* Bright red */
            color: white;
        }
        .kst-only-row {
            background-color: #4488ff !important; /* Bright blue */
            color: white;
        }
        .mixed-duplicate-row {
            background-color: #ff8800 !important; /* Bright orange */
            color: white;
        }
        .pure-duplicate-row {
            background-color: #8844ff !important; /* Bright purple */
            color: white;
        }
        .matched-row {
            background-color: white !important; /* White background */
            color: black;
        }
    </style>
 </head>
 <body>
@ -337,6 +367,7 @@ def create_templates_dir():
            <div class="tabs">
                <div class="tab active" onclick="showTab('summary')">Summary</div>
                <div class="tab" onclick="showTab('different')">Different</div>
                <div class="tab" onclick="showTab('visualize')">Visualize</div>
            </div>
            <div id="summary" class="tab-content active">
@ -389,6 +420,25 @@ def create_templates_dir():
                    </table>
                </div>
            </div>
            <div id="visualize" class="tab-content">
                <h3>Data </h3>
                <div class="table-container">
                    <table id="visualize-table">
                        <thead>
                            <tr>
                                <th>Coordi Title</th>
                                <th>Coordi Chapter</th>
                                <th>KST Title</th>
                                <th>KST Chapter</th>
                                <th>Status</th>
                            </tr>
                        </thead>
                        <tbody id="visualize-table-body">
                        </tbody>
                    </table>
                </div>
            </div>
        </div>
    </div>
@ -602,6 +652,9 @@ def create_templates_dir():
            // Update Different tab
            updateDifferentTable(results.mismatch_details);
            // Update Visualize tab
            updateVisualizeTable(results.visualize_data);
        }
        function updateSummaryTable(matchedData) {
@ -721,6 +774,40 @@ def create_templates_dir():
            });
        }
        function updateVisualizeTable(visualizeData) {
            // Rebuild the Visualize tab table body from the rows sent by the backend.
            const tbody = document.getElementById('visualize-table-body');
            tbody.innerHTML = '';
            // A response without visualize_data (undefined/null) would make
            // forEach throw a TypeError; show an empty table instead.
            if (!Array.isArray(visualizeData)) {
                return;
            }
            // Data is already sorted by the backend (mismatches first, then matches, all by Korean title)
            visualizeData.forEach(row => {
                const tr = tbody.insertRow();
                tr.insertCell(0).textContent = row.coordi_title || '';
                tr.insertCell(1).textContent = row.coordi_chapter || '';
                tr.insertCell(2).textContent = row.kst_title || '';
                tr.insertCell(3).textContent = row.kst_chapter || '';
                tr.insertCell(4).textContent = row.reason || '';
                // Apply vibrant color highlighting based on row type;
                // unknown row types keep the default table styling.
                switch (row.row_type) {
                    case 'coordi_only':
                        tr.className = 'coordi-only-row';
                        break;
                    case 'kst_only':
                        tr.className = 'kst-only-row';
                        break;
                    case 'mixed_duplicate':
                        tr.className = 'mixed-duplicate-row';
                        break;
                    case 'pure_duplicate':
                        tr.className = 'pure-duplicate-row';
                        break;
                    case 'matched':
                        tr.className = 'matched-row';
                        break;
                }
            });
        }
        // Auto-analyze on page load with default file
        window.onload = function() {
            // Initialize sheet filter with loading state