Add logic for handling multiple duplications

This commit is contained in:
arthur 2025-08-21 15:48:40 +07:00
parent 8d0351622b
commit f561d702d1
4 changed files with 34 additions and 173 deletions

BIN
data/Compare DE.xlsx Normal file

Binary file not shown.

View File

@ -191,15 +191,23 @@ class KSTCoordiComparator:
kst_all_items = sheet_data['kst_all_items']
coordi_all_items = sheet_data['coordi_all_items']
# Find overlaps and differences
matched_items = kst_items.intersection(coordi_items)
kst_only_items = kst_items - coordi_items
coordi_only_items = coordi_items - kst_items
# Find duplicates within each dataset - FIXED LOGIC
# Find duplicates within each dataset first
kst_duplicates = self._find_duplicates_in_list(kst_all_items)
coordi_duplicates = self._find_duplicates_in_list(coordi_all_items)
# Create sets of items that have duplicates (to exclude from "only" lists)
kst_duplicate_keys = {(item.title, item.episode) for item in kst_duplicates}
coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_duplicates}
# Find overlaps and differences - exclude items that have duplicates
matched_items = kst_items.intersection(coordi_items)
# For "only" items: exclude those that have duplicates within their own dataset
kst_only_items = {item for item in kst_items - coordi_items
if (item.title, item.episode) not in kst_duplicate_keys}
coordi_only_items = {item for item in coordi_items - kst_items
if (item.title, item.episode) not in coordi_duplicate_keys}
categorization = {
'matched_items': list(matched_items),
'kst_only_items': list(kst_only_items),
@ -269,10 +277,16 @@ class KSTCoordiComparator:
kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates}
coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates}
# Count actual instances for each item
from collections import Counter
kst_counts = Counter((item.title, item.episode) for item in kst_sheet_items)
coordi_counts = Counter((item.title, item.episode) for item in coordi_sheet_items)
# Find matched items that also have duplicates within the same sheet
for title, episode in matched_in_sheet:
# Check if this matched item has duplicates in KST within this sheet
if (title, episode) in kst_duplicate_keys:
kst_count = kst_counts[(title, episode)]
mixed_duplicates.append({
'title': title,
'episode': episode,
@ -280,11 +294,13 @@ class KSTCoordiComparator:
'row_index': None, # Could get from items if needed
'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}',
'mismatch_type': 'MIXED_DUPLICATE_KST',
'duplicate_side': 'KST'
'duplicate_side': 'KST',
'duplicate_count': kst_count
})
# Check if this matched item has duplicates in Coordi within this sheet
if (title, episode) in coordi_duplicate_keys:
coordi_count = coordi_counts[(title, episode)]
mixed_duplicates.append({
'title': title,
'episode': episode,
@ -292,7 +308,8 @@ class KSTCoordiComparator:
'row_index': None, # Could get from items if needed
'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}',
'mismatch_type': 'MIXED_DUPLICATE_COORDI',
'duplicate_side': 'COORDI'
'duplicate_side': 'COORDI',
'duplicate_count': coordi_count
})
return mixed_duplicates
@ -539,14 +556,15 @@ class KSTCoordiComparator:
mixed_items[key] = {
'title': item['title'],
'episode': item['episode'],
'has_kst_duplicate': False,
'has_coordi_duplicate': False
'kst_duplicate_count': 0,
'coordi_duplicate_count': 0
}
# Count the actual duplicates for each side
if item['duplicate_side'] == 'KST':
mixed_items[key]['has_kst_duplicate'] = True
mixed_items[key]['kst_duplicate_count'] = item.get('duplicate_count', 1)
elif item['duplicate_side'] == 'COORDI':
mixed_items[key]['has_coordi_duplicate'] = True
mixed_items[key]['coordi_duplicate_count'] = item.get('duplicate_count', 1)
for key, item in mixed_items.items():
# First row: show it exists in both
@ -559,8 +577,8 @@ class KSTCoordiComparator:
reason='Mixed duplicate'
))
# Additional rows for duplicates
if item['has_kst_duplicate']:
# Additional rows for KST duplicates (count - 1 since first is already shown)
for i in range(max(0, item['kst_duplicate_count'] - 1)):
visualize_rows.append(create_row(
kst_title=item['title'],
kst_chapter=item['episode'],
@ -569,7 +587,8 @@ class KSTCoordiComparator:
title_for_sort=item['title']
))
if item['has_coordi_duplicate']:
# Additional rows for Coordi duplicates (count - 1 since first is already shown)
for i in range(max(0, item['coordi_duplicate_count'] - 1)):
visualize_rows.append(create_row(
coordi_title=item['title'],
coordi_chapter=item['episode'],

View File

@ -1,101 +0,0 @@
#!/usr/bin/env python3
from data_comparator import KSTCoordiComparator
def test_ba_confirmed_cases():
    """Test that the comparison logic matches BA confirmed expectations"""
    print("Testing BA confirmed duplicate cases...")

    # Load the sample workbook; abort early if it cannot be read.
    comp = KSTCoordiComparator("data/sample-data.xlsx")
    if not comp.load_data():
        print("Failed to load data!")
        return

    print("\n=== US URGENT Sheet - BA Confirmed Cases ===")
    us_summary = comp.get_comparison_summary('US URGENT')

    coordi_dups = us_summary['mismatch_details']['coordi_duplicates']
    mixed_dups = us_summary['mismatch_details']['mixed_duplicates']

    # Cases the BA signed off on for the US URGENT sheet.
    expected_coordi_duplicates = [
        ('금수의 영역', '17'),
        ('신결', '23')
    ]
    expected_mixed_duplicates = [
        ('트윈 가이드', '31')
    ]

    print("Coordi duplicates found:")
    found_coordi = [(entry['title'], entry['episode']) for entry in coordi_dups]
    for entry in coordi_dups:
        print(f" - {entry['title']} - Episode {entry['episode']}")

    print("\nMixed duplicates found:")
    found_mixed = [(entry['title'], entry['episode']) for entry in mixed_dups]
    for entry in mixed_dups:
        print(f" - {entry['title']} - Episode {entry['episode']} ({entry['reason']})")

    # Confirm every BA-expected case was actually reported.
    print("\n✓ Verification:")
    for title, episode in expected_coordi_duplicates:
        if (title, episode) in found_coordi:
            print(f" ✓ Found expected Coordi duplicate: {title} - Episode {episode}")
        else:
            print(f" ✗ Missing expected Coordi duplicate: {title} - Episode {episode}")
    for title, episode in expected_mixed_duplicates:
        if (title, episode) in found_mixed:
            print(f" ✓ Found expected mixed duplicate: {title} - Episode {episode}")
        else:
            print(f" ✗ Missing expected mixed duplicate: {title} - Episode {episode}")

    print("\n=== TH URGENT Sheet - BA Confirmed Cases ===")
    th_summary = comp.get_comparison_summary('TH URGENT')

    kst_dups = th_summary['mismatch_details']['kst_duplicates']
    coordi_only = th_summary['mismatch_details']['coordi_only']

    expected_kst_duplicates = [
        ('백라이트', '53-1x(휴재)')
    ]

    print("KST duplicates found:")
    found_kst = [(entry['title'], entry['episode']) for entry in kst_dups]
    for entry in kst_dups:
        print(f" - {entry['title']} - Episode {entry['episode']}")

    # This title must stay KST-only: it should never leak into the Coordi side.
    print("\nChecking that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi:")
    found_in_coordi = any(
        entry['title'] == '백라이트' and entry['episode'] == '53-1x(휴재)'
        for entry in coordi_only
    )
    if found_in_coordi:
        print(" ✗ 백라이트 - Episode 53-1x(휴재) incorrectly appears in Coordi data")
    else:
        print(" ✓ 백라이트 - Episode 53-1x(휴재) correctly does NOT appear in Coordi data")

    print("\n✓ Verification:")
    for title, episode in expected_kst_duplicates:
        if (title, episode) in found_kst:
            print(f" ✓ Found expected KST duplicate: {title} - Episode {episode}")
        else:
            print(f" ✗ Missing expected KST duplicate: {title} - Episode {episode}")

    print("\n✓ All BA confirmed cases tested!")


if __name__ == "__main__":
    test_ba_confirmed_cases()

View File

@ -1,57 +0,0 @@
#!/usr/bin/env python3
from data_comparator import KSTCoordiComparator
def test_sheet_filtering():
    """Test that sheet filtering works correctly and defaults to first sheet"""

    def show_counts(summary):
        # Shared reporting of the per-sheet totals used by tests 1 and 2.
        print(f"KST total: {summary['original_counts']['kst_total']}")
        print(f"Coordi total: {summary['original_counts']['coordi_total']}")
        print(f"Matched: {summary['matched_items_count']}")

    print("Testing sheet filtering functionality...")

    # Load the sample workbook; abort early if it cannot be read.
    comp = KSTCoordiComparator("data/sample-data.xlsx")
    if not comp.load_data():
        print("Failed to load data!")
        return

    print(f"Available sheets: {list(comp.data.keys())}")

    # Test 1: omit the filter entirely -> should fall back to the first sheet.
    print("\n=== TEST 1: No sheet filter (should default to first sheet) ===")
    try:
        default_summary = comp.get_comparison_summary()
        print(f"Default sheet selected: {default_summary['current_sheet_filter']}")
        show_counts(default_summary)
        print("✓ Test 1 passed")
    except Exception as err:
        print(f"✗ Test 1 failed: {err}")

    # Test 2: request an explicit (second) sheet, when the workbook has one.
    sheet_names = list(comp.data.keys())
    if len(sheet_names) > 1:
        second_sheet = sheet_names[1]
        print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===")
        try:
            picked_summary = comp.get_comparison_summary(second_sheet)
            print(f"Selected sheet: {picked_summary['current_sheet_filter']}")
            show_counts(picked_summary)
            print("✓ Test 2 passed")
        except Exception as err:
            print(f"✗ Test 2 failed: {err}")
    else:
        print("\n=== TEST 2: Skipped (only one sheet available) ===")

    # Test 3: duplicates must be detected per sheet, never across sheets
    # (the original bug this guards against).
    print(f"\n=== TEST 3: Verify duplicate detection within single sheets only ===")
    for sheet_name in sheet_names:
        summary = comp.get_comparison_summary(sheet_name)
        print(f"Sheet '{sheet_name}':")
        print(f" KST duplicates: {summary['mismatches']['kst_duplicates_count']}")
        print(f" Coordi duplicates: {summary['mismatches']['coordi_duplicates_count']}")

    print("\n✓ All tests completed!")


if __name__ == "__main__":
    test_sheet_filtering()