diff --git a/data/Compare DE.xlsx b/data/Compare DE.xlsx
new file mode 100644
index 0000000..a74ed0b
Binary files /dev/null and b/data/Compare DE.xlsx differ
diff --git a/data_comparator.py b/data_comparator.py
index b5005c2..968e200 100644
--- a/data_comparator.py
+++ b/data_comparator.py
@@ -191,15 +191,23 @@ class KSTCoordiComparator:
         kst_all_items = sheet_data['kst_all_items']
         coordi_all_items = sheet_data['coordi_all_items']
 
-        # Find overlaps and differences
-        matched_items = kst_items.intersection(coordi_items)
-        kst_only_items = kst_items - coordi_items
-        coordi_only_items = coordi_items - kst_items
-
-        # Find duplicates within each dataset - FIXED LOGIC
+        # Find duplicates within each dataset first
         kst_duplicates = self._find_duplicates_in_list(kst_all_items)
         coordi_duplicates = self._find_duplicates_in_list(coordi_all_items)
 
+        # Create sets of items that have duplicates (to exclude from "only" lists)
+        kst_duplicate_keys = {(item.title, item.episode) for item in kst_duplicates}
+        coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_duplicates}
+
+        # Find overlaps and differences - exclude items that have duplicates
+        matched_items = kst_items.intersection(coordi_items)
+
+        # For "only" items: exclude those that have duplicates within their own dataset
+        kst_only_items = {item for item in kst_items - coordi_items
+                          if (item.title, item.episode) not in kst_duplicate_keys}
+        coordi_only_items = {item for item in coordi_items - kst_items
+                             if (item.title, item.episode) not in coordi_duplicate_keys}
+
         categorization = {
             'matched_items': list(matched_items),
             'kst_only_items': list(kst_only_items),
@@ -269,10 +277,16 @@ class KSTCoordiComparator:
         kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates}
         coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates}
 
+        # Count actual instances for each item
+        from collections import Counter
+        kst_counts = Counter((item.title, item.episode) for item in kst_sheet_items)
+        coordi_counts = Counter((item.title, item.episode) for item in coordi_sheet_items)
+
         # Find matched items that also have duplicates within the same sheet
         for title, episode in matched_in_sheet:
             # Check if this matched item has duplicates in KST within this sheet
             if (title, episode) in kst_duplicate_keys:
+                kst_count = kst_counts[(title, episode)]
                 mixed_duplicates.append({
                     'title': title,
                     'episode': episode,
@@ -280,11 +294,13 @@ class KSTCoordiComparator:
                     'row_index': None,  # Could get from items if needed
                     'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}',
                     'mismatch_type': 'MIXED_DUPLICATE_KST',
-                    'duplicate_side': 'KST'
+                    'duplicate_side': 'KST',
+                    'duplicate_count': kst_count
                 })
 
             # Check if this matched item has duplicates in Coordi within this sheet
             if (title, episode) in coordi_duplicate_keys:
+                coordi_count = coordi_counts[(title, episode)]
                 mixed_duplicates.append({
                     'title': title,
                     'episode': episode,
@@ -292,7 +308,8 @@ class KSTCoordiComparator:
                     'row_index': None,  # Could get from items if needed
                     'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}',
                     'mismatch_type': 'MIXED_DUPLICATE_COORDI',
-                    'duplicate_side': 'COORDI'
+                    'duplicate_side': 'COORDI',
+                    'duplicate_count': coordi_count
                 })
 
         return mixed_duplicates
@@ -539,14 +556,15 @@ class KSTCoordiComparator:
                     mixed_items[key] = {
                         'title': item['title'],
                         'episode': item['episode'],
-                        'has_kst_duplicate': False,
-                        'has_coordi_duplicate': False
+                        'kst_duplicate_count': 0,
+                        'coordi_duplicate_count': 0
                     }
 
+                # Count the actual duplicates for each side
                 if item['duplicate_side'] == 'KST':
-                    mixed_items[key]['has_kst_duplicate'] = True
+                    mixed_items[key]['kst_duplicate_count'] = item.get('duplicate_count', 1)
                 elif item['duplicate_side'] == 'COORDI':
-                    mixed_items[key]['has_coordi_duplicate'] = True
+                    mixed_items[key]['coordi_duplicate_count'] = item.get('duplicate_count', 1)
 
             for key, item in mixed_items.items():
                 # First row: show it exists in both
@@ -559,8 +577,8 @@ class KSTCoordiComparator:
                     reason='Mixed duplicate'
                 ))
 
-                # Additional rows for duplicates
-                if item['has_kst_duplicate']:
+                # Additional rows for KST duplicates (count - 1 since first is already shown)
+                for i in range(max(0, item['kst_duplicate_count'] - 1)):
                     visualize_rows.append(create_row(
                         kst_title=item['title'],
                         kst_chapter=item['episode'],
@@ -569,7 +587,8 @@ class KSTCoordiComparator:
                         title_for_sort=item['title']
                     ))
 
-                if item['has_coordi_duplicate']:
+                # Additional rows for Coordi duplicates (count - 1 since first is already shown)
+                for i in range(max(0, item['coordi_duplicate_count'] - 1)):
                     visualize_rows.append(create_row(
                         coordi_title=item['title'],
                         coordi_chapter=item['episode'],
diff --git a/test_ba_confirmed_cases.py b/test_ba_confirmed_cases.py
deleted file mode 100644
index 31a417d..0000000
--- a/test_ba_confirmed_cases.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-
-from data_comparator import KSTCoordiComparator
-
-def test_ba_confirmed_cases():
-    """Test that the comparison logic matches BA confirmed expectations"""
-    print("Testing BA confirmed duplicate cases...")
-
-    # Create comparator and load data
-    comparator = KSTCoordiComparator("data/sample-data.xlsx")
-    if not comparator.load_data():
-        print("Failed to load data!")
-        return
-
-    print("\n=== US URGENT Sheet - BA Confirmed Cases ===")
-    us_summary = comparator.get_comparison_summary('US URGENT')
-
-    # Check for expected duplicates in US URGENT
-    coordi_duplicates = us_summary['mismatch_details']['coordi_duplicates']
-    mixed_duplicates = us_summary['mismatch_details']['mixed_duplicates']
-
-    expected_coordi_duplicates = [
-        ('금수의 영역', '17'),
-        ('신결', '23')
-    ]
-
-    expected_mixed_duplicates = [
-        ('트윈 가이드', '31')
-    ]
-
-    print("Coordi duplicates found:")
-    found_coordi = []
-    for item in coordi_duplicates:
-        key = (item['title'], item['episode'])
-        found_coordi.append(key)
-        print(f"  - {item['title']} - Episode {item['episode']}")
-
-    print("\nMixed duplicates found:")
-    found_mixed = []
-    for item in mixed_duplicates:
-        key = (item['title'], item['episode'])
-        found_mixed.append(key)
-        print(f"  - {item['title']} - Episode {item['episode']} ({item['reason']})")
-
-    # Verify expected cases
-    print("\n✓ Verification:")
-    for expected in expected_coordi_duplicates:
-        if expected in found_coordi:
-            print(f"  ✓ Found expected Coordi duplicate: {expected[0]} - Episode {expected[1]}")
-        else:
-            print(f"  ✗ Missing expected Coordi duplicate: {expected[0]} - Episode {expected[1]}")
-
-    for expected in expected_mixed_duplicates:
-        if expected in found_mixed:
-            print(f"  ✓ Found expected mixed duplicate: {expected[0]} - Episode {expected[1]}")
-        else:
-            print(f"  ✗ Missing expected mixed duplicate: {expected[0]} - Episode {expected[1]}")
-
-    print("\n=== TH URGENT Sheet - BA Confirmed Cases ===")
-    th_summary = comparator.get_comparison_summary('TH URGENT')
-
-    # Check for expected duplicates in TH URGENT
-    kst_duplicates = th_summary['mismatch_details']['kst_duplicates']
-    coordi_only = th_summary['mismatch_details']['coordi_only']
-
-    expected_kst_duplicates = [
-        ('백라이트', '53-1x(휴재)')
-    ]
-
-    print("KST duplicates found:")
-    found_kst = []
-    for item in kst_duplicates:
-        key = (item['title'], item['episode'])
-        found_kst.append(key)
-        print(f"  - {item['title']} - Episode {item['episode']}")
-
-    # Check that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi
-    print("\nChecking that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi:")
-    found_in_coordi = False
-    for item in coordi_only:
-        if item['title'] == '백라이트' and item['episode'] == '53-1x(휴재)':
-            found_in_coordi = True
-            break
-
-    if not found_in_coordi:
-        print("  ✓ 백라이트 - Episode 53-1x(휴재) correctly does NOT appear in Coordi data")
-    else:
-        print("  ✗ 백라이트 - Episode 53-1x(휴재) incorrectly appears in Coordi data")
-
-    # Verify expected cases
-    print("\n✓ Verification:")
-    for expected in expected_kst_duplicates:
-        if expected in found_kst:
-            print(f"  ✓ Found expected KST duplicate: {expected[0]} - Episode {expected[1]}")
-        else:
-            print(f"  ✗ Missing expected KST duplicate: {expected[0]} - Episode {expected[1]}")
-
-    print("\n✓ All BA confirmed cases tested!")
-
-if __name__ == "__main__":
-    test_ba_confirmed_cases()
\ No newline at end of file
diff --git a/test_sheet_filtering.py b/test_sheet_filtering.py
deleted file mode 100644
index 0d1ef12..0000000
--- a/test_sheet_filtering.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python3
-
-from data_comparator import KSTCoordiComparator
-
-def test_sheet_filtering():
-    """Test that sheet filtering works correctly and defaults to first sheet"""
-    print("Testing sheet filtering functionality...")
-
-    # Create comparator and load data
-    comparator = KSTCoordiComparator("data/sample-data.xlsx")
-    if not comparator.load_data():
-        print("Failed to load data!")
-        return
-
-    print(f"Available sheets: {list(comparator.data.keys())}")
-
-    # Test 1: No sheet filter provided (should default to first sheet)
-    print("\n=== TEST 1: No sheet filter (should default to first sheet) ===")
-    try:
-        summary1 = comparator.get_comparison_summary()
-        print(f"Default sheet selected: {summary1['current_sheet_filter']}")
-        print(f"KST total: {summary1['original_counts']['kst_total']}")
-        print(f"Coordi total: {summary1['original_counts']['coordi_total']}")
-        print(f"Matched: {summary1['matched_items_count']}")
-        print("✓ Test 1 passed")
-    except Exception as e:
-        print(f"✗ Test 1 failed: {e}")
-
-    # Test 2: Specific sheet filter
-    sheet_names = list(comparator.data.keys())
-    if len(sheet_names) > 1:
-        second_sheet = sheet_names[1]
-        print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===")
-        try:
-            summary2 = comparator.get_comparison_summary(second_sheet)
-            print(f"Selected sheet: {summary2['current_sheet_filter']}")
-            print(f"KST total: {summary2['original_counts']['kst_total']}")
-            print(f"Coordi total: {summary2['original_counts']['coordi_total']}")
-            print(f"Matched: {summary2['matched_items_count']}")
-            print("✓ Test 2 passed")
-        except Exception as e:
-            print(f"✗ Test 2 failed: {e}")
-    else:
-        print("\n=== TEST 2: Skipped (only one sheet available) ===")
-
-    # Test 3: Verify no duplicates across sheets (this was the original problem)
-    print(f"\n=== TEST 3: Verify duplicate detection within single sheets only ===")
-    for sheet_name in sheet_names:
-        summary = comparator.get_comparison_summary(sheet_name)
-        print(f"Sheet '{sheet_name}':")
-        print(f"  KST duplicates: {summary['mismatches']['kst_duplicates_count']}")
-        print(f"  Coordi duplicates: {summary['mismatches']['coordi_duplicates_count']}")
-
-    print("\n✓ All tests completed!")
-
-if __name__ == "__main__":
-    test_sheet_filtering()
\ No newline at end of file