dev/viettran #2
BIN
data/Compare DE.xlsx
Normal file
BIN
data/Compare DE.xlsx
Normal file
Binary file not shown.
@ -191,15 +191,23 @@ class KSTCoordiComparator:
|
||||
kst_all_items = sheet_data['kst_all_items']
|
||||
coordi_all_items = sheet_data['coordi_all_items']
|
||||
|
||||
# Find overlaps and differences
|
||||
matched_items = kst_items.intersection(coordi_items)
|
||||
kst_only_items = kst_items - coordi_items
|
||||
coordi_only_items = coordi_items - kst_items
|
||||
|
||||
# Find duplicates within each dataset - FIXED LOGIC
|
||||
# Find duplicates within each dataset first
|
||||
kst_duplicates = self._find_duplicates_in_list(kst_all_items)
|
||||
coordi_duplicates = self._find_duplicates_in_list(coordi_all_items)
|
||||
|
||||
# Create sets of items that have duplicates (to exclude from "only" lists)
|
||||
kst_duplicate_keys = {(item.title, item.episode) for item in kst_duplicates}
|
||||
coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_duplicates}
|
||||
|
||||
# Find overlaps and differences - exclude items that have duplicates
|
||||
matched_items = kst_items.intersection(coordi_items)
|
||||
|
||||
# For "only" items: exclude those that have duplicates within their own dataset
|
||||
kst_only_items = {item for item in kst_items - coordi_items
|
||||
if (item.title, item.episode) not in kst_duplicate_keys}
|
||||
coordi_only_items = {item for item in coordi_items - kst_items
|
||||
if (item.title, item.episode) not in coordi_duplicate_keys}
|
||||
|
||||
categorization = {
|
||||
'matched_items': list(matched_items),
|
||||
'kst_only_items': list(kst_only_items),
|
||||
@ -269,10 +277,16 @@ class KSTCoordiComparator:
|
||||
kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates}
|
||||
coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates}
|
||||
|
||||
# Count actual instances for each item
|
||||
from collections import Counter
|
||||
kst_counts = Counter((item.title, item.episode) for item in kst_sheet_items)
|
||||
coordi_counts = Counter((item.title, item.episode) for item in coordi_sheet_items)
|
||||
|
||||
# Find matched items that also have duplicates within the same sheet
|
||||
for title, episode in matched_in_sheet:
|
||||
# Check if this matched item has duplicates in KST within this sheet
|
||||
if (title, episode) in kst_duplicate_keys:
|
||||
kst_count = kst_counts[(title, episode)]
|
||||
mixed_duplicates.append({
|
||||
'title': title,
|
||||
'episode': episode,
|
||||
@ -280,11 +294,13 @@ class KSTCoordiComparator:
|
||||
'row_index': None, # Could get from items if needed
|
||||
'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}',
|
||||
'mismatch_type': 'MIXED_DUPLICATE_KST',
|
||||
'duplicate_side': 'KST'
|
||||
'duplicate_side': 'KST',
|
||||
'duplicate_count': kst_count
|
||||
})
|
||||
|
||||
# Check if this matched item has duplicates in Coordi within this sheet
|
||||
if (title, episode) in coordi_duplicate_keys:
|
||||
coordi_count = coordi_counts[(title, episode)]
|
||||
mixed_duplicates.append({
|
||||
'title': title,
|
||||
'episode': episode,
|
||||
@ -292,7 +308,8 @@ class KSTCoordiComparator:
|
||||
'row_index': None, # Could get from items if needed
|
||||
'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}',
|
||||
'mismatch_type': 'MIXED_DUPLICATE_COORDI',
|
||||
'duplicate_side': 'COORDI'
|
||||
'duplicate_side': 'COORDI',
|
||||
'duplicate_count': coordi_count
|
||||
})
|
||||
|
||||
return mixed_duplicates
|
||||
@ -539,14 +556,15 @@ class KSTCoordiComparator:
|
||||
mixed_items[key] = {
|
||||
'title': item['title'],
|
||||
'episode': item['episode'],
|
||||
'has_kst_duplicate': False,
|
||||
'has_coordi_duplicate': False
|
||||
'kst_duplicate_count': 0,
|
||||
'coordi_duplicate_count': 0
|
||||
}
|
||||
|
||||
# Count the actual duplicates for each side
|
||||
if item['duplicate_side'] == 'KST':
|
||||
mixed_items[key]['has_kst_duplicate'] = True
|
||||
mixed_items[key]['kst_duplicate_count'] = item.get('duplicate_count', 1)
|
||||
elif item['duplicate_side'] == 'COORDI':
|
||||
mixed_items[key]['has_coordi_duplicate'] = True
|
||||
mixed_items[key]['coordi_duplicate_count'] = item.get('duplicate_count', 1)
|
||||
|
||||
for key, item in mixed_items.items():
|
||||
# First row: show it exists in both
|
||||
@ -559,8 +577,8 @@ class KSTCoordiComparator:
|
||||
reason='Mixed duplicate'
|
||||
))
|
||||
|
||||
# Additional rows for duplicates
|
||||
if item['has_kst_duplicate']:
|
||||
# Additional rows for KST duplicates (count - 1 since first is already shown)
|
||||
for i in range(max(0, item['kst_duplicate_count'] - 1)):
|
||||
visualize_rows.append(create_row(
|
||||
kst_title=item['title'],
|
||||
kst_chapter=item['episode'],
|
||||
@ -569,7 +587,8 @@ class KSTCoordiComparator:
|
||||
title_for_sort=item['title']
|
||||
))
|
||||
|
||||
if item['has_coordi_duplicate']:
|
||||
# Additional rows for Coordi duplicates (count - 1 since first is already shown)
|
||||
for i in range(max(0, item['coordi_duplicate_count'] - 1)):
|
||||
visualize_rows.append(create_row(
|
||||
coordi_title=item['title'],
|
||||
coordi_chapter=item['episode'],
|
||||
|
||||
@ -1,101 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from data_comparator import KSTCoordiComparator
|
||||
|
||||
def test_ba_confirmed_cases():
    """Print a report comparing detected duplicates with the BA-confirmed cases.

    Loads ``data/sample-data.xlsx`` through ``KSTCoordiComparator``, then for
    the 'US URGENT' and 'TH URGENT' sheets lists the duplicates found in the
    comparison summary and prints a check/cross line for every expected
    (title, episode) pair. Nothing is asserted; the function only prints and
    returns ``None`` (early, if ``load_data()`` reports failure).
    """

    def list_found(entries, with_reason=False):
        # Print one line per entry and return the (title, episode) keys seen.
        keys = []
        for entry in entries:
            title, episode = entry['title'], entry['episode']
            keys.append((title, episode))
            line = f" - {title} - Episode {episode}"
            if with_reason:
                line += f" ({entry['reason']})"
            print(line)
        return keys

    def check_expected(expected_keys, found_keys, kind):
        # Report, for each expected key, whether it is among the found keys.
        for title, episode in expected_keys:
            if (title, episode) in found_keys:
                print(f" ✓ Found expected {kind}: {title} - Episode {episode}")
            else:
                print(f" ✗ Missing expected {kind}: {title} - Episode {episode}")

    print("Testing BA confirmed duplicate cases...")

    # Build the comparator; bail out if the workbook cannot be loaded
    comparator = KSTCoordiComparator("data/sample-data.xlsx")
    loaded = comparator.load_data()
    if not loaded:
        print("Failed to load data!")
        return

    # ---- US URGENT ----
    print("\n=== US URGENT Sheet - BA Confirmed Cases ===")
    us_details = comparator.get_comparison_summary('US URGENT')['mismatch_details']
    us_coordi_dupes = us_details['coordi_duplicates']
    us_mixed_dupes = us_details['mixed_duplicates']

    want_coordi = [('금수의 영역', '17'), ('신결', '23')]
    want_mixed = [('트윈 가이드', '31')]

    print("Coordi duplicates found:")
    seen_coordi = list_found(us_coordi_dupes)

    print("\nMixed duplicates found:")
    seen_mixed = list_found(us_mixed_dupes, with_reason=True)

    print("\n✓ Verification:")
    check_expected(want_coordi, seen_coordi, "Coordi duplicate")
    check_expected(want_mixed, seen_mixed, "mixed duplicate")

    # ---- TH URGENT ----
    print("\n=== TH URGENT Sheet - BA Confirmed Cases ===")
    th_details = comparator.get_comparison_summary('TH URGENT')['mismatch_details']
    th_kst_dupes = th_details['kst_duplicates']
    th_coordi_only = th_details['coordi_only']

    want_kst = [('백라이트', '53-1x(휴재)')]

    print("KST duplicates found:")
    seen_kst = list_found(th_kst_dupes)

    # The KST-side duplicate must not be listed among the Coordi-only items
    print("\nChecking that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi:")
    leaked_into_coordi = any(
        entry['title'] == '백라이트' and entry['episode'] == '53-1x(휴재)'
        for entry in th_coordi_only
    )
    if leaked_into_coordi:
        print(" ✗ 백라이트 - Episode 53-1x(휴재) incorrectly appears in Coordi data")
    else:
        print(" ✓ 백라이트 - Episode 53-1x(휴재) correctly does NOT appear in Coordi data")

    print("\n✓ Verification:")
    check_expected(want_kst, seen_kst, "KST duplicate")

    print("\n✓ All BA confirmed cases tested!")


if __name__ == "__main__":
    test_ba_confirmed_cases()
|
||||
@ -1,57 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from data_comparator import KSTCoordiComparator
|
||||
|
||||
def test_sheet_filtering():
    """Print a walkthrough of sheet filtering on the sample workbook.

    Loads ``data/sample-data.xlsx`` through ``KSTCoordiComparator`` and runs
    three printed checks: the summary with no sheet argument, the summary for
    the second sheet (skipped when there is only one), and the per-sheet
    duplicate counts for every sheet. Failures in the first two checks are
    caught and printed; nothing is asserted and the function returns ``None``.
    """

    def run_summary_case(number, selected_label, *summary_args):
        # Fetch one summary and print its headline figures; report any error.
        try:
            result = comparator.get_comparison_summary(*summary_args)
            print(f"{selected_label}: {result['current_sheet_filter']}")
            print(f"KST total: {result['original_counts']['kst_total']}")
            print(f"Coordi total: {result['original_counts']['coordi_total']}")
            print(f"Matched: {result['matched_items_count']}")
            print(f"✓ Test {number} passed")
        except Exception as e:
            print(f"✗ Test {number} failed: {e}")

    print("Testing sheet filtering functionality...")

    # Build the comparator; bail out if the workbook cannot be loaded
    comparator = KSTCoordiComparator("data/sample-data.xlsx")
    loaded = comparator.load_data()
    if not loaded:
        print("Failed to load data!")
        return

    print(f"Available sheets: {list(comparator.data.keys())}")

    # Test 1: call without a sheet argument
    print("\n=== TEST 1: No sheet filter (should default to first sheet) ===")
    run_summary_case(1, "Default sheet selected")

    # Test 2: explicitly request the second sheet, when one exists
    sheet_names = list(comparator.data.keys())
    if len(sheet_names) <= 1:
        print("\n=== TEST 2: Skipped (only one sheet available) ===")
    else:
        second_sheet = sheet_names[1]
        print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===")
        run_summary_case(2, "Selected sheet", second_sheet)

    # Test 3: show the duplicate counts reported for each sheet on its own
    print("\n=== TEST 3: Verify duplicate detection within single sheets only ===")
    for name in sheet_names:
        mismatches = comparator.get_comparison_summary(name)
        print(f"Sheet '{name}':")
        print(f" KST duplicates: {mismatches['mismatches']['kst_duplicates_count']}")
        print(f" Coordi duplicates: {mismatches['mismatches']['coordi_duplicates_count']}")

    print("\n✓ All tests completed!")


if __name__ == "__main__":
    test_sheet_filtering()
|
||||
Loading…
Reference in New Issue
Block a user