diff --git a/data/Compare DE.xlsx b/data/Compare DE.xlsx new file mode 100644 index 0000000..a74ed0b Binary files /dev/null and b/data/Compare DE.xlsx differ diff --git a/data_comparator.py b/data_comparator.py index 0c78606..968e200 100644 --- a/data_comparator.py +++ b/data_comparator.py @@ -3,7 +3,25 @@ import numpy as np from typing import Dict, List, Tuple, Any from dataclasses import dataclass -@dataclass +def normalize_episode(episode: str) -> str: + """Normalize episode numbers to handle cases like '54' vs '54.0'""" + if not episode or episode.strip() == '': + return episode + + try: + # Convert to float first to handle both int and float formats + episode_float = float(episode.strip()) + + # If it's a whole number (like 54.0), convert to int format + if episode_float.is_integer(): + return str(int(episode_float)) + else: + # Keep decimal format for non-whole numbers + return str(episode_float) + except (ValueError, TypeError): + # If conversion fails, return original episode string + return episode.strip() + class ComparisonItem: """Represents a single item for comparison""" title: str @@ -11,6 +29,12 @@ class ComparisonItem: source_sheet: str row_index: int + def __init__(self, title: str, episode: str, source_sheet: str, row_index: int): + self.title = title + self.episode = normalize_episode(episode) # Normalize episode on creation + self.source_sheet = source_sheet + self.row_index = row_index + def __hash__(self): return hash((self.title, self.episode)) @@ -167,15 +191,23 @@ class KSTCoordiComparator: kst_all_items = sheet_data['kst_all_items'] coordi_all_items = sheet_data['coordi_all_items'] - # Find overlaps and differences - matched_items = kst_items.intersection(coordi_items) - kst_only_items = kst_items - coordi_items - coordi_only_items = coordi_items - kst_items - - # Find duplicates within each dataset - FIXED LOGIC + # Find duplicates within each dataset first kst_duplicates = self._find_duplicates_in_list(kst_all_items) coordi_duplicates = self._find_duplicates_in_list(coordi_all_items) + # Create sets of items that have duplicates (to exclude from "only" lists) + kst_duplicate_keys = {(item.title, item.episode) for item in kst_duplicates} + coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_duplicates} + + # Find overlaps and differences - exclude items that have duplicates + matched_items = kst_items.intersection(coordi_items) + + # For "only" items: exclude those that have duplicates within their own dataset + kst_only_items = {item for item in kst_items - coordi_items + if (item.title, item.episode) not in kst_duplicate_keys} + coordi_only_items = {item for item in coordi_items - kst_items + if (item.title, item.episode) not in coordi_duplicate_keys} + categorization = { 'matched_items': list(matched_items), 'kst_only_items': list(kst_only_items), @@ -245,10 +277,16 @@ class KSTCoordiComparator: kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates} coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates} + # Count actual instances for each item + from collections import Counter + kst_counts = Counter((item.title, item.episode) for item in kst_sheet_items) + coordi_counts = Counter((item.title, item.episode) for item in coordi_sheet_items) + # Find matched items that also have duplicates within the same sheet for title, episode in matched_in_sheet: # Check if this matched item has duplicates in KST within this sheet if (title, episode) in kst_duplicate_keys: + kst_count = kst_counts[(title, episode)] mixed_duplicates.append({ 'title': title, 'episode': episode, @@ -256,11 +294,13 @@ class KSTCoordiComparator: 'row_index': None, # Could get from items if needed 'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}', 'mismatch_type': 'MIXED_DUPLICATE_KST', - 'duplicate_side': 'KST' + 'duplicate_side': 'KST', + 'duplicate_count': kst_count }) # Check if this matched item has duplicates in Coordi within this sheet if (title, episode) in coordi_duplicate_keys: + coordi_count = coordi_counts[(title, episode)] mixed_duplicates.append({ 'title': title, 'episode': episode, @@ -268,7 +308,8 @@ class KSTCoordiComparator: 'row_index': None, # Could get from items if needed 'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}', 'mismatch_type': 'MIXED_DUPLICATE_COORDI', - 'duplicate_side': 'COORDI' + 'duplicate_side': 'COORDI', + 'duplicate_count': coordi_count }) return mixed_duplicates @@ -340,7 +381,7 @@ class KSTCoordiComparator: return mismatch_details - def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]: + def get_comparison_summary(self, sheet_filter: str | None = None) -> Dict[str, Any]: """Get a comprehensive summary of the comparison for a specific sheet only""" # Get sheet names for filtering options sheet_names = list(self.data.keys()) if self.data else [] @@ -467,9 +508,146 @@ class KSTCoordiComparator: return grouped + def generate_visualize_data(self, sheet_filter: str | None = None) -> List[Dict[str, Any]]: + """Generate data structure for Excel-like visualization""" + # Get comparison data for the specified sheet + summary = self.get_comparison_summary(sheet_filter) + mismatch_details = summary['mismatch_details'] + + visualize_rows = [] + + # Helper function to create a row + def create_row(coordi_title="", coordi_chapter="", kst_title="", kst_chapter="", + row_type="matched", reason="", title_for_sort=""): + return { + 'coordi_title': coordi_title, + 'coordi_chapter': coordi_chapter, + 'kst_title': kst_title, + 'kst_chapter': kst_chapter, + 'row_type': row_type, + 'reason': reason, + 'title_for_sort': title_for_sort or coordi_title or kst_title, + 'priority': 1 if row_type != 'matched' else 2 # Mismatches first + } + + # 1. Handle Coordi-only items + for item in mismatch_details['coordi_only']: + visualize_rows.append(create_row( + coordi_title=item['title'], + coordi_chapter=item['episode'], + row_type='coordi_only', + reason='Only in Coordi' + )) + + # 2. Handle KST-only items + for item in mismatch_details['kst_only']: + visualize_rows.append(create_row( + kst_title=item['title'], + kst_chapter=item['episode'], + row_type='kst_only', + reason='Only in KST' + )) + + # 3. Handle Mixed duplicates (exists in both but duplicated on one side) + mixed_items = {} # Group by title+episode + for item in mismatch_details['mixed_duplicates']: + key = f"{item['title']}_{item['episode']}" + if key not in mixed_items: + mixed_items[key] = { + 'title': item['title'], + 'episode': item['episode'], + 'kst_duplicate_count': 0, + 'coordi_duplicate_count': 0 + } + + # Count the actual duplicates for each side + if item['duplicate_side'] == 'KST': + mixed_items[key]['kst_duplicate_count'] = item.get('duplicate_count', 1) + elif item['duplicate_side'] == 'COORDI': + mixed_items[key]['coordi_duplicate_count'] = item.get('duplicate_count', 1) + + for key, item in mixed_items.items(): + # First row: show it exists in both + visualize_rows.append(create_row( + coordi_title=item['title'], + coordi_chapter=item['episode'], + kst_title=item['title'], + kst_chapter=item['episode'], + row_type='mixed_duplicate', + reason='Mixed duplicate' + )) + + # Additional rows for KST duplicates (count - 1 since first is already shown) + for i in range(max(0, item['kst_duplicate_count'] - 1)): + visualize_rows.append(create_row( + kst_title=item['title'], + kst_chapter=item['episode'], + row_type='mixed_duplicate', + reason='Duplicate in KST', + title_for_sort=item['title'] + )) + + # Additional rows for Coordi duplicates (count - 1 since first is already shown) + for i in range(max(0, item['coordi_duplicate_count'] - 1)): + visualize_rows.append(create_row( + coordi_title=item['title'], + coordi_chapter=item['episode'], + row_type='mixed_duplicate', + reason='Duplicate in Coordi', + title_for_sort=item['title'] + )) + + # 4. Handle Pure duplicates + for item in mismatch_details['kst_duplicates']: + visualize_rows.append(create_row( + kst_title=item['title'], + kst_chapter=item['episode'], + row_type='pure_duplicate', + reason='Duplicate in KST' + )) + + for item in mismatch_details['coordi_duplicates']: + visualize_rows.append(create_row( + coordi_title=item['title'], + coordi_chapter=item['episode'], + row_type='pure_duplicate', + reason='Duplicate in Coordi' + )) + + # 5. Handle Matched items (perfect matches) + matched_by_title = summary['grouped_by_title']['matched_by_title'] + for title, items in matched_by_title.items(): + for item in items: + visualize_rows.append(create_row( + coordi_title=item['title'], + coordi_chapter=item['episode'], + kst_title=item['title'], + kst_chapter=item['episode'], + row_type='matched', + reason='Perfect match' + )) + + # Sort: Mismatches first (priority 1), then matches (priority 2), then by Korean title + chapter + def sort_key(x): + # Extract episode number for proper numeric sorting + coordi_episode = x.get('coordi_chapter', '') or '' + kst_episode = x.get('kst_chapter', '') or '' + episode = coordi_episode or kst_episode + + # Try to convert episode to number for proper sorting, fallback to string + try: + episode_num = float(episode) if episode else 0 + except (ValueError, TypeError): + episode_num = 0 + + return (x['priority'], x['title_for_sort'], episode_num) + + visualize_rows.sort(key=sort_key) + + return visualize_rows - def print_comparison_summary(self, sheet_filter: str = None): + def print_comparison_summary(self, sheet_filter: str | None = None): """Print a formatted summary of the comparison for a specific sheet""" summary = self.get_comparison_summary(sheet_filter) diff --git a/templates/index.html b/templates/index.html index bb50abd..4c437d2 100644 --- a/templates/index.html +++ b/templates/index.html @@ -173,6 +173,32 @@ border: 1px solid #ddd; border-radius: 4px; } + + /* Vibrant color styles for Visualize tab */ + .coordi-only-row { + background-color: #ff4444 !important; /* Bright red */ + color: white; + } + + .kst-only-row { + background-color: #4488ff !important; /* Bright blue */ + color: white; + } + + .mixed-duplicate-row { + background-color: #ff8800 !important; /* Bright orange */ + color: white; + } + + .pure-duplicate-row { + background-color: #8844ff !important; /* Bright purple */ + color: white; + } + + .matched-row { + background-color: white !important; /* White background */ + color: black; + } @@ -203,6 +229,7 @@
Summary
Different
+
Visualize
@@ -255,6 +282,25 @@
+ +
+

Data

+
+ + + + + + + + + + + + +
Coordi TitleCoordi ChapterKST TitleKST ChapterStatus
+
+
@@ -468,6 +514,9 @@ // Update Different tab updateDifferentTable(results.mismatch_details); + + // Update Visualize tab + updateVisualizeTable(results.visualize_data); } function updateSummaryTable(matchedData) { @@ -587,6 +636,40 @@ }); } + function updateVisualizeTable(visualizeData) { + const tbody = document.getElementById('visualize-table-body'); + tbody.innerHTML = ''; + + // Data is already sorted by the backend (mismatches first, then matches, all by Korean title) + visualizeData.forEach(row => { + const tr = tbody.insertRow(); + tr.insertCell(0).textContent = row.coordi_title || ''; + tr.insertCell(1).textContent = row.coordi_chapter || ''; + tr.insertCell(2).textContent = row.kst_title || ''; + tr.insertCell(3).textContent = row.kst_chapter || ''; + tr.insertCell(4).textContent = row.reason || ''; + + // Apply vibrant color highlighting based on row type + switch (row.row_type) { + case 'coordi_only': + tr.className = 'coordi-only-row'; + break; + case 'kst_only': + tr.className = 'kst-only-row'; + break; + case 'mixed_duplicate': + tr.className = 'mixed-duplicate-row'; + break; + case 'pure_duplicate': + tr.className = 'pure-duplicate-row'; + break; + case 'matched': + tr.className = 'matched-row'; + break; + } + }); + } + // Auto-analyze on page load with default file window.onload = function() { // Initialize sheet filter with loading state diff --git a/test_ba_confirmed_cases.py b/test_ba_confirmed_cases.py deleted file mode 100644 index 31a417d..0000000 --- a/test_ba_confirmed_cases.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 - -from data_comparator import KSTCoordiComparator - -def test_ba_confirmed_cases(): - """Test that the comparison logic matches BA confirmed expectations""" - print("Testing BA confirmed duplicate cases...") - - # Create comparator and load data - comparator = KSTCoordiComparator("data/sample-data.xlsx") - if not comparator.load_data(): - print("Failed to load data!") - return - - print("\n=== US URGENT Sheet - BA Confirmed Cases ===") - us_summary = comparator.get_comparison_summary('US URGENT') - - # Check for expected duplicates in US URGENT - coordi_duplicates = us_summary['mismatch_details']['coordi_duplicates'] - mixed_duplicates = us_summary['mismatch_details']['mixed_duplicates'] - - expected_coordi_duplicates = [ - ('금수의 영역', '17'), - ('신결', '23') - ] - - expected_mixed_duplicates = [ - ('트윈 가이드', '31') - ] - - print("Coordi duplicates found:") - found_coordi = [] - for item in coordi_duplicates: - key = (item['title'], item['episode']) - found_coordi.append(key) - print(f" - {item['title']} - Episode {item['episode']}") - - print("\nMixed duplicates found:") - found_mixed = [] - for item in mixed_duplicates: - key = (item['title'], item['episode']) - found_mixed.append(key) - print(f" - {item['title']} - Episode {item['episode']} ({item['reason']})") - - # Verify expected cases - print("\n✓ Verification:") - for expected in expected_coordi_duplicates: - if expected in found_coordi: - print(f" ✓ Found expected Coordi duplicate: {expected[0]} - Episode {expected[1]}") - else: - print(f" ✗ Missing expected Coordi duplicate: {expected[0]} - Episode {expected[1]}") - - for expected in expected_mixed_duplicates: - if expected in found_mixed: - print(f" ✓ Found expected mixed duplicate: {expected[0]} - Episode {expected[1]}") - else: - print(f" ✗ Missing expected mixed duplicate: {expected[0]} - Episode {expected[1]}") - - print("\n=== TH URGENT Sheet - BA Confirmed Cases ===") - th_summary = comparator.get_comparison_summary('TH URGENT') - - # Check for expected duplicates in TH URGENT - kst_duplicates = th_summary['mismatch_details']['kst_duplicates'] - coordi_only = th_summary['mismatch_details']['coordi_only'] - - expected_kst_duplicates = [ - ('백라이트', '53-1x(휴재)') - ] - - print("KST duplicates found:") - found_kst = [] - for item in kst_duplicates: - key = (item['title'], item['episode']) - found_kst.append(key) - print(f" - {item['title']} - Episode {item['episode']}") - - # Check that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi - print("\nChecking that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi:") - found_in_coordi = False - for item in coordi_only: - if item['title'] == '백라이트' and item['episode'] == '53-1x(휴재)': - found_in_coordi = True - break - - if not found_in_coordi: - print(" ✓ 백라이트 - Episode 53-1x(휴재) correctly does NOT appear in Coordi data") - else: - print(" ✗ 백라이트 - Episode 53-1x(휴재) incorrectly appears in Coordi data") - - # Verify expected cases - print("\n✓ Verification:") - for expected in expected_kst_duplicates: - if expected in found_kst: - print(f" ✓ Found expected KST duplicate: {expected[0]} - Episode {expected[1]}") - else: - print(f" ✗ Missing expected KST duplicate: {expected[0]} - Episode {expected[1]}") - - print("\n✓ All BA confirmed cases tested!") - -if __name__ == "__main__": - test_ba_confirmed_cases() \ No newline at end of file diff --git a/test_sheet_filtering.py b/test_sheet_filtering.py deleted file mode 100644 index 0d1ef12..0000000 --- a/test_sheet_filtering.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 - -from data_comparator import KSTCoordiComparator - -def test_sheet_filtering(): - """Test that sheet filtering works correctly and defaults to first sheet""" - print("Testing sheet filtering functionality...") - - # Create comparator and load data - comparator = KSTCoordiComparator("data/sample-data.xlsx") - if not comparator.load_data(): - print("Failed to load data!") - return - - print(f"Available sheets: {list(comparator.data.keys())}") - - # Test 1: No sheet filter provided (should default to first sheet) - print("\n=== TEST 1: No sheet filter (should default to first sheet) ===") - try: - summary1 = comparator.get_comparison_summary() - print(f"Default sheet selected: {summary1['current_sheet_filter']}") - print(f"KST total: {summary1['original_counts']['kst_total']}") - print(f"Coordi total: {summary1['original_counts']['coordi_total']}") - print(f"Matched: {summary1['matched_items_count']}") - print("✓ Test 1 passed") - except Exception as e: - print(f"✗ Test 1 failed: {e}") - - # Test 2: Specific sheet filter - sheet_names = list(comparator.data.keys()) - if len(sheet_names) > 1: - second_sheet = sheet_names[1] - print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===") - try: - summary2 = comparator.get_comparison_summary(second_sheet) - print(f"Selected sheet: {summary2['current_sheet_filter']}") - print(f"KST total: {summary2['original_counts']['kst_total']}") - print(f"Coordi total: {summary2['original_counts']['coordi_total']}") - print(f"Matched: {summary2['matched_items_count']}") - print("✓ Test 2 passed") - except Exception as e: - print(f"✗ Test 2 failed: {e}") - else: - print("\n=== TEST 2: Skipped (only one sheet available) ===") - - # Test 3: Verify no duplicates across sheets (this was the original problem) - print(f"\n=== TEST 3: Verify duplicate detection within single sheets only ===") - for sheet_name in sheet_names: - summary = comparator.get_comparison_summary(sheet_name) - print(f"Sheet '{sheet_name}':") - print(f" KST duplicates: {summary['mismatches']['kst_duplicates_count']}") - print(f" Coordi duplicates: {summary['mismatches']['coordi_duplicates_count']}") - - print("\n✓ All tests completed!") - -if __name__ == "__main__": - test_sheet_filtering() \ No newline at end of file diff --git a/web_gui.py b/web_gui.py index 0510d9f..0e5a547 100644 --- a/web_gui.py +++ b/web_gui.py @@ -61,6 +61,10 @@ def analyze_data(): # Add matched data to results comparison_results['matched_data'] = matched_items_data + # Generate visualize data + visualize_data = comparator_instance.generate_visualize_data(sheet_filter) + comparison_results['visualize_data'] = visualize_data + return jsonify({ 'success': True, 'results': comparison_results @@ -307,6 +311,32 @@ def create_templates_dir(): border: 1px solid #ddd; border-radius: 4px; } + + /* Vibrant color styles for Visualize tab */ + .coordi-only-row { + background-color: #ff4444 !important; /* Bright red */ + color: white; + } + + .kst-only-row { + background-color: #4488ff !important; /* Bright blue */ + color: white; + } + + .mixed-duplicate-row { + background-color: #ff8800 !important; /* Bright orange */ + color: white; + } + + .pure-duplicate-row { + background-color: #8844ff !important; /* Bright purple */ + color: white; + } + + .matched-row { + background-color: white !important; /* White background */ + color: black; + } @@ -337,6 +367,7 @@ def create_templates_dir():
Summary
Different
+
Visualize
@@ -389,6 +420,25 @@ def create_templates_dir():
+ +
+

Data

+
+ + + + + + + + + + + + +
Coordi TitleCoordi ChapterKST TitleKST ChapterStatus
+
+
@@ -602,6 +652,9 @@ def create_templates_dir(): // Update Different tab updateDifferentTable(results.mismatch_details); + + // Update Visualize tab + updateVisualizeTable(results.visualize_data); } function updateSummaryTable(matchedData) { @@ -721,6 +774,40 @@ def create_templates_dir(): }); } + function updateVisualizeTable(visualizeData) { + const tbody = document.getElementById('visualize-table-body'); + tbody.innerHTML = ''; + + // Data is already sorted by the backend (mismatches first, then matches, all by Korean title) + visualizeData.forEach(row => { + const tr = tbody.insertRow(); + tr.insertCell(0).textContent = row.coordi_title || ''; + tr.insertCell(1).textContent = row.coordi_chapter || ''; + tr.insertCell(2).textContent = row.kst_title || ''; + tr.insertCell(3).textContent = row.kst_chapter || ''; + tr.insertCell(4).textContent = row.reason || ''; + + // Apply vibrant color highlighting based on row type + switch (row.row_type) { + case 'coordi_only': + tr.className = 'coordi-only-row'; + break; + case 'kst_only': + tr.className = 'kst-only-row'; + break; + case 'mixed_duplicate': + tr.className = 'mixed-duplicate-row'; + break; + case 'pure_duplicate': + tr.className = 'pure-duplicate-row'; + break; + case 'matched': + tr.className = 'matched-row'; + break; + } + }); + } + // Auto-analyze on page load with default file window.onload = function() { // Initialize sheet filter with loading state