diff --git a/data/Compare DE.xlsx b/data/Compare DE.xlsx
new file mode 100644
index 0000000..a74ed0b
Binary files /dev/null and b/data/Compare DE.xlsx differ
diff --git a/data_comparator.py b/data_comparator.py
index 0c78606..968e200 100644
--- a/data_comparator.py
+++ b/data_comparator.py
@@ -3,7 +3,25 @@ import numpy as np
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass
-@dataclass
+def normalize_episode(episode: str) -> str:
+    """Normalize an episode label so numerically equal forms compare equal.
+
+    Spreadsheet cells can deliver the same episode as ``'54'``, ``'54.0'`` or
+    a bare number; all of these normalize to ``'54'``. Labels that are not
+    numeric (for example ``'53-1x'``) are returned with surrounding
+    whitespace removed.
+
+    Args:
+        episode: Raw episode value. Normally a string; ``None`` and
+            non-string values such as ``54`` or ``54.0`` are tolerated.
+
+    Returns:
+        The normalized episode string. ``None`` and empty or whitespace-only
+        input are returned unchanged.
+    """
+    if episode is None:
+        return episode
+
+    # Coerce before stripping: calling .strip() on a numeric cell value raises
+    # AttributeError, which a ValueError/TypeError handler would not catch.
+    text = str(episode).strip()
+    if not text:
+        return episode
+
+    # Parse integers exactly; a detour through float silently corrupts
+    # values above 2**53.
+    try:
+        return str(int(text))
+    except ValueError:
+        pass
+
+    try:
+        number = float(text)
+    except ValueError:
+        # Not numeric at all: keep the stripped label as-is.
+        return text
+
+    # Whole-number floats such as 54.0 collapse to '54'; genuine decimals
+    # keep their float representation.
+    return str(int(number)) if number.is_integer() else str(number)
+
class ComparisonItem:
"""Represents a single item for comparison"""
title: str
@@ -11,6 +29,12 @@ class ComparisonItem:
source_sheet: str
row_index: int
+ def __init__(self, title: str, episode: str, source_sheet: str, row_index: int):
+ """Create a comparison item, storing the episode in normalized form."""
+ self.title = title
+ # Normalized once here because __hash__ keys on (title, episode); this is
+ # what makes '54' and '54.0' land on the same key.
+ self.episode = normalize_episode(episode) # Normalize episode on creation
+ self.source_sheet = source_sheet
+ self.row_index = row_index
+
def __hash__(self):
return hash((self.title, self.episode))
@@ -167,15 +191,23 @@ class KSTCoordiComparator:
kst_all_items = sheet_data['kst_all_items']
coordi_all_items = sheet_data['coordi_all_items']
- # Find overlaps and differences
- matched_items = kst_items.intersection(coordi_items)
- kst_only_items = kst_items - coordi_items
- coordi_only_items = coordi_items - kst_items
-
- # Find duplicates within each dataset - FIXED LOGIC
+ # Find duplicates within each dataset first
kst_duplicates = self._find_duplicates_in_list(kst_all_items)
coordi_duplicates = self._find_duplicates_in_list(coordi_all_items)
+ # Create sets of items that have duplicates (to exclude from "only" lists)
+ kst_duplicate_keys = {(item.title, item.episode) for item in kst_duplicates}
+ coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_duplicates}
+
+ # Find overlaps and differences - exclude items that have duplicates
+ matched_items = kst_items.intersection(coordi_items)
+
+ # For "only" items: exclude those that have duplicates within their own dataset
+ kst_only_items = {item for item in kst_items - coordi_items
+ if (item.title, item.episode) not in kst_duplicate_keys}
+ coordi_only_items = {item for item in coordi_items - kst_items
+ if (item.title, item.episode) not in coordi_duplicate_keys}
+
categorization = {
'matched_items': list(matched_items),
'kst_only_items': list(kst_only_items),
@@ -245,10 +277,16 @@ class KSTCoordiComparator:
kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates}
coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates}
+ # Count actual instances for each item
+ from collections import Counter
+ kst_counts = Counter((item.title, item.episode) for item in kst_sheet_items)
+ coordi_counts = Counter((item.title, item.episode) for item in coordi_sheet_items)
+
# Find matched items that also have duplicates within the same sheet
for title, episode in matched_in_sheet:
# Check if this matched item has duplicates in KST within this sheet
if (title, episode) in kst_duplicate_keys:
+ kst_count = kst_counts[(title, episode)]
mixed_duplicates.append({
'title': title,
'episode': episode,
@@ -256,11 +294,13 @@ class KSTCoordiComparator:
'row_index': None, # Could get from items if needed
'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}',
'mismatch_type': 'MIXED_DUPLICATE_KST',
- 'duplicate_side': 'KST'
+ 'duplicate_side': 'KST',
+ 'duplicate_count': kst_count
})
# Check if this matched item has duplicates in Coordi within this sheet
if (title, episode) in coordi_duplicate_keys:
+ coordi_count = coordi_counts[(title, episode)]
mixed_duplicates.append({
'title': title,
'episode': episode,
@@ -268,7 +308,8 @@ class KSTCoordiComparator:
'row_index': None, # Could get from items if needed
'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}',
'mismatch_type': 'MIXED_DUPLICATE_COORDI',
- 'duplicate_side': 'COORDI'
+ 'duplicate_side': 'COORDI',
+ 'duplicate_count': coordi_count
})
return mixed_duplicates
@@ -340,7 +381,7 @@ class KSTCoordiComparator:
return mismatch_details
- def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]:
+ def get_comparison_summary(self, sheet_filter: str | None = None) -> Dict[str, Any]:
"""Get a comprehensive summary of the comparison for a specific sheet only"""
# Get sheet names for filtering options
sheet_names = list(self.data.keys()) if self.data else []
@@ -467,9 +508,146 @@ class KSTCoordiComparator:
return grouped
+    def generate_visualize_data(self, sheet_filter: str | None = None) -> List[Dict[str, Any]]:
+        """Build the flat, pre-sorted row list for the Excel-like "Visualize" view.
+
+        Rows are produced from ``get_comparison_summary(sheet_filter)`` in five
+        groups: Coordi-only, KST-only, mixed duplicates, pure duplicates and
+        perfect matches.
+
+        Args:
+            sheet_filter: Sheet name forwarded to ``get_comparison_summary``.
+
+        Returns:
+            A list of dicts with the keys ``coordi_title``, ``coordi_chapter``,
+            ``kst_title``, ``kst_chapter``, ``row_type``, ``reason``,
+            ``title_for_sort`` and ``priority``. Mismatch rows (priority 1)
+            come before matched rows (priority 2); within a priority rows are
+            ordered by title, then numeric episode, then episode text.
+        """
+        summary = self.get_comparison_summary(sheet_filter)
+        mismatch_details = summary['mismatch_details']
+
+        def create_row(coordi_title="", coordi_chapter="", kst_title="", kst_chapter="",
+                       row_type="matched", reason="", title_for_sort=""):
+            """Return one table row; a blank side means the item is absent there."""
+            return {
+                'coordi_title': coordi_title,
+                'coordi_chapter': coordi_chapter,
+                'kst_title': kst_title,
+                'kst_chapter': kst_chapter,
+                'row_type': row_type,
+                'reason': reason,
+                'title_for_sort': title_for_sort or coordi_title or kst_title,
+                'priority': 1 if row_type != 'matched' else 2  # Mismatches first
+            }
+
+        visualize_rows = []
+
+        # 1. Items that exist only in Coordi
+        visualize_rows.extend(
+            create_row(
+                coordi_title=item['title'],
+                coordi_chapter=item['episode'],
+                row_type='coordi_only',
+                reason='Only in Coordi'
+            )
+            for item in mismatch_details['coordi_only']
+        )
+
+        # 2. Items that exist only in KST
+        visualize_rows.extend(
+            create_row(
+                kst_title=item['title'],
+                kst_chapter=item['episode'],
+                row_type='kst_only',
+                reason='Only in KST'
+            )
+            for item in mismatch_details['kst_only']
+        )
+
+        # 3. Mixed duplicates (present on both sides, duplicated on at least one).
+        # Group by a (title, episode) tuple: a joined "title_episode" string
+        # would merge e.g. ('A_1', '2') with ('A', '1_2').
+        mixed_items: Dict[Tuple[Any, Any], Dict[str, Any]] = {}
+        for item in mismatch_details['mixed_duplicates']:
+            group = mixed_items.setdefault((item['title'], item['episode']), {
+                'title': item['title'],
+                'episode': item['episode'],
+                'kst_duplicate_count': 0,
+                'coordi_duplicate_count': 0
+            })
+            if item['duplicate_side'] == 'KST':
+                group['kst_duplicate_count'] = item.get('duplicate_count', 1)
+            elif item['duplicate_side'] == 'COORDI':
+                group['coordi_duplicate_count'] = item.get('duplicate_count', 1)
+
+        for group in mixed_items.values():
+            # First row: the item as it appears on both sides
+            visualize_rows.append(create_row(
+                coordi_title=group['title'],
+                coordi_chapter=group['episode'],
+                kst_title=group['title'],
+                kst_chapter=group['episode'],
+                row_type='mixed_duplicate',
+                reason='Mixed duplicate'
+            ))
+
+            # One extra row per surplus KST copy (count - 1: the first copy
+            # is already shown in the row above)
+            for _ in range(max(0, group['kst_duplicate_count'] - 1)):
+                visualize_rows.append(create_row(
+                    kst_title=group['title'],
+                    kst_chapter=group['episode'],
+                    row_type='mixed_duplicate',
+                    reason='Duplicate in KST',
+                    title_for_sort=group['title']
+                ))
+
+            # One extra row per surplus Coordi copy
+            for _ in range(max(0, group['coordi_duplicate_count'] - 1)):
+                visualize_rows.append(create_row(
+                    coordi_title=group['title'],
+                    coordi_chapter=group['episode'],
+                    row_type='mixed_duplicate',
+                    reason='Duplicate in Coordi',
+                    title_for_sort=group['title']
+                ))
+
+        # 4. Pure duplicates (duplicated within one dataset)
+        visualize_rows.extend(
+            create_row(
+                kst_title=item['title'],
+                kst_chapter=item['episode'],
+                row_type='pure_duplicate',
+                reason='Duplicate in KST'
+            )
+            for item in mismatch_details['kst_duplicates']
+        )
+        visualize_rows.extend(
+            create_row(
+                coordi_title=item['title'],
+                coordi_chapter=item['episode'],
+                row_type='pure_duplicate',
+                reason='Duplicate in Coordi'
+            )
+            for item in mismatch_details['coordi_duplicates']
+        )
+
+        # 5. Perfect matches
+        # NOTE(review): if matched_by_title also lists items reported as mixed
+        # duplicates, those items get a second 'Perfect match' row - confirm
+        # against get_comparison_summary.
+        for items in summary['grouped_by_title']['matched_by_title'].values():
+            visualize_rows.extend(
+                create_row(
+                    coordi_title=item['title'],
+                    coordi_chapter=item['episode'],
+                    kst_title=item['title'],
+                    kst_chapter=item['episode'],
+                    row_type='matched',
+                    reason='Perfect match'
+                )
+                for item in items
+            )
+
+        def sort_key(row):
+            """Order by priority, title, numeric episode, then episode text."""
+            episode = row.get('coordi_chapter') or row.get('kst_chapter') or ''
+            try:
+                episode_num = float(episode) if episode else 0
+            except (ValueError, TypeError):
+                episode_num = 0
+            if episode_num != episode_num:
+                # NaN compares false with everything and would corrupt the sort
+                episode_num = 0
+            # The episode text breaks ties between non-numeric labels so the
+            # output does not depend on the order the rows were collected in.
+            return (row['priority'], row['title_for_sort'], episode_num, str(episode))
+
+        visualize_rows.sort(key=sort_key)
+
+        return visualize_rows
- def print_comparison_summary(self, sheet_filter: str = None):
+ def print_comparison_summary(self, sheet_filter: str | None = None):
"""Print a formatted summary of the comparison for a specific sheet"""
summary = self.get_comparison_summary(sheet_filter)
diff --git a/templates/index.html b/templates/index.html
index bb50abd..4c437d2 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -173,6 +173,32 @@
border: 1px solid #ddd;
border-radius: 4px;
}
+
+ /* Vibrant color styles for Visualize tab */
+ .coordi-only-row {
+ background-color: #ff4444 !important; /* Bright red */
+ color: white;
+ }
+
+ .kst-only-row {
+ background-color: #4488ff !important; /* Bright blue */
+ color: white;
+ }
+
+ .mixed-duplicate-row {
+ background-color: #ff8800 !important; /* Bright orange */
+ color: white;
+ }
+
+ .pure-duplicate-row {
+ background-color: #8844ff !important; /* Bright purple */
+ color: white;
+ }
+
+ .matched-row {
+ background-color: white !important; /* White background */
+ color: black;
+ }
@@ -203,6 +229,7 @@
Summary
Different
+
Visualize
@@ -255,6 +282,25 @@
+
+
+
Data
+
+
+
+
+ | Coordi Title |
+ Coordi Chapter |
+ KST Title |
+ KST Chapter |
+ Status |
+
+
+
+
+
+
+
@@ -468,6 +514,9 @@
// Update Different tab
updateDifferentTable(results.mismatch_details);
+
+ // Update Visualize tab
+ updateVisualizeTable(results.visualize_data);
}
function updateSummaryTable(matchedData) {
@@ -587,6 +636,40 @@
});
}
+        /**
+         * Render the Visualize tab: one table row per backend entry, coloured
+         * by its row_type. Rows arrive already sorted by the backend
+         * (mismatches first, then matches, each by title).
+         */
+        function updateVisualizeTable(visualizeData) {
+            // Maps the backend's row_type to the CSS class that colours the row
+            const rowClasses = {
+                coordi_only: 'coordi-only-row',
+                kst_only: 'kst-only-row',
+                mixed_duplicate: 'mixed-duplicate-row',
+                pure_duplicate: 'pure-duplicate-row',
+                matched: 'matched-row'
+            };
+
+            const tbody = document.getElementById('visualize-table-body');
+            if (!tbody) {
+                return;
+            }
+            tbody.innerHTML = '';
+
+            // Tolerate a response without visualize_data instead of throwing
+            // and aborting the rest of the results update
+            const rows = Array.isArray(visualizeData) ? visualizeData : [];
+            rows.forEach(row => {
+                const tr = tbody.insertRow();
+                const cells = [
+                    row.coordi_title,
+                    row.coordi_chapter,
+                    row.kst_title,
+                    row.kst_chapter,
+                    row.reason
+                ];
+                cells.forEach((value, index) => {
+                    tr.insertCell(index).textContent = value || '';
+                });
+
+                // Unknown row types keep the table's default styling
+                if (Object.prototype.hasOwnProperty.call(rowClasses, row.row_type)) {
+                    tr.className = rowClasses[row.row_type];
+                }
+            });
+        }
+
// Auto-analyze on page load with default file
window.onload = function() {
// Initialize sheet filter with loading state
diff --git a/test_ba_confirmed_cases.py b/test_ba_confirmed_cases.py
deleted file mode 100644
index 31a417d..0000000
--- a/test_ba_confirmed_cases.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-
-from data_comparator import KSTCoordiComparator
-
-def test_ba_confirmed_cases():
- """Test that the comparison logic matches BA confirmed expectations"""
- print("Testing BA confirmed duplicate cases...")
-
- # Create comparator and load data
- comparator = KSTCoordiComparator("data/sample-data.xlsx")
- if not comparator.load_data():
- print("Failed to load data!")
- return
-
- print("\n=== US URGENT Sheet - BA Confirmed Cases ===")
- us_summary = comparator.get_comparison_summary('US URGENT')
-
- # Check for expected duplicates in US URGENT
- coordi_duplicates = us_summary['mismatch_details']['coordi_duplicates']
- mixed_duplicates = us_summary['mismatch_details']['mixed_duplicates']
-
- expected_coordi_duplicates = [
- ('금수의 영역', '17'),
- ('신결', '23')
- ]
-
- expected_mixed_duplicates = [
- ('트윈 가이드', '31')
- ]
-
- print("Coordi duplicates found:")
- found_coordi = []
- for item in coordi_duplicates:
- key = (item['title'], item['episode'])
- found_coordi.append(key)
- print(f" - {item['title']} - Episode {item['episode']}")
-
- print("\nMixed duplicates found:")
- found_mixed = []
- for item in mixed_duplicates:
- key = (item['title'], item['episode'])
- found_mixed.append(key)
- print(f" - {item['title']} - Episode {item['episode']} ({item['reason']})")
-
- # Verify expected cases
- print("\n✓ Verification:")
- for expected in expected_coordi_duplicates:
- if expected in found_coordi:
- print(f" ✓ Found expected Coordi duplicate: {expected[0]} - Episode {expected[1]}")
- else:
- print(f" ✗ Missing expected Coordi duplicate: {expected[0]} - Episode {expected[1]}")
-
- for expected in expected_mixed_duplicates:
- if expected in found_mixed:
- print(f" ✓ Found expected mixed duplicate: {expected[0]} - Episode {expected[1]}")
- else:
- print(f" ✗ Missing expected mixed duplicate: {expected[0]} - Episode {expected[1]}")
-
- print("\n=== TH URGENT Sheet - BA Confirmed Cases ===")
- th_summary = comparator.get_comparison_summary('TH URGENT')
-
- # Check for expected duplicates in TH URGENT
- kst_duplicates = th_summary['mismatch_details']['kst_duplicates']
- coordi_only = th_summary['mismatch_details']['coordi_only']
-
- expected_kst_duplicates = [
- ('백라이트', '53-1x(휴재)')
- ]
-
- print("KST duplicates found:")
- found_kst = []
- for item in kst_duplicates:
- key = (item['title'], item['episode'])
- found_kst.append(key)
- print(f" - {item['title']} - Episode {item['episode']}")
-
- # Check that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi
- print("\nChecking that 백라이트 - Episode 53-1x(휴재) doesn't appear in Coordi:")
- found_in_coordi = False
- for item in coordi_only:
- if item['title'] == '백라이트' and item['episode'] == '53-1x(휴재)':
- found_in_coordi = True
- break
-
- if not found_in_coordi:
- print(" ✓ 백라이트 - Episode 53-1x(휴재) correctly does NOT appear in Coordi data")
- else:
- print(" ✗ 백라이트 - Episode 53-1x(휴재) incorrectly appears in Coordi data")
-
- # Verify expected cases
- print("\n✓ Verification:")
- for expected in expected_kst_duplicates:
- if expected in found_kst:
- print(f" ✓ Found expected KST duplicate: {expected[0]} - Episode {expected[1]}")
- else:
- print(f" ✗ Missing expected KST duplicate: {expected[0]} - Episode {expected[1]}")
-
- print("\n✓ All BA confirmed cases tested!")
-
-if __name__ == "__main__":
- test_ba_confirmed_cases()
\ No newline at end of file
diff --git a/test_sheet_filtering.py b/test_sheet_filtering.py
deleted file mode 100644
index 0d1ef12..0000000
--- a/test_sheet_filtering.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python3
-
-from data_comparator import KSTCoordiComparator
-
-def test_sheet_filtering():
- """Test that sheet filtering works correctly and defaults to first sheet"""
- print("Testing sheet filtering functionality...")
-
- # Create comparator and load data
- comparator = KSTCoordiComparator("data/sample-data.xlsx")
- if not comparator.load_data():
- print("Failed to load data!")
- return
-
- print(f"Available sheets: {list(comparator.data.keys())}")
-
- # Test 1: No sheet filter provided (should default to first sheet)
- print("\n=== TEST 1: No sheet filter (should default to first sheet) ===")
- try:
- summary1 = comparator.get_comparison_summary()
- print(f"Default sheet selected: {summary1['current_sheet_filter']}")
- print(f"KST total: {summary1['original_counts']['kst_total']}")
- print(f"Coordi total: {summary1['original_counts']['coordi_total']}")
- print(f"Matched: {summary1['matched_items_count']}")
- print("✓ Test 1 passed")
- except Exception as e:
- print(f"✗ Test 1 failed: {e}")
-
- # Test 2: Specific sheet filter
- sheet_names = list(comparator.data.keys())
- if len(sheet_names) > 1:
- second_sheet = sheet_names[1]
- print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===")
- try:
- summary2 = comparator.get_comparison_summary(second_sheet)
- print(f"Selected sheet: {summary2['current_sheet_filter']}")
- print(f"KST total: {summary2['original_counts']['kst_total']}")
- print(f"Coordi total: {summary2['original_counts']['coordi_total']}")
- print(f"Matched: {summary2['matched_items_count']}")
- print("✓ Test 2 passed")
- except Exception as e:
- print(f"✗ Test 2 failed: {e}")
- else:
- print("\n=== TEST 2: Skipped (only one sheet available) ===")
-
- # Test 3: Verify no duplicates across sheets (this was the original problem)
- print(f"\n=== TEST 3: Verify duplicate detection within single sheets only ===")
- for sheet_name in sheet_names:
- summary = comparator.get_comparison_summary(sheet_name)
- print(f"Sheet '{sheet_name}':")
- print(f" KST duplicates: {summary['mismatches']['kst_duplicates_count']}")
- print(f" Coordi duplicates: {summary['mismatches']['coordi_duplicates_count']}")
-
- print("\n✓ All tests completed!")
-
-if __name__ == "__main__":
- test_sheet_filtering()
\ No newline at end of file
diff --git a/web_gui.py b/web_gui.py
index 0510d9f..0e5a547 100644
--- a/web_gui.py
+++ b/web_gui.py
@@ -61,6 +61,10 @@ def analyze_data():
# Add matched data to results
comparison_results['matched_data'] = matched_items_data
+ # Generate visualize data
+ visualize_data = comparator_instance.generate_visualize_data(sheet_filter)
+ comparison_results['visualize_data'] = visualize_data
+
return jsonify({
'success': True,
'results': comparison_results
@@ -307,6 +311,32 @@ def create_templates_dir():
border: 1px solid #ddd;
border-radius: 4px;
}
+
+ /* Vibrant color styles for Visualize tab */
+ .coordi-only-row {
+ background-color: #ff4444 !important; /* Bright red */
+ color: white;
+ }
+
+ .kst-only-row {
+ background-color: #4488ff !important; /* Bright blue */
+ color: white;
+ }
+
+ .mixed-duplicate-row {
+ background-color: #ff8800 !important; /* Bright orange */
+ color: white;
+ }
+
+ .pure-duplicate-row {
+ background-color: #8844ff !important; /* Bright purple */
+ color: white;
+ }
+
+ .matched-row {
+ background-color: white !important; /* White background */
+ color: black;
+ }
@@ -337,6 +367,7 @@ def create_templates_dir():
Summary
Different
+
Visualize
@@ -389,6 +420,25 @@ def create_templates_dir():
+
+
+
Data
+
+
+
+
+ | Coordi Title |
+ Coordi Chapter |
+ KST Title |
+ KST Chapter |
+ Status |
+
+
+
+
+
+
+
@@ -602,6 +652,9 @@ def create_templates_dir():
// Update Different tab
updateDifferentTable(results.mismatch_details);
+
+ // Update Visualize tab
+ updateVisualizeTable(results.visualize_data);
}
function updateSummaryTable(matchedData) {
@@ -721,6 +774,40 @@ def create_templates_dir():
});
}
+        /**
+         * Render the Visualize tab: one table row per backend entry, coloured
+         * by its row_type. Rows arrive already sorted by the backend
+         * (mismatches first, then matches, each by title).
+         */
+        function updateVisualizeTable(visualizeData) {
+            // Maps the backend's row_type to the CSS class that colours the row
+            const rowClasses = {
+                coordi_only: 'coordi-only-row',
+                kst_only: 'kst-only-row',
+                mixed_duplicate: 'mixed-duplicate-row',
+                pure_duplicate: 'pure-duplicate-row',
+                matched: 'matched-row'
+            };
+
+            const tbody = document.getElementById('visualize-table-body');
+            if (!tbody) {
+                return;
+            }
+            tbody.innerHTML = '';
+
+            // Tolerate a response without visualize_data instead of throwing
+            // and aborting the rest of the results update
+            const rows = Array.isArray(visualizeData) ? visualizeData : [];
+            rows.forEach(row => {
+                const tr = tbody.insertRow();
+                const cells = [
+                    row.coordi_title,
+                    row.coordi_chapter,
+                    row.kst_title,
+                    row.kst_chapter,
+                    row.reason
+                ];
+                cells.forEach((value, index) => {
+                    tr.insertCell(index).textContent = value || '';
+                });
+
+                // Unknown row types keep the table's default styling
+                if (Object.prototype.hasOwnProperty.call(rowClasses, row.row_type)) {
+                    tr.className = rowClasses[row.row_type];
+                }
+            });
+        }
+
// Auto-analyze on page load with default file
window.onload = function() {
// Initialize sheet filter with loading state