diff --git a/CHANGES_SUMMARY.md b/CHANGES_SUMMARY.md new file mode 100644 index 0000000..50b436e --- /dev/null +++ b/CHANGES_SUMMARY.md @@ -0,0 +1,82 @@ +# Changes Summary - Data Comparison Logic Fix + +## Issues Fixed + +### 1. Removed All-Sheet Functionality +- **Problem**: The tool was processing all sheets together, causing cross-sheet duplicate detection +- **Solution**: Completely removed all-sheet functionality, now only processes one sheet at a time +- **Changes**: + - Replaced `extract_kst_coordi_items()` with `extract_kst_coordi_items_for_sheet(sheet_name)` + - Updated all comparison methods to work sheet-specifically + +### 2. Fixed Duplicate Detection Logic +- **Problem**: Items appearing once on each side were incorrectly marked as duplicates +- **Solution**: Fixed `_find_duplicates_in_list()` to only return items that actually appear multiple times +- **Changes**: Used `Counter` to count occurrences and only return items with count > 1 + +### 3. Implemented Mixed Duplicate Priority +- **Problem**: Items showing as both pure duplicates and mixed duplicates +- **Solution**: Mixed duplicates (items in both datasets with duplicates on one side) now take priority +- **Changes**: Generate mixed duplicates first, then exclude those keys from pure duplicate lists + +### 4. Sheet-Specific Analysis Only +- **Problem**: Cross-sheet contamination in duplicate detection +- **Solution**: All analysis now happens within a single sheet context +- **Changes**: + - `get_comparison_summary()` now requires sheet filter and defaults to first sheet + - Removed old filtering methods, replaced with sheet-specific extraction + +## BA Confirmed Cases - All Working ✅ + +### US URGENT Sheet +- ✅ `금수의 영역 - Episode 17` → Coordi duplicate +- ✅ `신결 - Episode 23` → Coordi duplicate +- ✅ `트윈 가이드 - Episode 31` → Mixed duplicate (exists in both, duplicates in Coordi) +- ✅ No longer shows `트윈 가이드 - Episode 31` as pure Coordi duplicate + +### TH URGENT Sheet +- ✅ `백라이트 - Episode 53-1x(휴재)` → KST duplicate (doesn't appear in Coordi) + +## Code Changes Made + +### data_comparator.py +1. **New Methods**: + - `extract_kst_coordi_items_for_sheet(sheet_name)` - Sheet-specific extraction + - `categorize_mismatches_for_sheet(sheet_data)` - Sheet-specific categorization + - `generate_mismatch_details_for_sheet()` - Sheet-specific mismatch details with priority logic + - `group_by_title_for_sheet()` - Sheet-specific grouping + +2. **Updated Methods**: + - `_find_duplicates_in_list()` - Fixed to only return actual duplicates + - `get_comparison_summary()` - Now sheet-specific only + - `print_comparison_summary()` - Added sheet name to output + +3. **Removed Methods**: + - `extract_kst_coordi_items()` - Replaced with sheet-specific version + - `categorize_mismatches()` - Replaced with sheet-specific version + - `generate_mismatch_details()` - Replaced with sheet-specific version + - `group_by_title()` - Replaced with sheet-specific version + - `filter_by_sheet()` - No longer needed + - `filter_grouped_data_by_sheet()` - No longer needed + - `calculate_filtered_counts()` - No longer needed + +### web_gui.py +- Updated matched items extraction to use new grouped data structure +- Removed dependency on old `categorize_mismatches()` method + +### Test Files +- `test_ba_confirmed_cases.py` - New test to verify BA confirmed expectations +- `test_sheet_filtering.py` - Updated to work with new sheet-specific logic + +## Performance Improvements +- Faster analysis since no cross-sheet processing +- More accurate duplicate detection +- Cleaner separation of concerns between sheets + +## Verification +All tests pass: +- ✅ Sheet filtering works correctly +- ✅ Duplicate detection is accurate +- ✅ BA confirmed cases match expectations +- ✅ Web interface works properly +- ✅ Mixed duplicates take priority over pure duplicates \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 97532ec..40586fc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,7 +53,14 @@ The project uses Python 3.13+ with uv for dependency management. Dependencies in ## Comparison Logic The tool compares Excel data by: -1. Finding columns by header names (not positions) -2. Extracting title+episode combinations from both datasets -3. Categorizing mismatches and calculating reconciliation -4. Displaying results with reasons for each discrepancy \ No newline at end of file +1. **Sheet-specific analysis only** - No more "All Sheets" functionality, each sheet is analyzed independently +2. Finding columns by header names (not positions) +3. Extracting title+episode combinations from both datasets within the selected sheet +4. **Fixed duplicate detection** - Only items that appear multiple times within the same dataset are marked as duplicates +5. **Mixed duplicate priority** - Items that exist in both datasets but have duplicates on one side are prioritized over pure duplicates +6. Categorizing mismatches and calculating reconciliation +7. Displaying results with reasons for each discrepancy + +### BA Confirmed Cases +- **US URGENT**: `금수의 영역 - Episode 17`, `신결 - Episode 23` (Coordi duplicates), `트윈 가이드 - Episode 31` (mixed duplicate) +- **TH URGENT**: `백라이트 - Episode 53-1x(휴재)` (KST duplicate, doesn't appear in Coordi) \ No newline at end of file diff --git a/data_comparator.py b/data_comparator.py index e5fb884..9d79ecb 100644 --- a/data_comparator.py +++ b/data_comparator.py @@ -42,8 +42,14 @@ class KSTCoordiComparator: print(f"Error loading data: {e}") return False - def extract_kst_coordi_items(self) -> Dict[str, Any]: - """Extract KST and Coordi items from all sheets using column header names""" + def extract_kst_coordi_items_for_sheet(self, sheet_name: str) -> Dict[str, Any]: + """Extract KST and Coordi items from a specific sheet using column header names""" + if sheet_name not in self.data: + raise ValueError(f"Sheet '{sheet_name}' not found in data") + + df = self.data[sheet_name] + columns = df.columns.tolist() + kst_items = set() coordi_items = set() kst_details = [] @@ -51,96 +57,88 @@ class KSTCoordiComparator: kst_all_items = [] # Keep all items including duplicates coordi_all_items = [] # Keep all items including duplicates - for sheet_name, df in self.data.items(): - columns = df.columns.tolist() - - # Find columns by header names - # KST columns: 'Title KR' and 'Epi.' - # Coordi columns: 'KR title' and 'Chap' - - kst_title_col = None - kst_episode_col = None - coordi_title_col = None - coordi_episode_col = None - - # Find KST columns - for col in columns: - if col == 'Title KR': - kst_title_col = col - elif col == 'Epi.': - kst_episode_col = col - - # Find Coordi columns - for col in columns: - if col == 'KR title': - coordi_title_col = col - elif col == 'Chap': - coordi_episode_col = col - - print(f"Sheet: {sheet_name}") - print(f" KST columns - Title: {kst_title_col}, Episode: {kst_episode_col}") - print(f" Coordi columns - Title: {coordi_title_col}, Episode: {coordi_episode_col}") - - # Extract items from each row - for idx, row in df.iterrows(): - # Extract KST data - if kst_title_col and kst_episode_col: - kst_title = str(row.get(kst_title_col, '')).strip() - kst_episode = str(row.get(kst_episode_col, '')).strip() - - # Check if this row has valid KST data - has_kst_data = ( - kst_title and kst_title != 'nan' and - kst_episode and kst_episode != 'nan' and - pd.notna(row[kst_title_col]) and pd.notna(row[kst_episode_col]) - ) - - if has_kst_data: - item = ComparisonItem(kst_title, kst_episode, sheet_name, idx) - kst_items.add(item) - kst_all_items.append(item) # Keep all items for duplicate detection - kst_details.append({ - 'title': kst_title, - 'episode': kst_episode, - 'sheet': sheet_name, - 'row_index': idx, - 'kst_data': { - kst_title_col: row[kst_title_col], - kst_episode_col: row[kst_episode_col] - } - }) - - # Extract Coordi data - if coordi_title_col and coordi_episode_col: - coordi_title = str(row.get(coordi_title_col, '')).strip() - coordi_episode = str(row.get(coordi_episode_col, '')).strip() - - # Check if this row has valid Coordi data - has_coordi_data = ( - coordi_title and coordi_title != 'nan' and - coordi_episode and coordi_episode != 'nan' and - pd.notna(row[coordi_title_col]) and pd.notna(row[coordi_episode_col]) - ) - - if has_coordi_data: - item = ComparisonItem(coordi_title, coordi_episode, sheet_name, idx) - coordi_items.add(item) - coordi_all_items.append(item) # Keep all items for duplicate detection - coordi_details.append({ - 'title': coordi_title, - 'episode': coordi_episode, - 'sheet': sheet_name, - 'row_index': idx, - 'coordi_data': { - coordi_title_col: row[coordi_title_col], - coordi_episode_col: row[coordi_episode_col] - } - }) + # Find columns by header names + # KST columns: 'Title KR' and 'Epi.' + # Coordi columns: 'KR title' and 'Chap' - self.kst_items = kst_items - self.coordi_items = coordi_items - self.kst_all_items = kst_all_items # Store for duplicate detection - self.coordi_all_items = coordi_all_items # Store for duplicate detection + kst_title_col = None + kst_episode_col = None + coordi_title_col = None + coordi_episode_col = None + + # Find KST columns + for col in columns: + if col == 'Title KR': + kst_title_col = col + elif col == 'Epi.': + kst_episode_col = col + + # Find Coordi columns + for col in columns: + if col == 'KR title': + coordi_title_col = col + elif col == 'Chap': + coordi_episode_col = col + + print(f"Sheet: {sheet_name}") + print(f" KST columns - Title: {kst_title_col}, Episode: {kst_episode_col}") + print(f" Coordi columns - Title: {coordi_title_col}, Episode: {coordi_episode_col}") + + # Extract items from each row + for idx, row in df.iterrows(): + # Extract KST data + if kst_title_col and kst_episode_col: + kst_title = str(row.get(kst_title_col, '')).strip() + kst_episode = str(row.get(kst_episode_col, '')).strip() + + # Check if this row has valid KST data + has_kst_data = ( + kst_title and kst_title != 'nan' and + kst_episode and kst_episode != 'nan' and + pd.notna(row[kst_title_col]) and pd.notna(row[kst_episode_col]) + ) + + if has_kst_data: + item = ComparisonItem(kst_title, kst_episode, sheet_name, idx) + kst_items.add(item) + kst_all_items.append(item) # Keep all items for duplicate detection + kst_details.append({ + 'title': kst_title, + 'episode': kst_episode, + 'sheet': sheet_name, + 'row_index': idx, + 'kst_data': { + kst_title_col: row[kst_title_col], + kst_episode_col: row[kst_episode_col] + } + }) + + # Extract Coordi data + if coordi_title_col and coordi_episode_col: + coordi_title = str(row.get(coordi_title_col, '')).strip() + coordi_episode = str(row.get(coordi_episode_col, '')).strip() + + # Check if this row has valid Coordi data + has_coordi_data = ( + coordi_title and coordi_title != 'nan' and + coordi_episode and coordi_episode != 'nan' and + pd.notna(row[coordi_title_col]) and pd.notna(row[coordi_episode_col]) + ) + + if has_coordi_data: + item = ComparisonItem(coordi_title, coordi_episode, sheet_name, idx) + coordi_items.add(item) + coordi_all_items.append(item) # Keep all items for duplicate detection + coordi_details.append({ + 'title': coordi_title, + 'episode': coordi_episode, + 'sheet': sheet_name, + 'row_index': idx, + 'coordi_data': { + coordi_title_col: row[coordi_title_col], + coordi_episode_col: row[coordi_episode_col] + } + }) return { 'kst_items': kst_items, @@ -151,19 +149,21 @@ class KSTCoordiComparator: 'coordi_all_items': coordi_all_items } - def categorize_mismatches(self) -> Dict[str, Any]: - """Categorize data into KST-only, Coordi-only, and matched items""" - if not self.kst_items or not self.coordi_items: - self.extract_kst_coordi_items() + def categorize_mismatches_for_sheet(self, sheet_data: Dict[str, Any]) -> Dict[str, Any]: + """Categorize data into KST-only, Coordi-only, and matched items for a specific sheet""" + kst_items = sheet_data['kst_items'] + coordi_items = sheet_data['coordi_items'] + kst_all_items = sheet_data['kst_all_items'] + coordi_all_items = sheet_data['coordi_all_items'] # Find overlaps and differences - matched_items = self.kst_items.intersection(self.coordi_items) - kst_only_items = self.kst_items - self.coordi_items - coordi_only_items = self.coordi_items - self.kst_items + matched_items = kst_items.intersection(coordi_items) + kst_only_items = kst_items - coordi_items + coordi_only_items = coordi_items - kst_items - # Find duplicates within each dataset - kst_duplicates = self._find_duplicates_in_list(self.kst_all_items) - coordi_duplicates = self._find_duplicates_in_list(self.coordi_all_items) + # Find duplicates within each dataset - FIXED LOGIC + kst_duplicates = self._find_duplicates_in_list(kst_all_items) + coordi_duplicates = self._find_duplicates_in_list(coordi_all_items) categorization = { 'matched_items': list(matched_items), @@ -172,8 +172,8 @@ class KSTCoordiComparator: 'kst_duplicates': kst_duplicates, 'coordi_duplicates': coordi_duplicates, 'counts': { - 'total_kst': len(self.kst_items), - 'total_coordi': len(self.coordi_items), + 'total_kst': len(kst_items), + 'total_coordi': len(coordi_items), 'matched': len(matched_items), 'kst_only': len(kst_only_items), 'coordi_only': len(coordi_only_items), @@ -187,8 +187,8 @@ class KSTCoordiComparator: reconciled_coordi_count = len(matched_items) categorization['reconciliation'] = { - 'original_kst_count': len(self.kst_items), - 'original_coordi_count': len(self.coordi_items), + 'original_kst_count': len(kst_items), + 'original_coordi_count': len(coordi_items), 'reconciled_kst_count': reconciled_kst_count, 'reconciled_coordi_count': reconciled_coordi_count, 'counts_match_after_reconciliation': reconciled_kst_count == reconciled_coordi_count, @@ -199,30 +199,27 @@ class KSTCoordiComparator: return categorization def _find_duplicates_in_list(self, items_list: List[ComparisonItem]) -> List[ComparisonItem]: - """Find duplicate items within a dataset""" - seen = set() - duplicates = [] + """Find duplicate items within a dataset - FIXED to only return actual duplicates""" + from collections import Counter + # Count occurrences of each (title, episode) pair + key_counts = Counter((item.title, item.episode) for item in items_list) + + # Only return items that appear more than once + duplicates = [] for item in items_list: key = (item.title, item.episode) - if key in seen: + if key_counts[key] > 1: duplicates.append(item) - else: - seen.add(key) return duplicates - def _find_sheet_specific_mixed_duplicates(self, sheet_filter: str) -> List[Dict]: + def _find_sheet_specific_mixed_duplicates(self, sheet_data: Dict[str, Any], sheet_filter: str) -> List[Dict]: """Find mixed duplicates within a specific sheet only""" - if not sheet_filter: - return [] - mixed_duplicates = [] - # Extract items specific to this sheet - extract_results = self.extract_kst_coordi_items() - kst_sheet_items = [item for item in extract_results['kst_all_items'] if item.source_sheet == sheet_filter] - coordi_sheet_items = [item for item in extract_results['coordi_all_items'] if item.source_sheet == sheet_filter] + kst_sheet_items = sheet_data['kst_all_items'] + coordi_sheet_items = sheet_data['coordi_all_items'] # Find duplicates within this sheet kst_sheet_duplicates = self._find_duplicates_in_list(kst_sheet_items) @@ -265,10 +262,8 @@ class KSTCoordiComparator: return mixed_duplicates - def generate_mismatch_details(self) -> Dict[str, List[Dict]]: - """Generate detailed information about each type of mismatch with reasons""" - categorization = self.categorize_mismatches() - + def generate_mismatch_details_for_sheet(self, categorization: Dict[str, Any], sheet_data: Dict[str, Any], sheet_filter: str) -> Dict[str, List[Dict]]: + """Generate detailed information about each type of mismatch with reasons for a specific sheet""" mismatch_details = { 'kst_only': [], 'coordi_only': [], @@ -299,35 +294,43 @@ class KSTCoordiComparator: 'mismatch_type': 'COORDI_ONLY' }) - # KST duplicates + # Find mixed duplicates first (they take priority) + mixed_duplicates = self._find_sheet_specific_mixed_duplicates(sheet_data, sheet_filter) + mismatch_details['mixed_duplicates'] = mixed_duplicates + + # Create set of items that are already covered by mixed duplicates + mixed_duplicate_keys = {(item['title'], item['episode']) for item in mixed_duplicates} + + # KST duplicates - exclude those already covered by mixed duplicates for item in categorization['kst_duplicates']: - mismatch_details['kst_duplicates'].append({ - 'title': item.title, - 'episode': item.episode, - 'sheet': item.source_sheet, - 'row_index': item.row_index, - 'reason': 'Duplicate entry in KST data', - 'mismatch_type': 'KST_DUPLICATE' - }) + key = (item.title, item.episode) + if key not in mixed_duplicate_keys: + mismatch_details['kst_duplicates'].append({ + 'title': item.title, + 'episode': item.episode, + 'sheet': item.source_sheet, + 'row_index': item.row_index, + 'reason': 'Duplicate entry in KST data', + 'mismatch_type': 'KST_DUPLICATE' + }) - # Coordi duplicates + # Coordi duplicates - exclude those already covered by mixed duplicates for item in categorization['coordi_duplicates']: - mismatch_details['coordi_duplicates'].append({ - 'title': item.title, - 'episode': item.episode, - 'sheet': item.source_sheet, - 'row_index': item.row_index, - 'reason': 'Duplicate entry in Coordi data', - 'mismatch_type': 'COORDI_DUPLICATE' - }) - - # Mixed duplicates will be calculated per sheet in get_comparison_summary - mismatch_details['mixed_duplicates'] = [] + key = (item.title, item.episode) + if key not in mixed_duplicate_keys: + mismatch_details['coordi_duplicates'].append({ + 'title': item.title, + 'episode': item.episode, + 'sheet': item.source_sheet, + 'row_index': item.row_index, + 'reason': 'Duplicate entry in Coordi data', + 'mismatch_type': 'COORDI_DUPLICATE' + }) return mismatch_details def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]: - """Get a comprehensive summary of the comparison, filtered by a specific sheet""" + """Get a comprehensive summary of the comparison for a specific sheet only""" # Get sheet names for filtering options sheet_names = list(self.data.keys()) if self.data else [] @@ -338,33 +341,37 @@ class KSTCoordiComparator: if not sheet_filter: raise ValueError("No sheets available or sheet filter not specified") - categorization = self.categorize_mismatches() - mismatch_details = self.generate_mismatch_details() - grouped_data = self.group_by_title() + # Extract data for the specific sheet only + sheet_data = self.extract_kst_coordi_items_for_sheet(sheet_filter) - # Always apply sheet filtering (no more "All Sheets" option) - mismatch_details = self.filter_by_sheet(mismatch_details, sheet_filter) - grouped_data = self.filter_grouped_data_by_sheet(grouped_data, sheet_filter) + # Categorize mismatches for this sheet + categorization = self.categorize_mismatches_for_sheet(sheet_data) - # Calculate mixed duplicates specific to this sheet - mismatch_details['mixed_duplicates'] = self._find_sheet_specific_mixed_duplicates(sheet_filter) + # Generate mismatch details for this sheet + mismatch_details = self.generate_mismatch_details_for_sheet(categorization, sheet_data, sheet_filter) - # Recalculate counts for filtered data - filtered_counts = self.calculate_filtered_counts(mismatch_details) + # Group data by title for this sheet + grouped_data = self.group_by_title_for_sheet(categorization, sheet_filter) + + # Calculate counts + matched_count = len(categorization['matched_items']) + kst_total = len(sheet_data['kst_items']) + coordi_total = len(sheet_data['coordi_items']) summary = { 'sheet_names': sheet_names, 'current_sheet_filter': sheet_filter, 'original_counts': { - 'kst_total': filtered_counts['kst_total'], - 'coordi_total': filtered_counts['coordi_total'] + 'kst_total': kst_total, + 'coordi_total': coordi_total }, - 'matched_items_count': filtered_counts['matched'], + 'matched_items_count': matched_count, 'mismatches': { - 'kst_only_count': filtered_counts['kst_only_count'], - 'coordi_only_count': filtered_counts['coordi_only_count'], - 'kst_duplicates_count': filtered_counts['kst_duplicates_count'], - 'coordi_duplicates_count': filtered_counts['coordi_duplicates_count'] + 'kst_only_count': len(mismatch_details['kst_only']), + 'coordi_only_count': len(mismatch_details['coordi_only']), + 'kst_duplicates_count': len(mismatch_details['kst_duplicates']), + 'coordi_duplicates_count': len(mismatch_details['coordi_duplicates']), + 'mixed_duplicates_count': len(mismatch_details['mixed_duplicates']) }, 'reconciliation': categorization['reconciliation'], 'mismatch_details': mismatch_details, @@ -373,67 +380,8 @@ class KSTCoordiComparator: return summary - def filter_by_sheet(self, mismatch_details: Dict[str, List], sheet_filter: str) -> Dict[str, List]: - """Filter mismatch details by specific sheet""" - filtered = {} - for category, items in mismatch_details.items(): - filtered[category] = [item for item in items if item.get('sheet') == sheet_filter] - return filtered - - def filter_grouped_data_by_sheet(self, grouped_data: Dict, sheet_filter: str) -> Dict: - """Filter grouped data by specific sheet""" - filtered = { - 'kst_only_by_title': {}, - 'coordi_only_by_title': {}, - 'matched_by_title': {}, - 'title_summaries': {} - } - - # Filter each category - for category in ['kst_only_by_title', 'coordi_only_by_title', 'matched_by_title']: - for title, items in grouped_data[category].items(): - filtered_items = [item for item in items if item.get('sheet') == sheet_filter] - if filtered_items: - filtered[category][title] = filtered_items - - # Recalculate title summaries for filtered data - all_titles = set() - all_titles.update(filtered['kst_only_by_title'].keys()) - all_titles.update(filtered['coordi_only_by_title'].keys()) - all_titles.update(filtered['matched_by_title'].keys()) - - for title in all_titles: - kst_only_count = len(filtered['kst_only_by_title'].get(title, [])) - coordi_only_count = len(filtered['coordi_only_by_title'].get(title, [])) - matched_count = len(filtered['matched_by_title'].get(title, [])) - total_episodes = kst_only_count + coordi_only_count + matched_count - - filtered['title_summaries'][title] = { - 'total_episodes': total_episodes, - 'matched_count': matched_count, - 'kst_only_count': kst_only_count, - 'coordi_only_count': coordi_only_count, - 'match_percentage': round((matched_count / total_episodes * 100) if total_episodes > 0 else 0, 1), - 'has_mismatches': kst_only_count > 0 or coordi_only_count > 0 - } - - return filtered - - def calculate_filtered_counts(self, filtered_mismatch_details: Dict[str, List]) -> Dict[str, int]: - """Calculate counts for filtered data""" - return { - 'kst_total': len(filtered_mismatch_details['kst_only']) + len(filtered_mismatch_details['kst_duplicates']), - 'coordi_total': len(filtered_mismatch_details['coordi_only']) + len(filtered_mismatch_details['coordi_duplicates']), - 'matched': 0, # Will be calculated from matched data separately - 'kst_only_count': len(filtered_mismatch_details['kst_only']), - 'coordi_only_count': len(filtered_mismatch_details['coordi_only']), - 'kst_duplicates_count': len(filtered_mismatch_details['kst_duplicates']), - 'coordi_duplicates_count': len(filtered_mismatch_details['coordi_duplicates']), - 'mixed_duplicates_count': len(filtered_mismatch_details.get('mixed_duplicates', [])) - } - - def group_by_title(self) -> Dict[str, Any]: - """Group mismatches and matches by KR title""" + def group_by_title_for_sheet(self, categorization: Dict[str, Any], sheet_filter: str) -> Dict[str, Any]: + """Group mismatches and matches by KR title for a specific sheet""" from collections import defaultdict grouped = { @@ -443,33 +391,38 @@ class KSTCoordiComparator: 'title_summaries': {} } - # Get mismatch details - mismatch_details = self.generate_mismatch_details() - # Group KST only items by title - for item in mismatch_details['kst_only']: - title = item['title'] - grouped['kst_only_by_title'][title].append(item) + for item in categorization['kst_only_items']: + title = item.title + grouped['kst_only_by_title'][title].append({ + 'title': item.title, + 'episode': item.episode, + 'sheet': item.source_sheet, + 'row_index': item.row_index, + 'reason': 'Item exists in KST data but not in Coordi data' + }) # Group Coordi only items by title - for item in mismatch_details['coordi_only']: - title = item['title'] - grouped['coordi_only_by_title'][title].append(item) + for item in categorization['coordi_only_items']: + title = item.title + grouped['coordi_only_by_title'][title].append({ + 'title': item.title, + 'episode': item.episode, + 'sheet': item.source_sheet, + 'row_index': item.row_index, + 'reason': 'Item exists in Coordi data but not in KST data' + }) # Group matched items by title - if hasattr(self, 'kst_items') and hasattr(self, 'coordi_items'): - categorization = self.categorize_mismatches() - matched_items = categorization['matched_items'] - - for item in matched_items: - title = item.title - grouped['matched_by_title'][title].append({ - 'title': item.title, - 'episode': item.episode, - 'sheet': item.source_sheet, - 'row_index': item.row_index, - 'reason': 'Perfect match' - }) + for item in categorization['matched_items']: + title = item.title + grouped['matched_by_title'][title].append({ + 'title': item.title, + 'episode': item.episode, + 'sheet': item.source_sheet, + 'row_index': item.row_index, + 'reason': 'Perfect match' + }) # Create summary for each title all_titles = set() @@ -499,12 +452,14 @@ class KSTCoordiComparator: return grouped - def print_comparison_summary(self): - """Print a formatted summary of the comparison""" - summary = self.get_comparison_summary() + + + def print_comparison_summary(self, sheet_filter: str = None): + """Print a formatted summary of the comparison for a specific sheet""" + summary = self.get_comparison_summary(sheet_filter) print("=" * 80) - print("KST vs COORDI COMPARISON SUMMARY") + print(f"KST vs COORDI COMPARISON SUMMARY - Sheet: {summary['current_sheet_filter']}") print("=" * 80) print(f"Original Counts:") @@ -520,6 +475,7 @@ class KSTCoordiComparator: print(f" Coordi Only: {summary['mismatches']['coordi_only_count']}") print(f" KST Duplicates: {summary['mismatches']['kst_duplicates_count']}") print(f" Coordi Duplicates: {summary['mismatches']['coordi_duplicates_count']}") + print(f" Mixed Duplicates: {summary['mismatches']['mixed_duplicates_count']}") print() print(f"Reconciliation:") diff --git a/templates/index.html b/templates/index.html index c7404e6..b3d2b00 100644 --- a/templates/index.html +++ b/templates/index.html @@ -104,7 +104,17 @@ } .summary-card h3 { margin-top: 0; + margin-bottom: 15px; color: #333; + font-size: 1.1em; + } + .summary-card p { + margin: 8px 0; + color: #555; + } + .summary-card span { + font-weight: bold; + color: #007bff; } .count-badge { display: inline-block; @@ -196,6 +206,22 @@
Current Sheet: -
+Matched Items: 0 (Same in both KST and Coordi)
+Different Items: 0 (Total tasks excluding matched items)
+KST Only: 0
+Coordi Only: 0
+Duplicates: 0
+