2025-08-21 08:48:58 +00:00
3 changed files with 332 additions and 3 deletions
--- a/data_comparator.py
+++ b/data_comparator.py
@@ -3,7 +3,25 @@ import numpy as np
 from typing import Dict, List, Tuple, Any
 from dataclasses import dataclass

-@dataclass
+def normalize_episode(episode: str) -> str:
+    """Normalize an episode label so '54', '54.0' and ' 54 ' compare equal.
+
+    Accepts str as well as raw numeric values (int/float) by converting them
+    to text first. Whole numbers are returned in integer form ('54'); other
+    numbers keep float form ('54.5'). Non-numeric text is returned stripped;
+    None and empty/blank input are returned unchanged.
+    """
+    # str() first: calling .strip() on an int/float would raise AttributeError.
+    text = "" if episode is None else str(episode).strip()
+    if not text:
+        return episode
+
+    try:
+        number = float(text)
+    except ValueError:
+        return text  # not a number (e.g. 'Special'): keep the stripped label
+    return str(int(number)) if number.is_integer() else str(number)
+
 class ComparisonItem:
    """Represents a single item for comparison"""
    title: str
@@ -11,6 +29,12 @@ class ComparisonItem:
    source_sheet: str
    row_index: int
    
+    def __init__(self, title: str, episode: str, source_sheet: str, row_index: int):
+        # The episode is stored normalized, so '54' and '54.0' hash identically.
+        self.title, self.episode = title, normalize_episode(episode)
+        # Originating sheet name and row index, kept exactly as supplied.
+        self.source_sheet, self.row_index = source_sheet, row_index
+    
    def __hash__(self):
        return hash((self.title, self.episode))
    
@@ -340,7 +364,7 @@ class KSTCoordiComparator:
        
        return mismatch_details
    
-    def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]:
+    def get_comparison_summary(self, sheet_filter: str | None = None) -> Dict[str, Any]:
        """Get a comprehensive summary of the comparison for a specific sheet only"""
        # Get sheet names for filtering options
        sheet_names = list(self.data.keys()) if self.data else []
@@ -467,9 +491,144 @@ class KSTCoordiComparator:
        
        return grouped
    
+    def generate_visualize_data(self, sheet_filter: str | None = None) -> List[Dict[str, Any]]:
+        """Build the flat, sorted row list shown in the Excel-like Visualize table.
+
+        Args:
+            sheet_filter: Forwarded unchanged to ``get_comparison_summary``.
+
+        Returns:
+            One dict per table row (keys: coordi_title, coordi_chapter, kst_title,
+            kst_chapter, row_type, reason, title_for_sort, priority). Mismatch rows
+            come first, then matched rows; each group is ordered by title and then
+            by numeric chapter.
+        """
+        summary = self.get_comparison_summary(sheet_filter)
+        mismatch_details = summary['mismatch_details']
+        visualize_rows = []
+
+        def create_row(coordi_title="", coordi_chapter="", kst_title="", kst_chapter="", 
+                      row_type="matched", reason="", title_for_sort=""):
+            return {
+                'coordi_title': coordi_title,
+                'coordi_chapter': coordi_chapter,
+                'kst_title': kst_title,
+                'kst_chapter': kst_chapter,
+                'row_type': row_type,
+                'reason': reason,
+                'title_for_sort': title_for_sort or coordi_title or kst_title,
+                'priority': 1 if row_type != 'matched' else 2  # Mismatches first
+            }
+
+        # 1. Handle Coordi-only items
+        for item in mismatch_details['coordi_only']:
+            visualize_rows.append(create_row(
+                coordi_title=item['title'],
+                coordi_chapter=item['episode'],
+                row_type='coordi_only',
+                reason='Only in Coordi'
+            ))
+
+        # 2. Handle KST-only items
+        for item in mismatch_details['kst_only']:
+            visualize_rows.append(create_row(
+                kst_title=item['title'],
+                kst_chapter=item['episode'],
+                row_type='kst_only',
+                reason='Only in KST'
+            ))
+
+        # 3. Handle Mixed duplicates (exists in both but duplicated on one side)
+        # Grouped by a (title, episode) tuple; a joined string key could collide.
+        mixed_items = {}
+        for item in mismatch_details['mixed_duplicates']:
+            group = mixed_items.setdefault((item['title'], item['episode']), {
+                'title': item['title'],
+                'episode': item['episode'],
+                'has_kst_duplicate': False,
+                'has_coordi_duplicate': False,
+            })
+            if item['duplicate_side'] == 'KST':
+                group['has_kst_duplicate'] = True
+            elif item['duplicate_side'] == 'COORDI':
+                group['has_coordi_duplicate'] = True
+
+        for item in mixed_items.values():
+            # First row: show it exists in both
+            visualize_rows.append(create_row(
+                coordi_title=item['title'],
+                coordi_chapter=item['episode'],
+                kst_title=item['title'],
+                kst_chapter=item['episode'],
+                row_type='mixed_duplicate',
+                reason='Mixed duplicate'
+            ))
+
+            # Additional rows for duplicates
+            if item['has_kst_duplicate']:
+                visualize_rows.append(create_row(
+                    kst_title=item['title'],
+                    kst_chapter=item['episode'],
+                    row_type='mixed_duplicate',
+                    reason='Duplicate in KST',
+                    title_for_sort=item['title']
+                ))
+
+            if item['has_coordi_duplicate']:
+                visualize_rows.append(create_row(
+                    coordi_title=item['title'],
+                    coordi_chapter=item['episode'],
+                    row_type='mixed_duplicate', 
+                    reason='Duplicate in Coordi',
+                    title_for_sort=item['title']
+                ))
+
+        # 4. Handle Pure duplicates
+        for item in mismatch_details['kst_duplicates']:
+            visualize_rows.append(create_row(
+                kst_title=item['title'],
+                kst_chapter=item['episode'],
+                row_type='pure_duplicate',
+                reason='Duplicate in KST'
+            ))
+
+        for item in mismatch_details['coordi_duplicates']:
+            visualize_rows.append(create_row(
+                coordi_title=item['title'],
+                coordi_chapter=item['episode'],
+                row_type='pure_duplicate',
+                reason='Duplicate in Coordi'
+            ))
+
+        # 5. Handle Matched items (perfect matches)
+        matched_by_title = summary['grouped_by_title']['matched_by_title']
+        for items in matched_by_title.values():
+            for item in items:
+                visualize_rows.append(create_row(
+                    coordi_title=item['title'],
+                    coordi_chapter=item['episode'],
+                    kst_title=item['title'],
+                    kst_chapter=item['episode'],
+                    row_type='matched',
+                    reason='Perfect match'
+                ))
+
+        # Sort: Mismatches first (priority 1), then matches (priority 2), then by Korean title + chapter
+        def sort_key(x):
+            episode = x.get('coordi_chapter') or x.get('kst_chapter') or ''
+            # Chapters sort numerically; empty or non-numeric chapters sort as 0
+            try:
+                episode_num = float(episode) if episode else 0
+            except (ValueError, TypeError):
+                episode_num = 0
+            return (x['priority'], x['title_for_sort'], episode_num)
+
+        visualize_rows.sort(key=sort_key)
+
+        return visualize_rows

    
-    def print_comparison_summary(self, sheet_filter: str = None):
+    def print_comparison_summary(self, sheet_filter: str | None = None):
        """Print a formatted summary of the comparison for a specific sheet"""
        summary = self.get_comparison_summary(sheet_filter)
        
--- a/templates/index.html
+++ b/templates/index.html
@@ -173,6 +173,32 @@
            border: 1px solid #ddd;
            border-radius: 4px;
        }
+        
+        /* Vibrant color styles for Visualize tab */
+        .coordi-only-row {
+            background-color: #ff4444 !important; /* Bright red */
+            color: white;
+        }
+        
+        .kst-only-row {
+            background-color: #4488ff !important; /* Bright blue */
+            color: white;
+        }
+        
+        .mixed-duplicate-row {
+            background-color: #ff8800 !important; /* Bright orange */
+            color: white;
+        }
+        
+        .pure-duplicate-row {
+            background-color: #8844ff !important; /* Bright purple */
+            color: white;
+        }
+        
+        .matched-row {
+            background-color: white !important; /* White background */
+            color: black;
+        }
    </style>
 </head>
 <body>
@@ -203,6 +229,7 @@
            <div class="tabs">
                <div class="tab active" onclick="showTab('summary')">Summary</div>
                <div class="tab" onclick="showTab('different')">Different</div>
+                <div class="tab" onclick="showTab('visualize')">Visualize</div>
            </div>
            
            <div id="summary" class="tab-content active">
@@ -255,6 +282,25 @@
                    </table>
                </div>
            </div>
+            
+            <div id="visualize" class="tab-content">
+                <h3>Data </h3>
+                <div class="table-container">
+                    <table id="visualize-table">
+                        <thead>
+                            <tr>
+                                <th>Coordi Title</th>
+                                <th>Coordi Chapter</th>
+                                <th>KST Title</th>
+                                <th>KST Chapter</th>
+                                <th>Status</th>
+                            </tr>
+                        </thead>
+                        <tbody id="visualize-table-body">
+                        </tbody>
+                    </table>
+                </div>
+            </div>
        </div>
    </div>

@@ -468,6 +514,9 @@
            
            // Update Different tab
            updateDifferentTable(results.mismatch_details);
+            
+            // Update Visualize tab
+            updateVisualizeTable(results.visualize_data);
        }
        
        function updateSummaryTable(matchedData) {
@@ -587,6 +636,40 @@
            });
        }
        
+        /**
+         * Rebuild the Visualize tab table from the rows produced by the backend.
+         * Every row fills the title/chapter cells plus the Status cell and gets
+         * the CSS class that matches its row_type.
+         */
+        function updateVisualizeTable(visualizeData) {
+            // Maps each backend row_type to the CSS class that colors its row.
+            const rowClassByType = new Map([
+                ['coordi_only', 'coordi-only-row'],
+                ['kst_only', 'kst-only-row'],
+                ['mixed_duplicate', 'mixed-duplicate-row'],
+                ['pure_duplicate', 'pure-duplicate-row'],
+                ['matched', 'matched-row']
+            ]);
+            // Row fields in column order; 'reason' fills the Status column.
+            const columns = ['coordi_title', 'coordi_chapter', 'kst_title', 'kst_chapter', 'reason'];
+
+            const body = document.getElementById('visualize-table-body');
+            body.innerHTML = '';
+
+            // Rows arrive pre-sorted from the backend, so they are rendered in order.
+            visualizeData.forEach(entry => {
+                const tableRow = body.insertRow();
+                columns.forEach((field, index) => {
+                    tableRow.insertCell(index).textContent = entry[field] || '';
+                });
+
+                // Unknown row types keep the default (unstyled) row.
+                if (rowClassByType.has(entry.row_type)) {
+                    tableRow.className = rowClassByType.get(entry.row_type);
+                }
+            });
+        }
+        
        // Auto-analyze on page load with default file
        window.onload = function() {
            // Initialize sheet filter with loading state
--- a/web_gui.py
+++ b/web_gui.py
@@ -61,6 +61,10 @@ def analyze_data():
        # Add matched data to results
        comparison_results['matched_data'] = matched_items_data
        
+        # Generate visualize data
+        visualize_data = comparator_instance.generate_visualize_data(sheet_filter)
+        comparison_results['visualize_data'] = visualize_data
+        
        return jsonify({
            'success': True,
            'results': comparison_results
@@ -307,6 +311,32 @@ def create_templates_dir():
            border: 1px solid #ddd;
            border-radius: 4px;
        }
+        
+        /* Vibrant color styles for Visualize tab */
+        .coordi-only-row {
+            background-color: #ff4444 !important; /* Bright red */
+            color: white;
+        }
+        
+        .kst-only-row {
+            background-color: #4488ff !important; /* Bright blue */
+            color: white;
+        }
+        
+        .mixed-duplicate-row {
+            background-color: #ff8800 !important; /* Bright orange */
+            color: white;
+        }
+        
+        .pure-duplicate-row {
+            background-color: #8844ff !important; /* Bright purple */
+            color: white;
+        }
+        
+        .matched-row {
+            background-color: white !important; /* White background */
+            color: black;
+        }
    </style>
 </head>
 <body>
@@ -337,6 +367,7 @@ def create_templates_dir():
            <div class="tabs">
                <div class="tab active" onclick="showTab('summary')">Summary</div>
                <div class="tab" onclick="showTab('different')">Different</div>
+                <div class="tab" onclick="showTab('visualize')">Visualize</div>
            </div>
            
            <div id="summary" class="tab-content active">
@@ -389,6 +420,25 @@ def create_templates_dir():
                    </table>
                </div>
            </div>
+            
+            <div id="visualize" class="tab-content">
+                <h3>Data </h3>
+                <div class="table-container">
+                    <table id="visualize-table">
+                        <thead>
+                            <tr>
+                                <th>Coordi Title</th>
+                                <th>Coordi Chapter</th>
+                                <th>KST Title</th>
+                                <th>KST Chapter</th>
+                                <th>Status</th>
+                            </tr>
+                        </thead>
+                        <tbody id="visualize-table-body">
+                        </tbody>
+                    </table>
+                </div>
+            </div>
        </div>
    </div>

@@ -602,6 +652,9 @@ def create_templates_dir():
            
            // Update Different tab
            updateDifferentTable(results.mismatch_details);
+            
+            // Update Visualize tab
+            updateVisualizeTable(results.visualize_data);
        }
        
        function updateSummaryTable(matchedData) {
@@ -721,6 +774,40 @@ def create_templates_dir():
            });
        }
        
+        /**
+         * Rebuild the Visualize tab table from the rows produced by the backend.
+         * Every row fills the title/chapter cells plus the Status cell and gets
+         * the CSS class that matches its row_type.
+         */
+        function updateVisualizeTable(visualizeData) {
+            // Maps each backend row_type to the CSS class that colors its row.
+            const rowClassByType = new Map([
+                ['coordi_only', 'coordi-only-row'],
+                ['kst_only', 'kst-only-row'],
+                ['mixed_duplicate', 'mixed-duplicate-row'],
+                ['pure_duplicate', 'pure-duplicate-row'],
+                ['matched', 'matched-row']
+            ]);
+            // Row fields in column order; 'reason' fills the Status column.
+            const columns = ['coordi_title', 'coordi_chapter', 'kst_title', 'kst_chapter', 'reason'];
+
+            const body = document.getElementById('visualize-table-body');
+            body.innerHTML = '';
+
+            // Rows arrive pre-sorted from the backend, so they are rendered in order.
+            visualizeData.forEach(entry => {
+                const tableRow = body.insertRow();
+                columns.forEach((field, index) => {
+                    tableRow.insertCell(index).textContent = entry[field] || '';
+                });
+
+                // Unknown row types keep the default (unstyled) row.
+                if (rowClassByType.has(entry.row_type)) {
+                    tableRow.className = rowClassByType.get(entry.row_type);
+                }
+            });
+        }
+        
        // Auto-analyze on page load with default file
        window.onload = function() {
            // Initialize sheet filter with loading state