dev/viettran #2

Merged
IDS-Viet merged 2 commits from dev/viettran into main 2025-08-21 08:48:58 +00:00
3 changed files with 332 additions and 3 deletions
Showing only changes of commit 8d0351622b - Show all commits

View File

@ -3,7 +3,25 @@ import numpy as np
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass
@dataclass
def normalize_episode(episode: str) -> str:
    """Normalize an episode label so that '54' and '54.0' compare equal.

    Args:
        episode: The raw episode value. Normally a string; numeric values
            (e.g. cells read from a spreadsheet as int/float) are accepted
            and converted with ``str()`` before normalization.

    Returns:
        * ``None`` unchanged when ``episode`` is ``None``.
        * The input unchanged when it is empty or whitespace-only.
        * ``str(int(value))`` when the text parses as a whole number
          (``'54.0'`` -> ``'54'``, ``'054'`` -> ``'54'``).
        * ``str(float(value))`` when it parses as a non-whole number
          (``'54.5'`` -> ``'54.5'``).
        * The stripped text when it is not parseable as a number.
    """
    if episode is None:
        return episode

    if not isinstance(episode, str):
        # Calling .strip() on an int/float would raise AttributeError, so
        # numeric input is turned into text first. This also makes a numeric
        # 0 come back as '0' instead of being treated as "empty".
        episode = str(episode)

    text = episode.strip()
    if not text:
        # Blank input is handed back untouched (not stripped).
        return episode

    try:
        # Parse as float so both '54' and '54.0' take the same path.
        number = float(text)
    except ValueError:
        # Not numeric: keep the label, minus surrounding whitespace.
        return text

    if number.is_integer():
        # Whole numbers are rendered without the trailing '.0'.
        return str(int(number))
    # Genuine decimals keep their float representation.
    return str(number)
class ComparisonItem:
"""Represents a single item for comparison"""
title: str
@ -11,6 +29,12 @@ class ComparisonItem:
source_sheet: str
row_index: int
def __init__(self, title: str, episode: str, source_sheet: str, row_index: int):
    """Store the item's fields, normalizing the episode on the way in.

    Args:
        title: Stored as-is on ``self.title``.
        episode: Passed through ``normalize_episode`` before being stored,
            so the stored value is the normalized form.
        source_sheet: Stored as-is on ``self.source_sheet``.
        row_index: Stored as-is on ``self.row_index``.
    """
    # Plain pass-through attributes.
    self.title = title
    self.source_sheet = source_sheet
    self.row_index = row_index
    # The episode is the only field transformed at construction time.
    self.episode = normalize_episode(episode)
def __hash__(self):
    """Hash on the (title, episode) pair only.

    ``source_sheet`` and ``row_index`` do not take part in the hash.
    """
    identity = (self.title, self.episode)
    return hash(identity)
@ -340,7 +364,7 @@ class KSTCoordiComparator:
return mismatch_details
def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]:
def get_comparison_summary(self, sheet_filter: str | None = None) -> Dict[str, Any]:
"""Get a comprehensive summary of the comparison for a specific sheet only"""
# Get sheet names for filtering options
sheet_names = list(self.data.keys()) if self.data else []
@ -467,9 +491,144 @@ class KSTCoordiComparator:
return grouped
def generate_visualize_data(self, sheet_filter: str | None = None) -> List[Dict[str, Any]]:
"""Generate data structure for Excel-like visualization"""
# Get comparison data for the specified sheet
summary = self.get_comparison_summary(sheet_filter)
mismatch_details = summary['mismatch_details']
visualize_rows = []
# Helper function to create a row
def create_row(coordi_title="", coordi_chapter="", kst_title="", kst_chapter="",
row_type="matched", reason="", title_for_sort=""):
return {
'coordi_title': coordi_title,
'coordi_chapter': coordi_chapter,
'kst_title': kst_title,
'kst_chapter': kst_chapter,
'row_type': row_type,
'reason': reason,
'title_for_sort': title_for_sort or coordi_title or kst_title,
'priority': 1 if row_type != 'matched' else 2 # Mismatches first
}
# 1. Handle Coordi-only items
for item in mismatch_details['coordi_only']:
visualize_rows.append(create_row(
coordi_title=item['title'],
coordi_chapter=item['episode'],
row_type='coordi_only',
reason='Only in Coordi'
))
# 2. Handle KST-only items
for item in mismatch_details['kst_only']:
visualize_rows.append(create_row(
kst_title=item['title'],
kst_chapter=item['episode'],
row_type='kst_only',
reason='Only in KST'
))
# 3. Handle Mixed duplicates (exists in both but duplicated on one side)
mixed_items = {} # Group by title+episode
for item in mismatch_details['mixed_duplicates']:
key = f"{item['title']}_{item['episode']}"
if key not in mixed_items:
mixed_items[key] = {
'title': item['title'],
'episode': item['episode'],
'has_kst_duplicate': False,
'has_coordi_duplicate': False
}
if item['duplicate_side'] == 'KST':
mixed_items[key]['has_kst_duplicate'] = True
elif item['duplicate_side'] == 'COORDI':
mixed_items[key]['has_coordi_duplicate'] = True
for key, item in mixed_items.items():
# First row: show it exists in both
visualize_rows.append(create_row(
coordi_title=item['title'],
coordi_chapter=item['episode'],
kst_title=item['title'],
kst_chapter=item['episode'],
row_type='mixed_duplicate',
reason='Mixed duplicate'
))
# Additional rows for duplicates
if item['has_kst_duplicate']:
visualize_rows.append(create_row(
kst_title=item['title'],
kst_chapter=item['episode'],
row_type='mixed_duplicate',
reason='Duplicate in KST',
title_for_sort=item['title']
))
if item['has_coordi_duplicate']:
visualize_rows.append(create_row(
coordi_title=item['title'],
coordi_chapter=item['episode'],
row_type='mixed_duplicate',
reason='Duplicate in Coordi',
title_for_sort=item['title']
))
# 4. Handle Pure duplicates
for item in mismatch_details['kst_duplicates']:
visualize_rows.append(create_row(
kst_title=item['title'],
kst_chapter=item['episode'],
row_type='pure_duplicate',
reason='Duplicate in KST'
))
for item in mismatch_details['coordi_duplicates']:
visualize_rows.append(create_row(
coordi_title=item['title'],
coordi_chapter=item['episode'],
row_type='pure_duplicate',
reason='Duplicate in Coordi'
))
# 5. Handle Matched items (perfect matches)
matched_by_title = summary['grouped_by_title']['matched_by_title']
for title, items in matched_by_title.items():
for item in items:
visualize_rows.append(create_row(
coordi_title=item['title'],
coordi_chapter=item['episode'],
kst_title=item['title'],
kst_chapter=item['episode'],
row_type='matched',
reason='Perfect match'
))
# Sort: Mismatches first (priority 1), then matches (priority 2), then by Korean title + chapter
def sort_key(x):
# Extract episode number for proper numeric sorting
coordi_episode = x.get('coordi_chapter', '') or ''
kst_episode = x.get('kst_chapter', '') or ''
episode = coordi_episode or kst_episode
# Try to convert episode to number for proper sorting, fallback to string
try:
episode_num = float(episode) if episode else 0
except (ValueError, TypeError):
episode_num = 0
return (x['priority'], x['title_for_sort'], episode_num)
visualize_rows.sort(key=sort_key)
return visualize_rows
def print_comparison_summary(self, sheet_filter: str = None):
def print_comparison_summary(self, sheet_filter: str | None = None):
"""Print a formatted summary of the comparison for a specific sheet"""
summary = self.get_comparison_summary(sheet_filter)

View File

@ -173,6 +173,32 @@
border: 1px solid #ddd;
border-radius: 4px;
}
/* Vibrant color styles for Visualize tab */
.coordi-only-row {
background-color: #ff4444 !important; /* Bright red */
color: white;
}
.kst-only-row {
background-color: #4488ff !important; /* Bright blue */
color: white;
}
.mixed-duplicate-row {
background-color: #ff8800 !important; /* Bright orange */
color: white;
}
.pure-duplicate-row {
background-color: #8844ff !important; /* Bright purple */
color: white;
}
.matched-row {
background-color: white !important; /* White background */
color: black;
}
</style>
</head>
<body>
@ -203,6 +229,7 @@
<div class="tabs">
<div class="tab active" onclick="showTab('summary')">Summary</div>
<div class="tab" onclick="showTab('different')">Different</div>
<div class="tab" onclick="showTab('visualize')">Visualize</div>
</div>
<div id="summary" class="tab-content active">
@ -255,6 +282,25 @@
</table>
</div>
</div>
<div id="visualize" class="tab-content">
<h3>Data </h3>
<div class="table-container">
<table id="visualize-table">
<thead>
<tr>
<th>Coordi Title</th>
<th>Coordi Chapter</th>
<th>KST Title</th>
<th>KST Chapter</th>
<th>Status</th>
</tr>
</thead>
<tbody id="visualize-table-body">
</tbody>
</table>
</div>
</div>
</div>
</div>
@ -468,6 +514,9 @@
// Update Different tab
updateDifferentTable(results.mismatch_details);
// Update Visualize tab
updateVisualizeTable(results.visualize_data);
}
function updateSummaryTable(matchedData) {
@ -587,6 +636,40 @@
});
}
/**
 * Re-render the body of the Visualize table.
 *
 * Clears #visualize-table-body, then appends one <tr> per entry with the
 * cells: coordi_title, coordi_chapter, kst_title, kst_chapter, reason.
 * The row's CSS class is chosen from its row_type.
 *
 * @param {Array<Object>} visualizeData - rows as delivered in
 *     results.visualize_data; anything that is not an array leaves the
 *     table empty instead of throwing.
 */
function updateVisualizeTable(visualizeData) {
    const tbody = document.getElementById('visualize-table-body');
    tbody.innerHTML = '';

    // A response without visualize_data would otherwise throw on .forEach;
    // show an empty table instead.
    if (!Array.isArray(visualizeData)) {
        return;
    }

    // row_type -> CSS class. A Map is used so that only these exact keys
    // match (no inherited object properties).
    const rowClassByType = new Map([
        ['coordi_only', 'coordi-only-row'],
        ['kst_only', 'kst-only-row'],
        ['mixed_duplicate', 'mixed-duplicate-row'],
        ['pure_duplicate', 'pure-duplicate-row'],
        ['matched', 'matched-row']
    ]);

    // Rows arrive already sorted by the backend (mismatches first, then
    // matches; within each group by title and chapter), so order is kept.
    visualizeData.forEach(row => {
        const tr = tbody.insertRow();
        tr.insertCell(0).textContent = row.coordi_title || '';
        tr.insertCell(1).textContent = row.coordi_chapter || '';
        tr.insertCell(2).textContent = row.kst_title || '';
        tr.insertCell(3).textContent = row.kst_chapter || '';
        tr.insertCell(4).textContent = row.reason || '';

        // Unknown row types keep the default (unstyled) row.
        const cssClass = rowClassByType.get(row.row_type);
        if (cssClass) {
            tr.className = cssClass;
        }
    });
}
// Auto-analyze on page load with default file
window.onload = function() {
// Initialize sheet filter with loading state

View File

@ -61,6 +61,10 @@ def analyze_data():
# Add matched data to results
comparison_results['matched_data'] = matched_items_data
# Generate visualize data
visualize_data = comparator_instance.generate_visualize_data(sheet_filter)
comparison_results['visualize_data'] = visualize_data
return jsonify({
'success': True,
'results': comparison_results
@ -307,6 +311,32 @@ def create_templates_dir():
border: 1px solid #ddd;
border-radius: 4px;
}
/* Vibrant color styles for Visualize tab */
.coordi-only-row {
background-color: #ff4444 !important; /* Bright red */
color: white;
}
.kst-only-row {
background-color: #4488ff !important; /* Bright blue */
color: white;
}
.mixed-duplicate-row {
background-color: #ff8800 !important; /* Bright orange */
color: white;
}
.pure-duplicate-row {
background-color: #8844ff !important; /* Bright purple */
color: white;
}
.matched-row {
background-color: white !important; /* White background */
color: black;
}
</style>
</head>
<body>
@ -337,6 +367,7 @@ def create_templates_dir():
<div class="tabs">
<div class="tab active" onclick="showTab('summary')">Summary</div>
<div class="tab" onclick="showTab('different')">Different</div>
<div class="tab" onclick="showTab('visualize')">Visualize</div>
</div>
<div id="summary" class="tab-content active">
@ -389,6 +420,25 @@ def create_templates_dir():
</table>
</div>
</div>
<div id="visualize" class="tab-content">
<h3>Data </h3>
<div class="table-container">
<table id="visualize-table">
<thead>
<tr>
<th>Coordi Title</th>
<th>Coordi Chapter</th>
<th>KST Title</th>
<th>KST Chapter</th>
<th>Status</th>
</tr>
</thead>
<tbody id="visualize-table-body">
</tbody>
</table>
</div>
</div>
</div>
</div>
@ -602,6 +652,9 @@ def create_templates_dir():
// Update Different tab
updateDifferentTable(results.mismatch_details);
// Update Visualize tab
updateVisualizeTable(results.visualize_data);
}
function updateSummaryTable(matchedData) {
@ -721,6 +774,40 @@ def create_templates_dir():
});
}
function updateVisualizeTable(visualizeData) {
const tbody = document.getElementById('visualize-table-body');
tbody.innerHTML = '';
// Data is already sorted by the backend (mismatches first, then matches, all by Korean title)
visualizeData.forEach(row => {
const tr = tbody.insertRow();
tr.insertCell(0).textContent = row.coordi_title || '';
tr.insertCell(1).textContent = row.coordi_chapter || '';
tr.insertCell(2).textContent = row.kst_title || '';
tr.insertCell(3).textContent = row.kst_chapter || '';
tr.insertCell(4).textContent = row.reason || '';
// Apply vibrant color highlighting based on row type
switch (row.row_type) {
case 'coordi_only':
tr.className = 'coordi-only-row';
break;
case 'kst_only':
tr.className = 'kst-only-row';
break;
case 'mixed_duplicate':
tr.className = 'mixed-duplicate-row';
break;
case 'pure_duplicate':
tr.className = 'pure-duplicate-row';
break;
case 'matched':
tr.className = 'matched-row';
break;
}
});
}
// Auto-analyze on page load with default file
window.onload = function() {
// Initialize sheet filter with loading state