compare data logic

This commit is contained in:
arthur 2025-08-20 14:58:30 +07:00
parent 47097f6be4
commit 1f88db5fb9
7 changed files with 495 additions and 303 deletions

View File

@ -1,6 +1,6 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from typing import Dict, List, Tuple, Any, Set from typing import Dict, List, Tuple, Any
from dataclasses import dataclass from dataclasses import dataclass
@dataclass @dataclass
@ -48,6 +48,8 @@ class KSTCoordiComparator:
coordi_items = set() coordi_items = set()
kst_details = [] kst_details = []
coordi_details = [] coordi_details = []
kst_all_items = [] # Keep all items including duplicates
coordi_all_items = [] # Keep all items including duplicates
for sheet_name, df in self.data.items(): for sheet_name, df in self.data.items():
columns = df.columns.tolist() columns = df.columns.tolist()
@ -96,6 +98,7 @@ class KSTCoordiComparator:
if has_kst_data: if has_kst_data:
item = ComparisonItem(kst_title, kst_episode, sheet_name, idx) item = ComparisonItem(kst_title, kst_episode, sheet_name, idx)
kst_items.add(item) kst_items.add(item)
kst_all_items.append(item) # Keep all items for duplicate detection
kst_details.append({ kst_details.append({
'title': kst_title, 'title': kst_title,
'episode': kst_episode, 'episode': kst_episode,
@ -122,6 +125,7 @@ class KSTCoordiComparator:
if has_coordi_data: if has_coordi_data:
item = ComparisonItem(coordi_title, coordi_episode, sheet_name, idx) item = ComparisonItem(coordi_title, coordi_episode, sheet_name, idx)
coordi_items.add(item) coordi_items.add(item)
coordi_all_items.append(item) # Keep all items for duplicate detection
coordi_details.append({ coordi_details.append({
'title': coordi_title, 'title': coordi_title,
'episode': coordi_episode, 'episode': coordi_episode,
@ -135,12 +139,16 @@ class KSTCoordiComparator:
self.kst_items = kst_items self.kst_items = kst_items
self.coordi_items = coordi_items self.coordi_items = coordi_items
self.kst_all_items = kst_all_items # Store for duplicate detection
self.coordi_all_items = coordi_all_items # Store for duplicate detection
return { return {
'kst_items': kst_items, 'kst_items': kst_items,
'coordi_items': coordi_items, 'coordi_items': coordi_items,
'kst_details': kst_details, 'kst_details': kst_details,
'coordi_details': coordi_details 'coordi_details': coordi_details,
'kst_all_items': kst_all_items,
'coordi_all_items': coordi_all_items
} }
def categorize_mismatches(self) -> Dict[str, Any]: def categorize_mismatches(self) -> Dict[str, Any]:
@ -154,8 +162,8 @@ class KSTCoordiComparator:
coordi_only_items = self.coordi_items - self.kst_items coordi_only_items = self.coordi_items - self.kst_items
# Find duplicates within each dataset # Find duplicates within each dataset
kst_duplicates = self._find_duplicates_in_set(self.kst_items) kst_duplicates = self._find_duplicates_in_list(self.kst_all_items)
coordi_duplicates = self._find_duplicates_in_set(self.coordi_items) coordi_duplicates = self._find_duplicates_in_list(self.coordi_all_items)
categorization = { categorization = {
'matched_items': list(matched_items), 'matched_items': list(matched_items),
@ -190,10 +198,8 @@ class KSTCoordiComparator:
return categorization return categorization
def _find_duplicates_in_set(self, items_set: Set[ComparisonItem]) -> List[ComparisonItem]: def _find_duplicates_in_list(self, items_list: List[ComparisonItem]) -> List[ComparisonItem]:
"""Find duplicate items within a dataset""" """Find duplicate items within a dataset"""
# Convert to list to check for duplicates
items_list = list(items_set)
seen = set() seen = set()
duplicates = [] duplicates = []

View File

@ -1,319 +1,319 @@
import tkinter as tk # import tkinter as tk
from tkinter import ttk, filedialog, messagebox # from tkinter import ttk, filedialog, messagebox
import pandas as pd # import pandas as pd
from pathlib import Path # from pathlib import Path
from data_comparator import KSTCoordiComparator # from data_comparator import KSTCoordiComparator
class DataComparisonGUI: # class DataComparisonGUI:
def __init__(self, root): # def __init__(self, root):
self.root = root # self.root = root
self.root.title("KST vs Coordi Data Comparison Tool") # self.root.title("KST vs Coordi Data Comparison Tool")
self.root.geometry("1200x800") # self.root.geometry("1200x800")
self.comparator = None # self.comparator = None
self.comparison_data = None # self.comparison_data = None
self.setup_ui() # self.setup_ui()
def setup_ui(self): # def setup_ui(self):
# Main container # # Main container
main_frame = ttk.Frame(self.root, padding="10") # main_frame = ttk.Frame(self.root, padding="10")
main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) # main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# Configure grid weights # # Configure grid weights
self.root.columnconfigure(0, weight=1) # self.root.columnconfigure(0, weight=1)
self.root.rowconfigure(0, weight=1) # self.root.rowconfigure(0, weight=1)
main_frame.columnconfigure(1, weight=1) # main_frame.columnconfigure(1, weight=1)
main_frame.rowconfigure(2, weight=1) # main_frame.rowconfigure(2, weight=1)
# Title # # Title
title_label = ttk.Label(main_frame, text="KST vs Coordi Data Comparison", # title_label = ttk.Label(main_frame, text="KST vs Coordi Data Comparison",
font=("Arial", 16, "bold")) # font=("Arial", 16, "bold"))
title_label.grid(row=0, column=0, columnspan=3, pady=(0, 20)) # title_label.grid(row=0, column=0, columnspan=3, pady=(0, 20))
# File selection frame # # File selection frame
file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10") # file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10")
file_frame.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 10)) # file_frame.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 10))
file_frame.columnconfigure(1, weight=1) # file_frame.columnconfigure(1, weight=1)
ttk.Label(file_frame, text="Excel File:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10)) # ttk.Label(file_frame, text="Excel File:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10))
self.file_path_var = tk.StringVar(value="data/sample-data.xlsx") # self.file_path_var = tk.StringVar(value="data/sample-data.xlsx")
self.file_entry = ttk.Entry(file_frame, textvariable=self.file_path_var, width=50) # self.file_entry = ttk.Entry(file_frame, textvariable=self.file_path_var, width=50)
self.file_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10)) # self.file_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10))
browse_btn = ttk.Button(file_frame, text="Browse", command=self.browse_file) # browse_btn = ttk.Button(file_frame, text="Browse", command=self.browse_file)
browse_btn.grid(row=0, column=2) # browse_btn.grid(row=0, column=2)
analyze_btn = ttk.Button(file_frame, text="Analyze Data", command=self.analyze_data) # analyze_btn = ttk.Button(file_frame, text="Analyze Data", command=self.analyze_data)
analyze_btn.grid(row=0, column=3, padx=(10, 0)) # analyze_btn.grid(row=0, column=3, padx=(10, 0))
# Results notebook (tabs) # # Results notebook (tabs)
self.notebook = ttk.Notebook(main_frame) # self.notebook = ttk.Notebook(main_frame)
self.notebook.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S)) # self.notebook.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S))
# Create tabs # # Create tabs
self.create_summary_tab() # self.create_summary_tab()
self.create_matched_tab() # self.create_matched_tab()
self.create_kst_only_tab() # self.create_kst_only_tab()
self.create_coordi_only_tab() # self.create_coordi_only_tab()
# Status bar # # Status bar
self.status_var = tk.StringVar(value="Ready - Select an Excel file and click 'Analyze Data'") # self.status_var = tk.StringVar(value="Ready - Select an Excel file and click 'Analyze Data'")
status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN) # status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN)
status_bar.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(10, 0)) # status_bar.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(10, 0))
def create_summary_tab(self): # def create_summary_tab(self):
# Summary tab # # Summary tab
summary_frame = ttk.Frame(self.notebook) # summary_frame = ttk.Frame(self.notebook)
self.notebook.add(summary_frame, text="Summary") # self.notebook.add(summary_frame, text="Summary")
# Configure grid # # Configure grid
summary_frame.columnconfigure(0, weight=1) # summary_frame.columnconfigure(0, weight=1)
summary_frame.rowconfigure(1, weight=1) # summary_frame.rowconfigure(1, weight=1)
# Summary text widget # # Summary text widget
summary_text_frame = ttk.LabelFrame(summary_frame, text="Comparison Summary", padding="10") # summary_text_frame = ttk.LabelFrame(summary_frame, text="Comparison Summary", padding="10")
summary_text_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10) # summary_text_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10)
summary_text_frame.columnconfigure(0, weight=1) # summary_text_frame.columnconfigure(0, weight=1)
summary_text_frame.rowconfigure(0, weight=1) # summary_text_frame.rowconfigure(0, weight=1)
self.summary_text = tk.Text(summary_text_frame, wrap=tk.WORD, height=15) # self.summary_text = tk.Text(summary_text_frame, wrap=tk.WORD, height=15)
summary_scrollbar = ttk.Scrollbar(summary_text_frame, orient=tk.VERTICAL, command=self.summary_text.yview) # summary_scrollbar = ttk.Scrollbar(summary_text_frame, orient=tk.VERTICAL, command=self.summary_text.yview)
self.summary_text.configure(yscrollcommand=summary_scrollbar.set) # self.summary_text.configure(yscrollcommand=summary_scrollbar.set)
self.summary_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) # self.summary_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
summary_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) # summary_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# Reconciliation info # # Reconciliation info
reconcile_frame = ttk.LabelFrame(summary_frame, text="Reconciliation Results", padding="10") # reconcile_frame = ttk.LabelFrame(summary_frame, text="Reconciliation Results", padding="10")
reconcile_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), padx=10, pady=(0, 10)) # reconcile_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), padx=10, pady=(0, 10))
self.reconcile_text = tk.Text(reconcile_frame, wrap=tk.WORD, height=8) # self.reconcile_text = tk.Text(reconcile_frame, wrap=tk.WORD, height=8)
reconcile_scrollbar = ttk.Scrollbar(reconcile_frame, orient=tk.VERTICAL, command=self.reconcile_text.yview) # reconcile_scrollbar = ttk.Scrollbar(reconcile_frame, orient=tk.VERTICAL, command=self.reconcile_text.yview)
self.reconcile_text.configure(yscrollcommand=reconcile_scrollbar.set) # self.reconcile_text.configure(yscrollcommand=reconcile_scrollbar.set)
self.reconcile_text.grid(row=0, column=0, sticky=(tk.W, tk.E)) # self.reconcile_text.grid(row=0, column=0, sticky=(tk.W, tk.E))
reconcile_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) # reconcile_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
reconcile_frame.columnconfigure(0, weight=1) # reconcile_frame.columnconfigure(0, weight=1)
def create_matched_tab(self): # def create_matched_tab(self):
matched_frame = ttk.Frame(self.notebook) # matched_frame = ttk.Frame(self.notebook)
self.notebook.add(matched_frame, text="Matched Items") # self.notebook.add(matched_frame, text="Matched Items")
self.create_data_table(matched_frame, "matched") # self.create_data_table(matched_frame, "matched")
def create_kst_only_tab(self): # def create_kst_only_tab(self):
kst_frame = ttk.Frame(self.notebook) # kst_frame = ttk.Frame(self.notebook)
self.notebook.add(kst_frame, text="KST Only") # self.notebook.add(kst_frame, text="KST Only")
self.create_data_table(kst_frame, "kst_only") # self.create_data_table(kst_frame, "kst_only")
def create_coordi_only_tab(self): # def create_coordi_only_tab(self):
coordi_frame = ttk.Frame(self.notebook) # coordi_frame = ttk.Frame(self.notebook)
self.notebook.add(coordi_frame, text="Coordi Only") # self.notebook.add(coordi_frame, text="Coordi Only")
self.create_data_table(coordi_frame, "coordi_only") # self.create_data_table(coordi_frame, "coordi_only")
def create_data_table(self, parent, table_type): # def create_data_table(self, parent, table_type):
# Configure grid # # Configure grid
parent.columnconfigure(0, weight=1) # parent.columnconfigure(0, weight=1)
parent.rowconfigure(1, weight=1) # parent.rowconfigure(1, weight=1)
# Info label # # Info label
info_frame = ttk.Frame(parent) # info_frame = ttk.Frame(parent)
info_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=10, pady=10) # info_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=10, pady=10)
info_frame.columnconfigure(1, weight=1) # info_frame.columnconfigure(1, weight=1)
count_label = ttk.Label(info_frame, text="Count:") # count_label = ttk.Label(info_frame, text="Count:")
count_label.grid(row=0, column=0, padx=(0, 10)) # count_label.grid(row=0, column=0, padx=(0, 10))
count_var = tk.StringVar(value="0") # count_var = tk.StringVar(value="0")
setattr(self, f"{table_type}_count_var", count_var) # setattr(self, f"{table_type}_count_var", count_var)
count_display = ttk.Label(info_frame, textvariable=count_var, font=("Arial", 10, "bold")) # count_display = ttk.Label(info_frame, textvariable=count_var, font=("Arial", 10, "bold"))
count_display.grid(row=0, column=1, sticky=tk.W) # count_display.grid(row=0, column=1, sticky=tk.W)
# Table frame # # Table frame
table_frame = ttk.Frame(parent) # table_frame = ttk.Frame(parent)
table_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=(0, 10)) # table_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=(0, 10))
table_frame.columnconfigure(0, weight=1) # table_frame.columnconfigure(0, weight=1)
table_frame.rowconfigure(0, weight=1) # table_frame.rowconfigure(0, weight=1)
# Create treeview # # Create treeview
columns = ("Title", "Episode", "Sheet", "Row", "Reason") # columns = ("Title", "Episode", "Sheet", "Row", "Reason")
tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20) # tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20)
# Configure columns # # Configure columns
tree.heading("Title", text="Title") # tree.heading("Title", text="Title")
tree.heading("Episode", text="Episode") # tree.heading("Episode", text="Episode")
tree.heading("Sheet", text="Sheet") # tree.heading("Sheet", text="Sheet")
tree.heading("Row", text="Row") # tree.heading("Row", text="Row")
tree.heading("Reason", text="Reason") # tree.heading("Reason", text="Reason")
tree.column("Title", width=300) # tree.column("Title", width=300)
tree.column("Episode", width=100) # tree.column("Episode", width=100)
tree.column("Sheet", width=120) # tree.column("Sheet", width=120)
tree.column("Row", width=80) # tree.column("Row", width=80)
tree.column("Reason", width=300) # tree.column("Reason", width=300)
# Scrollbars # # Scrollbars
v_scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=tree.yview) # v_scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=tree.yview)
h_scrollbar = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=tree.xview) # h_scrollbar = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=tree.xview)
tree.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set) # tree.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set)
tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) # tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
v_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) # v_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
h_scrollbar.grid(row=1, column=0, sticky=(tk.W, tk.E)) # h_scrollbar.grid(row=1, column=0, sticky=(tk.W, tk.E))
# Store tree widget # # Store tree widget
setattr(self, f"{table_type}_tree", tree) # setattr(self, f"{table_type}_tree", tree)
def browse_file(self): # def browse_file(self):
file_path = filedialog.askopenfilename( # file_path = filedialog.askopenfilename(
title="Select Excel File", # title="Select Excel File",
filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")] # filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")]
) # )
if file_path: # if file_path:
self.file_path_var.set(file_path) # self.file_path_var.set(file_path)
def analyze_data(self): # def analyze_data(self):
file_path = self.file_path_var.get().strip() # file_path = self.file_path_var.get().strip()
if not file_path: # if not file_path:
messagebox.showerror("Error", "Please select an Excel file") # messagebox.showerror("Error", "Please select an Excel file")
return # return
if not Path(file_path).exists(): # if not Path(file_path).exists():
messagebox.showerror("Error", f"File not found: {file_path}") # messagebox.showerror("Error", f"File not found: {file_path}")
return # return
try: # try:
self.status_var.set("Analyzing data...") # self.status_var.set("Analyzing data...")
self.root.update() # self.root.update()
# Create comparator and analyze # # Create comparator and analyze
self.comparator = KSTCoordiComparator(file_path) # self.comparator = KSTCoordiComparator(file_path)
if not self.comparator.load_data(): # if not self.comparator.load_data():
messagebox.showerror("Error", "Failed to load Excel data") # messagebox.showerror("Error", "Failed to load Excel data")
return # return
# Get comparison results # # Get comparison results
self.comparison_data = self.comparator.get_comparison_summary() # self.comparison_data = self.comparator.get_comparison_summary()
# Update GUI # # Update GUI
self.update_summary() # self.update_summary()
self.update_data_tables() # self.update_data_tables()
self.status_var.set("Analysis complete!") # self.status_var.set("Analysis complete!")
except Exception as e: # except Exception as e:
messagebox.showerror("Error", f"Analysis failed: {str(e)}") # messagebox.showerror("Error", f"Analysis failed: {str(e)}")
self.status_var.set("Analysis failed") # self.status_var.set("Analysis failed")
def update_summary(self): # def update_summary(self):
if not self.comparison_data: # if not self.comparison_data:
return # return
# Clear previous content # # Clear previous content
self.summary_text.delete(1.0, tk.END) # self.summary_text.delete(1.0, tk.END)
self.reconcile_text.delete(1.0, tk.END) # self.reconcile_text.delete(1.0, tk.END)
data = self.comparison_data # data = self.comparison_data
# Summary text # # Summary text
summary = f"""COMPARISON SUMMARY # summary = f"""COMPARISON SUMMARY
{'='*50} # {'='*50}
Original Counts: # Original Counts:
KST Total: {data['original_counts']['kst_total']:,} # KST Total: {data['original_counts']['kst_total']:,}
Coordi Total: {data['original_counts']['coordi_total']:,} # Coordi Total: {data['original_counts']['coordi_total']:,}
Matched Items: {data['matched_items_count']:,} # Matched Items: {data['matched_items_count']:,}
Mismatches: # Mismatches:
KST Only: {data['mismatches']['kst_only_count']:,} # KST Only: {data['mismatches']['kst_only_count']:,}
Coordi Only: {data['mismatches']['coordi_only_count']:,} # Coordi Only: {data['mismatches']['coordi_only_count']:,}
KST Duplicates: {data['mismatches']['kst_duplicates_count']:,} # KST Duplicates: {data['mismatches']['kst_duplicates_count']:,}
Coordi Duplicates: {data['mismatches']['coordi_duplicates_count']:,} # Coordi Duplicates: {data['mismatches']['coordi_duplicates_count']:,}
Total Mismatches: {data['mismatches']['kst_only_count'] + data['mismatches']['coordi_only_count'] + data['mismatches']['kst_duplicates_count'] + data['mismatches']['coordi_duplicates_count']:,} # Total Mismatches: {data['mismatches']['kst_only_count'] + data['mismatches']['coordi_only_count'] + data['mismatches']['kst_duplicates_count'] + data['mismatches']['coordi_duplicates_count']:,}
""" # """
self.summary_text.insert(tk.END, summary) # self.summary_text.insert(tk.END, summary)
# Reconciliation text # # Reconciliation text
reconcile = data['reconciliation'] # reconcile = data['reconciliation']
reconcile_info = f"""RECONCILIATION RESULTS # reconcile_info = f"""RECONCILIATION RESULTS
{'='*40} # {'='*40}
After excluding mismatches: # After excluding mismatches:
KST Count: {reconcile['reconciled_kst_count']:,} # KST Count: {reconcile['reconciled_kst_count']:,}
Coordi Count: {reconcile['reconciled_coordi_count']:,} # Coordi Count: {reconcile['reconciled_coordi_count']:,}
Counts Match: {'✅ YES' if reconcile['counts_match_after_reconciliation'] else '❌ NO'} # Counts Match: {'✅ YES' if reconcile['counts_match_after_reconciliation'] else '❌ NO'}
Items to exclude: # Items to exclude:
From KST: {reconcile['items_to_exclude_from_kst']:,} # From KST: {reconcile['items_to_exclude_from_kst']:,}
From Coordi: {reconcile['items_to_exclude_from_coordi']:,} # From Coordi: {reconcile['items_to_exclude_from_coordi']:,}
Final Result: Both datasets will have {reconcile['reconciled_kst_count']:,} matching items after reconciliation. # Final Result: Both datasets will have {reconcile['reconciled_kst_count']:,} matching items after reconciliation.
""" # """
self.reconcile_text.insert(tk.END, reconcile_info) # self.reconcile_text.insert(tk.END, reconcile_info)
def update_data_tables(self): # def update_data_tables(self):
if not self.comparison_data: # if not self.comparison_data:
return # return
mismatches = self.comparison_data['mismatch_details'] # mismatches = self.comparison_data['mismatch_details']
# Update matched items (create from intersection) # # Update matched items (create from intersection)
matched_count = self.comparison_data['matched_items_count'] # matched_count = self.comparison_data['matched_items_count']
self.matched_count_var.set(f"{matched_count:,}") # self.matched_count_var.set(f"{matched_count:,}")
# Clear matched tree # # Clear matched tree
for item in self.matched_tree.get_children(): # for item in self.matched_tree.get_children():
self.matched_tree.delete(item) # self.matched_tree.delete(item)
# Add matched items (we'll show the first few as examples) # # Add matched items (we'll show the first few as examples)
if self.comparator: # if self.comparator:
categorization = self.comparator.categorize_mismatches() # categorization = self.comparator.categorize_mismatches()
matched_items = categorization['matched_items'] # matched_items = categorization['matched_items']
for i, item in enumerate(list(matched_items)[:100]): # Show first 100 # for i, item in enumerate(list(matched_items)[:100]): # Show first 100
self.matched_tree.insert("", tk.END, values=( # self.matched_tree.insert("", tk.END, values=(
item.title, item.episode, item.source_sheet, item.row_index + 1, "Perfect match" # item.title, item.episode, item.source_sheet, item.row_index + 1, "Perfect match"
)) # ))
# Update KST only # # Update KST only
kst_only = mismatches['kst_only'] # kst_only = mismatches['kst_only']
self.kst_only_count_var.set(f"{len(kst_only):,}") # self.kst_only_count_var.set(f"{len(kst_only):,}")
for item in self.kst_only_tree.get_children(): # for item in self.kst_only_tree.get_children():
self.kst_only_tree.delete(item) # self.kst_only_tree.delete(item)
for mismatch in kst_only: # for mismatch in kst_only:
self.kst_only_tree.insert("", tk.END, values=( # self.kst_only_tree.insert("", tk.END, values=(
mismatch['title'], mismatch['episode'], mismatch['sheet'], # mismatch['title'], mismatch['episode'], mismatch['sheet'],
mismatch['row_index'] + 1, mismatch['reason'] # mismatch['row_index'] + 1, mismatch['reason']
)) # ))
# Update Coordi only # # Update Coordi only
coordi_only = mismatches['coordi_only'] # coordi_only = mismatches['coordi_only']
self.coordi_only_count_var.set(f"{len(coordi_only):,}") # self.coordi_only_count_var.set(f"{len(coordi_only):,}")
for item in self.coordi_only_tree.get_children(): # for item in self.coordi_only_tree.get_children():
self.coordi_only_tree.delete(item) # self.coordi_only_tree.delete(item)
for mismatch in coordi_only: # for mismatch in coordi_only:
self.coordi_only_tree.insert("", tk.END, values=( # self.coordi_only_tree.insert("", tk.END, values=(
mismatch['title'], mismatch['episode'], mismatch['sheet'], # mismatch['title'], mismatch['episode'], mismatch['sheet'],
mismatch['row_index'] + 1, mismatch['reason'] # mismatch['row_index'] + 1, mismatch['reason']
)) # ))
def main(): # def main():
root = tk.Tk() # root = tk.Tk()
app = DataComparisonGUI(root) # app = DataComparisonGUI(root)
root.mainloop() # root.mainloop()
if __name__ == "__main__": # if __name__ == "__main__":
main() # main()

View File

@ -404,6 +404,7 @@
// Update count displays // Update count displays
document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString(); document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString();
// Count all different items including duplicates
const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count + const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count +
results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count; results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count;
document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString(); document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString();
@ -443,50 +444,44 @@
const tbody = document.getElementById('different-table'); const tbody = document.getElementById('different-table');
tbody.innerHTML = ''; tbody.innerHTML = '';
// Combine all mismatches into one array for sorting // Create sets of duplicate items for highlighting
const kstDuplicateKeys = new Set();
const coordiDuplicateKeys = new Set();
mismatchDetails.kst_duplicates.forEach(item => {
kstDuplicateKeys.add(`${item.title}_${item.episode}`);
});
mismatchDetails.coordi_duplicates.forEach(item => {
coordiDuplicateKeys.add(`${item.title}_${item.episode}`);
});
// Combine only KST-only and Coordi-only items (like before)
const allDifferences = []; const allDifferences = [];
// Add KST-only items // Add KST-only items
mismatchDetails.kst_only.forEach(item => { mismatchDetails.kst_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({ allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`, kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '', coordiData: '',
reason: 'Only appears in KST', reason: 'Only appears in KST',
sortTitle: item.title, sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0 sortEpisode: parseFloat(item.episode) || 0,
isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate
}); });
}); });
// Add Coordi-only items // Add Coordi-only items
mismatchDetails.coordi_only.forEach(item => { mismatchDetails.coordi_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({ allDifferences.push({
kstData: '', kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`, coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Only appears in Coordi', reason: 'Only appears in Coordi',
sortTitle: item.title, sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0 sortEpisode: parseFloat(item.episode) || 0,
}); isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate
});
// Add KST duplicates
mismatchDetails.kst_duplicates.forEach(item => {
allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '',
reason: 'Duplicate in KST',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
});
});
// Add Coordi duplicates
mismatchDetails.coordi_duplicates.forEach(item => {
allDifferences.push({
kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Duplicate in Coordi',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
}); });
}); });
@ -497,12 +492,18 @@
return a.sortEpisode - b.sortEpisode; return a.sortEpisode - b.sortEpisode;
}); });
// Populate table // Populate table with highlighting
allDifferences.forEach(diff => { allDifferences.forEach(diff => {
const row = tbody.insertRow(); const row = tbody.insertRow();
row.insertCell(0).textContent = diff.kstData; row.insertCell(0).textContent = diff.kstData;
row.insertCell(1).textContent = diff.coordiData; row.insertCell(1).textContent = diff.coordiData;
row.insertCell(2).textContent = diff.reason; row.insertCell(2).textContent = diff.reason;
// Highlight row in yellow if it's also a duplicate
if (diff.isDuplicate) {
row.style.backgroundColor = '#fff3cd'; // Light yellow
row.title = 'This item also has duplicates in the dataset';
}
}); });
} }

64
test_duplicates.py Normal file
View File

@ -0,0 +1,64 @@
from data_comparator import KSTCoordiComparator
def _print_sample_duplicates(label, duplicates, limit=3):
    """Print up to *limit* items from *duplicates* under a "Sample <label> duplicates:" heading."""
    if not duplicates:
        return
    print(f"\nSample {label} duplicates:")
    for i, dup in enumerate(duplicates[:limit]):
        print(f" {i+1}. {dup.title} - Episode {dup.episode} (Sheet: {dup.source_sheet}, Row: {dup.row_index + 1})")


def _find_backlight_items(items):
    """Return the mismatch dicts whose title contains '백라이트' and whose episode contains '53-1x'."""
    # str() guards the substring test; NOTE(review): episodes are presumably
    # already strings here -- confirm against generate_mismatch_details().
    return [item for item in items
            if '백라이트' in item['title'] and '53-1x' in str(item['episode'])]


def test_duplicate_detection():
    """Print a duplicate-detection report for the sample workbook.

    Loads 'data/sample-data.xlsx' through KSTCoordiComparator, then prints:
    unique vs. total item counts, the number of duplicates per side with up
    to three samples each, where the '백라이트 - Episode 53-1x(휴재)' example
    shows up (KST-only vs. KST duplicates), and the total the web interface
    would display. The function only prints; it asserts nothing.

    If the workbook cannot be loaded, a failure line is printed and the
    function returns early instead of finishing silently.
    """
    sample_path = 'data/sample-data.xlsx'
    comparator = KSTCoordiComparator(sample_path)
    if not comparator.load_data():
        # Previously a failed load produced no output at all.
        print(f"✗ Failed to load data from {sample_path}")
        return

    print("=== DUPLICATE DETECTION TEST ===")

    # Get the data extraction results
    data = comparator.extract_kst_coordi_items()
    print(f"Total KST items (unique): {len(data['kst_items'])}")
    print(f"Total KST items (all): {len(data['kst_all_items'])}")
    print(f"Total Coordi items (unique): {len(data['coordi_items'])}")
    print(f"Total Coordi items (all): {len(data['coordi_all_items'])}")

    # Check for duplicates
    categorization = comparator.categorize_mismatches()
    kst_duplicates = categorization['kst_duplicates']
    coordi_duplicates = categorization['coordi_duplicates']
    print(f"\nKST duplicates found: {len(kst_duplicates)}")
    print(f"Coordi duplicates found: {len(coordi_duplicates)}")

    # Show sample duplicates
    _print_sample_duplicates("KST", kst_duplicates)
    _print_sample_duplicates("Coordi", coordi_duplicates)

    # Check for the specific example: 백라이트 - Episode 53-1x(휴재)
    mismatch_details = comparator.generate_mismatch_details()
    print("\nLooking for '백라이트 - Episode 53-1x(휴재)':")
    backlight_kst_only = _find_backlight_items(mismatch_details['kst_only'])
    backlight_kst_dup = _find_backlight_items(mismatch_details['kst_duplicates'])
    print(f" Found in KST-only: {len(backlight_kst_only)}")
    print(f" Found in KST duplicates: {len(backlight_kst_dup)}")
    if backlight_kst_only:
        print(f" KST-only details: {backlight_kst_only[0]}")
    if backlight_kst_dup:
        print(f" KST duplicate details: {backlight_kst_dup[0]}")

    # Test the web interface logic: it sums the four mismatch counters.
    print("\n=== Testing Web Interface Logic ===")
    summary = comparator.get_comparison_summary()
    mismatch_counts = summary['mismatches']
    total_different = (mismatch_counts['kst_only_count']
                       + mismatch_counts['coordi_only_count']
                       + mismatch_counts['kst_duplicates_count']
                       + mismatch_counts['coordi_duplicates_count'])
    print("Web interface will show:")
    print(f" Total different items: {total_different}")

    print("\n✓ Duplicate detection test complete!")
    print("✓ Check the web interface at http://localhost:8080 to see combined reasons")


if __name__ == "__main__":
    test_duplicate_detection()

View File

@ -0,0 +1,52 @@
import requests
def test_final_duplicate_fix():
    """Query the local ``/analyze`` endpoint and print the duplicate summary.

    Posts the sample workbook path to the server on port 8081, then prints
    the matched, KST-only, Coordi-only and duplicate counts, their combined
    total, and any KST duplicate whose title contains 백라이트 and whose
    episode contains 53-1x.

    ``requests`` exceptions, a non-200 status and an unsuccessful analysis
    are printed rather than raised.  Nothing is returned.
    """
    print("=== FINAL DUPLICATE FIX TEST ===")

    # (label shown in the output, key inside results['mismatches'])
    count_lines = (
        ("KST only", 'kst_only_count'),
        ("Coordi only", 'coordi_only_count'),
        ("KST duplicates", 'kst_duplicates_count'),
        ("Coordi duplicates", 'coordi_duplicates_count'),
    )

    try:
        # Test the analyze endpoint
        reply = requests.post(
            'http://localhost:8081/analyze',
            json={'file_path': 'data/sample-data.xlsx'},
            timeout=30,
        )
        if reply.status_code != 200:
            print(f"✗ Request failed: {reply.status_code}")
            return

        payload = reply.json()
        if not payload.get('success'):
            print(f"✗ Analysis failed: {payload.get('error')}")
            return

        results = payload['results']
        print("✓ Analysis successful!")
        print(f" Matched items: {results['matched_items_count']}")
        mismatches = results['mismatches']
        for label, key in count_lines:
            print(f" {label}: {mismatches[key]}")

        total_different = sum(mismatches[key] for _, key in count_lines)
        print(f" Total different items: {total_different}")

        # Check for the specific example
        hits = [
            entry
            for entry in results['mismatch_details']['kst_duplicates']
            if '백라이트' in entry['title'] and '53-1x' in entry['episode']
        ]
        if hits:
            first = hits[0]
            print(f"\n✓ Found 백라이트 duplicates: {len(hits)}")
            print(f" Example: {first['title']} - Episode {first['episode']}")

        print("\n✓ Web interface ready at http://localhost:8081")
        print("✓ The 'Different' tab will now show combined reasons like:")
        print(" 백라이트 - Episode 53-1x(휴재) | (empty) | Only appears in KST + Duplicate in KST")
    except requests.exceptions.RequestException as exc:
        print(f"✗ Request failed: {exc}")


if __name__ == "__main__":
    test_final_duplicate_fix()

View File

@ -0,0 +1,68 @@
import requests
def test_simplified_duplicates():
    """Query the local ``/analyze`` endpoint and print the display logic.

    Posts the sample workbook path to the server on port 8081 and prints the
    per-category counts.  It then prints two derived numbers: the badge
    total (all four mismatch counts) and the table row count (KST-only plus
    Coordi-only).  Finally it reports whether an entry whose title contains
    백라이트 and whose episode contains 53-1x is present in both the
    KST-only and the KST-duplicate lists.

    ``requests`` exceptions, a non-200 status and an unsuccessful analysis
    are printed rather than raised.  Nothing is returned.
    """
    print("=== SIMPLIFIED DUPLICATE DISPLAY TEST ===")

    def is_backlight_example(entry):
        # The sample row this script looks for in the analysis output.
        return '백라이트' in entry['title'] and '53-1x' in entry['episode']

    # (label shown in the output, key inside results['mismatches'])
    count_lines = (
        ("KST only", 'kst_only_count'),
        ("Coordi only", 'coordi_only_count'),
        ("KST duplicates", 'kst_duplicates_count'),
        ("Coordi duplicates", 'coordi_duplicates_count'),
    )

    try:
        # Test the analyze endpoint
        reply = requests.post(
            'http://localhost:8081/analyze',
            json={'file_path': 'data/sample-data.xlsx'},
            timeout=30,
        )
        if reply.status_code != 200:
            print(f"✗ Request failed: {reply.status_code}")
            return

        payload = reply.json()
        if not payload.get('success'):
            print(f"✗ Analysis failed: {payload.get('error')}")
            return

        results = payload['results']
        print("✓ Analysis successful!")
        print(f" Matched items: {results['matched_items_count']}")
        mismatches = results['mismatches']
        for label, key in count_lines:
            print(f" {label}: {mismatches[key]}")

        # What the count will show
        total_count = sum(mismatches[key] for _, key in count_lines)
        # What the table will show
        table_rows = mismatches['kst_only_count'] + mismatches['coordi_only_count']

        print("\n📊 DISPLAY LOGIC:")
        print(f" Count badge shows: {total_count} items (all different items)")
        print(f" Table shows: {table_rows} rows (only KST-only + Coordi-only)")
        print(" Yellow highlights: Items that are also duplicates")

        # Check for 백라이트 example
        details = results['mismatch_details']
        kst_only_rows = details['kst_only']
        kst_duplicate_rows = details['kst_duplicates']
        only_hits = [row for row in kst_only_rows if is_backlight_example(row)]
        dup_hits = [row for row in kst_duplicate_rows if is_backlight_example(row)]

        if only_hits and dup_hits:
            print("\n✓ 백라이트 example works:")
            print(" - Appears in table (KST-only): YES")
            print(" - Will be highlighted yellow: YES (also duplicate)")
            print(" - Contributes to count: 2 items (1 KST-only + 1 duplicate)")

        print("\n✓ Web interface ready at http://localhost:8081")
        print("✓ Check the 'Different' tab:")
        print(" - Count shows all different items")
        print(" - Table shows only KST-only + Coordi-only")
        print(" - Yellow rows = items that also have duplicates")
    except requests.exceptions.RequestException as exc:
        print(f"✗ Request failed: {exc}")


if __name__ == "__main__":
    test_simplified_duplicates()

View File

@ -512,6 +512,7 @@ def create_templates_dir():
// Update count displays // Update count displays
document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString(); document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString();
// Count all different items including duplicates
const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count + const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count +
results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count; results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count;
document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString(); document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString();
@ -551,50 +552,44 @@ def create_templates_dir():
const tbody = document.getElementById('different-table'); const tbody = document.getElementById('different-table');
tbody.innerHTML = ''; tbody.innerHTML = '';
// Combine all mismatches into one array for sorting // Create sets of duplicate items for highlighting
const kstDuplicateKeys = new Set();
const coordiDuplicateKeys = new Set();
mismatchDetails.kst_duplicates.forEach(item => {
kstDuplicateKeys.add(`${item.title}_${item.episode}`);
});
mismatchDetails.coordi_duplicates.forEach(item => {
coordiDuplicateKeys.add(`${item.title}_${item.episode}`);
});
// Combine only KST-only and Coordi-only items (like before)
const allDifferences = []; const allDifferences = [];
// Add KST-only items // Add KST-only items
mismatchDetails.kst_only.forEach(item => { mismatchDetails.kst_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({ allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`, kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '', coordiData: '',
reason: 'Only appears in KST', reason: 'Only appears in KST',
sortTitle: item.title, sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0 sortEpisode: parseFloat(item.episode) || 0,
isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate
}); });
}); });
// Add Coordi-only items // Add Coordi-only items
mismatchDetails.coordi_only.forEach(item => { mismatchDetails.coordi_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({ allDifferences.push({
kstData: '', kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`, coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Only appears in Coordi', reason: 'Only appears in Coordi',
sortTitle: item.title, sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0 sortEpisode: parseFloat(item.episode) || 0,
}); isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate
});
// Add KST duplicates
mismatchDetails.kst_duplicates.forEach(item => {
allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '',
reason: 'Duplicate in KST',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
});
});
// Add Coordi duplicates
mismatchDetails.coordi_duplicates.forEach(item => {
allDifferences.push({
kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Duplicate in Coordi',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
}); });
}); });
@ -605,12 +600,18 @@ def create_templates_dir():
return a.sortEpisode - b.sortEpisode; return a.sortEpisode - b.sortEpisode;
}); });
// Populate table // Populate table with highlighting
allDifferences.forEach(diff => { allDifferences.forEach(diff => {
const row = tbody.insertRow(); const row = tbody.insertRow();
row.insertCell(0).textContent = diff.kstData; row.insertCell(0).textContent = diff.kstData;
row.insertCell(1).textContent = diff.coordiData; row.insertCell(1).textContent = diff.coordiData;
row.insertCell(2).textContent = diff.reason; row.insertCell(2).textContent = diff.reason;
// Highlight row in yellow if it's also a duplicate
if (diff.isDuplicate) {
row.style.backgroundColor = '#fff3cd'; // Light yellow
row.title = 'This item also has duplicates in the dataset';
}
}); });
} }
@ -630,8 +631,8 @@ def main():
create_templates_dir() create_templates_dir()
print("Starting web-based GUI...") print("Starting web-based GUI...")
print("Open your browser and go to: http://localhost:8080") print("Open your browser and go to: http://localhost:8081")
app.run(debug=True, host='0.0.0.0', port=8080) app.run(debug=True, host='0.0.0.0', port=8081)
if __name__ == "__main__": if __name__ == "__main__":
main() main()