compare data logic

This commit is contained in:
arthur 2025-08-20 14:58:30 +07:00
parent 47097f6be4
commit 1f88db5fb9
7 changed files with 495 additions and 303 deletions

View File

@ -1,6 +1,6 @@
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any, Set
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass
@dataclass
@ -48,6 +48,8 @@ class KSTCoordiComparator:
coordi_items = set()
kst_details = []
coordi_details = []
kst_all_items = [] # Keep all items including duplicates
coordi_all_items = [] # Keep all items including duplicates
for sheet_name, df in self.data.items():
columns = df.columns.tolist()
@ -96,6 +98,7 @@ class KSTCoordiComparator:
if has_kst_data:
item = ComparisonItem(kst_title, kst_episode, sheet_name, idx)
kst_items.add(item)
kst_all_items.append(item) # Keep all items for duplicate detection
kst_details.append({
'title': kst_title,
'episode': kst_episode,
@ -122,6 +125,7 @@ class KSTCoordiComparator:
if has_coordi_data:
item = ComparisonItem(coordi_title, coordi_episode, sheet_name, idx)
coordi_items.add(item)
coordi_all_items.append(item) # Keep all items for duplicate detection
coordi_details.append({
'title': coordi_title,
'episode': coordi_episode,
@ -135,12 +139,16 @@ class KSTCoordiComparator:
self.kst_items = kst_items
self.coordi_items = coordi_items
self.kst_all_items = kst_all_items # Store for duplicate detection
self.coordi_all_items = coordi_all_items # Store for duplicate detection
return {
'kst_items': kst_items,
'coordi_items': coordi_items,
'kst_details': kst_details,
'coordi_details': coordi_details
'coordi_details': coordi_details,
'kst_all_items': kst_all_items,
'coordi_all_items': coordi_all_items
}
def categorize_mismatches(self) -> Dict[str, Any]:
@ -154,8 +162,8 @@ class KSTCoordiComparator:
coordi_only_items = self.coordi_items - self.kst_items
# Find duplicates within each dataset
kst_duplicates = self._find_duplicates_in_set(self.kst_items)
coordi_duplicates = self._find_duplicates_in_set(self.coordi_items)
kst_duplicates = self._find_duplicates_in_list(self.kst_all_items)
coordi_duplicates = self._find_duplicates_in_list(self.coordi_all_items)
categorization = {
'matched_items': list(matched_items),
@ -190,10 +198,8 @@ class KSTCoordiComparator:
return categorization
def _find_duplicates_in_set(self, items_set: Set[ComparisonItem]) -> List[ComparisonItem]:
def _find_duplicates_in_list(self, items_list: List[ComparisonItem]) -> List[ComparisonItem]:
"""Find duplicate items within a dataset"""
# Convert to list to check for duplicates
items_list = list(items_set)
seen = set()
duplicates = []

View File

@ -1,319 +1,319 @@
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import pandas as pd
from pathlib import Path
from data_comparator import KSTCoordiComparator
# import tkinter as tk
# from tkinter import ttk, filedialog, messagebox
# import pandas as pd
# from pathlib import Path
# from data_comparator import KSTCoordiComparator
class DataComparisonGUI:
def __init__(self, root):
self.root = root
self.root.title("KST vs Coordi Data Comparison Tool")
self.root.geometry("1200x800")
# class DataComparisonGUI:
# def __init__(self, root):
# self.root = root
# self.root.title("KST vs Coordi Data Comparison Tool")
# self.root.geometry("1200x800")
self.comparator = None
self.comparison_data = None
# self.comparator = None
# self.comparison_data = None
self.setup_ui()
# self.setup_ui()
def setup_ui(self):
# Main container
main_frame = ttk.Frame(self.root, padding="10")
main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# def setup_ui(self):
# # Main container
# main_frame = ttk.Frame(self.root, padding="10")
# main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# Configure grid weights
self.root.columnconfigure(0, weight=1)
self.root.rowconfigure(0, weight=1)
main_frame.columnconfigure(1, weight=1)
main_frame.rowconfigure(2, weight=1)
# # Configure grid weights
# self.root.columnconfigure(0, weight=1)
# self.root.rowconfigure(0, weight=1)
# main_frame.columnconfigure(1, weight=1)
# main_frame.rowconfigure(2, weight=1)
# Title
title_label = ttk.Label(main_frame, text="KST vs Coordi Data Comparison",
font=("Arial", 16, "bold"))
title_label.grid(row=0, column=0, columnspan=3, pady=(0, 20))
# # Title
# title_label = ttk.Label(main_frame, text="KST vs Coordi Data Comparison",
# font=("Arial", 16, "bold"))
# title_label.grid(row=0, column=0, columnspan=3, pady=(0, 20))
# File selection frame
file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10")
file_frame.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 10))
file_frame.columnconfigure(1, weight=1)
# # File selection frame
# file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10")
# file_frame.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 10))
# file_frame.columnconfigure(1, weight=1)
ttk.Label(file_frame, text="Excel File:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10))
# ttk.Label(file_frame, text="Excel File:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10))
self.file_path_var = tk.StringVar(value="data/sample-data.xlsx")
self.file_entry = ttk.Entry(file_frame, textvariable=self.file_path_var, width=50)
self.file_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10))
# self.file_path_var = tk.StringVar(value="data/sample-data.xlsx")
# self.file_entry = ttk.Entry(file_frame, textvariable=self.file_path_var, width=50)
# self.file_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10))
browse_btn = ttk.Button(file_frame, text="Browse", command=self.browse_file)
browse_btn.grid(row=0, column=2)
# browse_btn = ttk.Button(file_frame, text="Browse", command=self.browse_file)
# browse_btn.grid(row=0, column=2)
analyze_btn = ttk.Button(file_frame, text="Analyze Data", command=self.analyze_data)
analyze_btn.grid(row=0, column=3, padx=(10, 0))
# analyze_btn = ttk.Button(file_frame, text="Analyze Data", command=self.analyze_data)
# analyze_btn.grid(row=0, column=3, padx=(10, 0))
# Results notebook (tabs)
self.notebook = ttk.Notebook(main_frame)
self.notebook.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S))
# # Results notebook (tabs)
# self.notebook = ttk.Notebook(main_frame)
# self.notebook.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S))
# Create tabs
self.create_summary_tab()
self.create_matched_tab()
self.create_kst_only_tab()
self.create_coordi_only_tab()
# # Create tabs
# self.create_summary_tab()
# self.create_matched_tab()
# self.create_kst_only_tab()
# self.create_coordi_only_tab()
# Status bar
self.status_var = tk.StringVar(value="Ready - Select an Excel file and click 'Analyze Data'")
status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN)
status_bar.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(10, 0))
# # Status bar
# self.status_var = tk.StringVar(value="Ready - Select an Excel file and click 'Analyze Data'")
# status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN)
# status_bar.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(10, 0))
def create_summary_tab(self):
# Summary tab
summary_frame = ttk.Frame(self.notebook)
self.notebook.add(summary_frame, text="Summary")
# def create_summary_tab(self):
# # Summary tab
# summary_frame = ttk.Frame(self.notebook)
# self.notebook.add(summary_frame, text="Summary")
# Configure grid
summary_frame.columnconfigure(0, weight=1)
summary_frame.rowconfigure(1, weight=1)
# # Configure grid
# summary_frame.columnconfigure(0, weight=1)
# summary_frame.rowconfigure(1, weight=1)
# Summary text widget
summary_text_frame = ttk.LabelFrame(summary_frame, text="Comparison Summary", padding="10")
summary_text_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10)
summary_text_frame.columnconfigure(0, weight=1)
summary_text_frame.rowconfigure(0, weight=1)
# # Summary text widget
# summary_text_frame = ttk.LabelFrame(summary_frame, text="Comparison Summary", padding="10")
# summary_text_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10)
# summary_text_frame.columnconfigure(0, weight=1)
# summary_text_frame.rowconfigure(0, weight=1)
self.summary_text = tk.Text(summary_text_frame, wrap=tk.WORD, height=15)
summary_scrollbar = ttk.Scrollbar(summary_text_frame, orient=tk.VERTICAL, command=self.summary_text.yview)
self.summary_text.configure(yscrollcommand=summary_scrollbar.set)
# self.summary_text = tk.Text(summary_text_frame, wrap=tk.WORD, height=15)
# summary_scrollbar = ttk.Scrollbar(summary_text_frame, orient=tk.VERTICAL, command=self.summary_text.yview)
# self.summary_text.configure(yscrollcommand=summary_scrollbar.set)
self.summary_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
summary_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# self.summary_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# summary_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# Reconciliation info
reconcile_frame = ttk.LabelFrame(summary_frame, text="Reconciliation Results", padding="10")
reconcile_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), padx=10, pady=(0, 10))
# # Reconciliation info
# reconcile_frame = ttk.LabelFrame(summary_frame, text="Reconciliation Results", padding="10")
# reconcile_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), padx=10, pady=(0, 10))
self.reconcile_text = tk.Text(reconcile_frame, wrap=tk.WORD, height=8)
reconcile_scrollbar = ttk.Scrollbar(reconcile_frame, orient=tk.VERTICAL, command=self.reconcile_text.yview)
self.reconcile_text.configure(yscrollcommand=reconcile_scrollbar.set)
# self.reconcile_text = tk.Text(reconcile_frame, wrap=tk.WORD, height=8)
# reconcile_scrollbar = ttk.Scrollbar(reconcile_frame, orient=tk.VERTICAL, command=self.reconcile_text.yview)
# self.reconcile_text.configure(yscrollcommand=reconcile_scrollbar.set)
self.reconcile_text.grid(row=0, column=0, sticky=(tk.W, tk.E))
reconcile_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# self.reconcile_text.grid(row=0, column=0, sticky=(tk.W, tk.E))
# reconcile_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
reconcile_frame.columnconfigure(0, weight=1)
# reconcile_frame.columnconfigure(0, weight=1)
def create_matched_tab(self):
matched_frame = ttk.Frame(self.notebook)
self.notebook.add(matched_frame, text="Matched Items")
# def create_matched_tab(self):
# matched_frame = ttk.Frame(self.notebook)
# self.notebook.add(matched_frame, text="Matched Items")
self.create_data_table(matched_frame, "matched")
# self.create_data_table(matched_frame, "matched")
def create_kst_only_tab(self):
kst_frame = ttk.Frame(self.notebook)
self.notebook.add(kst_frame, text="KST Only")
# def create_kst_only_tab(self):
# kst_frame = ttk.Frame(self.notebook)
# self.notebook.add(kst_frame, text="KST Only")
self.create_data_table(kst_frame, "kst_only")
# self.create_data_table(kst_frame, "kst_only")
def create_coordi_only_tab(self):
coordi_frame = ttk.Frame(self.notebook)
self.notebook.add(coordi_frame, text="Coordi Only")
# def create_coordi_only_tab(self):
# coordi_frame = ttk.Frame(self.notebook)
# self.notebook.add(coordi_frame, text="Coordi Only")
self.create_data_table(coordi_frame, "coordi_only")
# self.create_data_table(coordi_frame, "coordi_only")
def create_data_table(self, parent, table_type):
# Configure grid
parent.columnconfigure(0, weight=1)
parent.rowconfigure(1, weight=1)
# def create_data_table(self, parent, table_type):
# # Configure grid
# parent.columnconfigure(0, weight=1)
# parent.rowconfigure(1, weight=1)
# Info label
info_frame = ttk.Frame(parent)
info_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=10, pady=10)
info_frame.columnconfigure(1, weight=1)
# # Info label
# info_frame = ttk.Frame(parent)
# info_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=10, pady=10)
# info_frame.columnconfigure(1, weight=1)
count_label = ttk.Label(info_frame, text="Count:")
count_label.grid(row=0, column=0, padx=(0, 10))
# count_label = ttk.Label(info_frame, text="Count:")
# count_label.grid(row=0, column=0, padx=(0, 10))
count_var = tk.StringVar(value="0")
setattr(self, f"{table_type}_count_var", count_var)
count_display = ttk.Label(info_frame, textvariable=count_var, font=("Arial", 10, "bold"))
count_display.grid(row=0, column=1, sticky=tk.W)
# count_var = tk.StringVar(value="0")
# setattr(self, f"{table_type}_count_var", count_var)
# count_display = ttk.Label(info_frame, textvariable=count_var, font=("Arial", 10, "bold"))
# count_display.grid(row=0, column=1, sticky=tk.W)
# Table frame
table_frame = ttk.Frame(parent)
table_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=(0, 10))
table_frame.columnconfigure(0, weight=1)
table_frame.rowconfigure(0, weight=1)
# # Table frame
# table_frame = ttk.Frame(parent)
# table_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=(0, 10))
# table_frame.columnconfigure(0, weight=1)
# table_frame.rowconfigure(0, weight=1)
# Create treeview
columns = ("Title", "Episode", "Sheet", "Row", "Reason")
tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20)
# # Create treeview
# columns = ("Title", "Episode", "Sheet", "Row", "Reason")
# tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20)
# Configure columns
tree.heading("Title", text="Title")
tree.heading("Episode", text="Episode")
tree.heading("Sheet", text="Sheet")
tree.heading("Row", text="Row")
tree.heading("Reason", text="Reason")
# # Configure columns
# tree.heading("Title", text="Title")
# tree.heading("Episode", text="Episode")
# tree.heading("Sheet", text="Sheet")
# tree.heading("Row", text="Row")
# tree.heading("Reason", text="Reason")
tree.column("Title", width=300)
tree.column("Episode", width=100)
tree.column("Sheet", width=120)
tree.column("Row", width=80)
tree.column("Reason", width=300)
# tree.column("Title", width=300)
# tree.column("Episode", width=100)
# tree.column("Sheet", width=120)
# tree.column("Row", width=80)
# tree.column("Reason", width=300)
# Scrollbars
v_scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=tree.yview)
h_scrollbar = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=tree.xview)
tree.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set)
# # Scrollbars
# v_scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=tree.yview)
# h_scrollbar = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=tree.xview)
# tree.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set)
tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
v_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
h_scrollbar.grid(row=1, column=0, sticky=(tk.W, tk.E))
# tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# v_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# h_scrollbar.grid(row=1, column=0, sticky=(tk.W, tk.E))
# Store tree widget
setattr(self, f"{table_type}_tree", tree)
# # Store tree widget
# setattr(self, f"{table_type}_tree", tree)
def browse_file(self):
file_path = filedialog.askopenfilename(
title="Select Excel File",
filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")]
)
if file_path:
self.file_path_var.set(file_path)
# def browse_file(self):
# file_path = filedialog.askopenfilename(
# title="Select Excel File",
# filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")]
# )
# if file_path:
# self.file_path_var.set(file_path)
def analyze_data(self):
file_path = self.file_path_var.get().strip()
# def analyze_data(self):
# file_path = self.file_path_var.get().strip()
if not file_path:
messagebox.showerror("Error", "Please select an Excel file")
return
# if not file_path:
# messagebox.showerror("Error", "Please select an Excel file")
# return
if not Path(file_path).exists():
messagebox.showerror("Error", f"File not found: {file_path}")
return
# if not Path(file_path).exists():
# messagebox.showerror("Error", f"File not found: {file_path}")
# return
try:
self.status_var.set("Analyzing data...")
self.root.update()
# try:
# self.status_var.set("Analyzing data...")
# self.root.update()
# Create comparator and analyze
self.comparator = KSTCoordiComparator(file_path)
if not self.comparator.load_data():
messagebox.showerror("Error", "Failed to load Excel data")
return
# # Create comparator and analyze
# self.comparator = KSTCoordiComparator(file_path)
# if not self.comparator.load_data():
# messagebox.showerror("Error", "Failed to load Excel data")
# return
# Get comparison results
self.comparison_data = self.comparator.get_comparison_summary()
# # Get comparison results
# self.comparison_data = self.comparator.get_comparison_summary()
# Update GUI
self.update_summary()
self.update_data_tables()
# # Update GUI
# self.update_summary()
# self.update_data_tables()
self.status_var.set("Analysis complete!")
# self.status_var.set("Analysis complete!")
except Exception as e:
messagebox.showerror("Error", f"Analysis failed: {str(e)}")
self.status_var.set("Analysis failed")
# except Exception as e:
# messagebox.showerror("Error", f"Analysis failed: {str(e)}")
# self.status_var.set("Analysis failed")
def update_summary(self):
if not self.comparison_data:
return
# def update_summary(self):
# if not self.comparison_data:
# return
# Clear previous content
self.summary_text.delete(1.0, tk.END)
self.reconcile_text.delete(1.0, tk.END)
# # Clear previous content
# self.summary_text.delete(1.0, tk.END)
# self.reconcile_text.delete(1.0, tk.END)
data = self.comparison_data
# data = self.comparison_data
# Summary text
summary = f"""COMPARISON SUMMARY
{'='*50}
# # Summary text
# summary = f"""COMPARISON SUMMARY
# {'='*50}
Original Counts:
KST Total: {data['original_counts']['kst_total']:,}
Coordi Total: {data['original_counts']['coordi_total']:,}
# Original Counts:
# KST Total: {data['original_counts']['kst_total']:,}
# Coordi Total: {data['original_counts']['coordi_total']:,}
Matched Items: {data['matched_items_count']:,}
# Matched Items: {data['matched_items_count']:,}
Mismatches:
KST Only: {data['mismatches']['kst_only_count']:,}
Coordi Only: {data['mismatches']['coordi_only_count']:,}
KST Duplicates: {data['mismatches']['kst_duplicates_count']:,}
Coordi Duplicates: {data['mismatches']['coordi_duplicates_count']:,}
# Mismatches:
# KST Only: {data['mismatches']['kst_only_count']:,}
# Coordi Only: {data['mismatches']['coordi_only_count']:,}
# KST Duplicates: {data['mismatches']['kst_duplicates_count']:,}
# Coordi Duplicates: {data['mismatches']['coordi_duplicates_count']:,}
Total Mismatches: {data['mismatches']['kst_only_count'] + data['mismatches']['coordi_only_count'] + data['mismatches']['kst_duplicates_count'] + data['mismatches']['coordi_duplicates_count']:,}
"""
# Total Mismatches: {data['mismatches']['kst_only_count'] + data['mismatches']['coordi_only_count'] + data['mismatches']['kst_duplicates_count'] + data['mismatches']['coordi_duplicates_count']:,}
# """
self.summary_text.insert(tk.END, summary)
# self.summary_text.insert(tk.END, summary)
# Reconciliation text
reconcile = data['reconciliation']
reconcile_info = f"""RECONCILIATION RESULTS
{'='*40}
# # Reconciliation text
# reconcile = data['reconciliation']
# reconcile_info = f"""RECONCILIATION RESULTS
# {'='*40}
After excluding mismatches:
KST Count: {reconcile['reconciled_kst_count']:,}
Coordi Count: {reconcile['reconciled_coordi_count']:,}
Counts Match: {'✅ YES' if reconcile['counts_match_after_reconciliation'] else '❌ NO'}
# After excluding mismatches:
# KST Count: {reconcile['reconciled_kst_count']:,}
# Coordi Count: {reconcile['reconciled_coordi_count']:,}
# Counts Match: {'✅ YES' if reconcile['counts_match_after_reconciliation'] else '❌ NO'}
Items to exclude:
From KST: {reconcile['items_to_exclude_from_kst']:,}
From Coordi: {reconcile['items_to_exclude_from_coordi']:,}
# Items to exclude:
# From KST: {reconcile['items_to_exclude_from_kst']:,}
# From Coordi: {reconcile['items_to_exclude_from_coordi']:,}
Final Result: Both datasets will have {reconcile['reconciled_kst_count']:,} matching items after reconciliation.
"""
# Final Result: Both datasets will have {reconcile['reconciled_kst_count']:,} matching items after reconciliation.
# """
self.reconcile_text.insert(tk.END, reconcile_info)
# self.reconcile_text.insert(tk.END, reconcile_info)
def update_data_tables(self):
if not self.comparison_data:
return
# def update_data_tables(self):
# if not self.comparison_data:
# return
mismatches = self.comparison_data['mismatch_details']
# mismatches = self.comparison_data['mismatch_details']
# Update matched items (create from intersection)
matched_count = self.comparison_data['matched_items_count']
self.matched_count_var.set(f"{matched_count:,}")
# # Update matched items (create from intersection)
# matched_count = self.comparison_data['matched_items_count']
# self.matched_count_var.set(f"{matched_count:,}")
# Clear matched tree
for item in self.matched_tree.get_children():
self.matched_tree.delete(item)
# # Clear matched tree
# for item in self.matched_tree.get_children():
# self.matched_tree.delete(item)
# Add matched items (we'll show the first few as examples)
if self.comparator:
categorization = self.comparator.categorize_mismatches()
matched_items = categorization['matched_items']
for i, item in enumerate(list(matched_items)[:100]): # Show first 100
self.matched_tree.insert("", tk.END, values=(
item.title, item.episode, item.source_sheet, item.row_index + 1, "Perfect match"
))
# # Add matched items (we'll show the first few as examples)
# if self.comparator:
# categorization = self.comparator.categorize_mismatches()
# matched_items = categorization['matched_items']
# for i, item in enumerate(list(matched_items)[:100]): # Show first 100
# self.matched_tree.insert("", tk.END, values=(
# item.title, item.episode, item.source_sheet, item.row_index + 1, "Perfect match"
# ))
# Update KST only
kst_only = mismatches['kst_only']
self.kst_only_count_var.set(f"{len(kst_only):,}")
# # Update KST only
# kst_only = mismatches['kst_only']
# self.kst_only_count_var.set(f"{len(kst_only):,}")
for item in self.kst_only_tree.get_children():
self.kst_only_tree.delete(item)
# for item in self.kst_only_tree.get_children():
# self.kst_only_tree.delete(item)
for mismatch in kst_only:
self.kst_only_tree.insert("", tk.END, values=(
mismatch['title'], mismatch['episode'], mismatch['sheet'],
mismatch['row_index'] + 1, mismatch['reason']
))
# for mismatch in kst_only:
# self.kst_only_tree.insert("", tk.END, values=(
# mismatch['title'], mismatch['episode'], mismatch['sheet'],
# mismatch['row_index'] + 1, mismatch['reason']
# ))
# Update Coordi only
coordi_only = mismatches['coordi_only']
self.coordi_only_count_var.set(f"{len(coordi_only):,}")
# # Update Coordi only
# coordi_only = mismatches['coordi_only']
# self.coordi_only_count_var.set(f"{len(coordi_only):,}")
for item in self.coordi_only_tree.get_children():
self.coordi_only_tree.delete(item)
# for item in self.coordi_only_tree.get_children():
# self.coordi_only_tree.delete(item)
for mismatch in coordi_only:
self.coordi_only_tree.insert("", tk.END, values=(
mismatch['title'], mismatch['episode'], mismatch['sheet'],
mismatch['row_index'] + 1, mismatch['reason']
))
# for mismatch in coordi_only:
# self.coordi_only_tree.insert("", tk.END, values=(
# mismatch['title'], mismatch['episode'], mismatch['sheet'],
# mismatch['row_index'] + 1, mismatch['reason']
# ))
def main():
root = tk.Tk()
app = DataComparisonGUI(root)
root.mainloop()
# def main():
# root = tk.Tk()
# app = DataComparisonGUI(root)
# root.mainloop()
if __name__ == "__main__":
main()
# if __name__ == "__main__":
# main()

View File

@ -404,6 +404,7 @@
// Update count displays
document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString();
// Count all different items including duplicates
const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count +
results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count;
document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString();
@ -443,50 +444,44 @@
const tbody = document.getElementById('different-table');
tbody.innerHTML = '';
// Combine all mismatches into one array for sorting
// Create sets of duplicate items for highlighting
const kstDuplicateKeys = new Set();
const coordiDuplicateKeys = new Set();
mismatchDetails.kst_duplicates.forEach(item => {
kstDuplicateKeys.add(`${item.title}_${item.episode}`);
});
mismatchDetails.coordi_duplicates.forEach(item => {
coordiDuplicateKeys.add(`${item.title}_${item.episode}`);
});
// Combine only KST-only and Coordi-only items (like before)
const allDifferences = [];
// Add KST-only items
mismatchDetails.kst_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '',
reason: 'Only appears in KST',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
sortEpisode: parseFloat(item.episode) || 0,
isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate
});
});
// Add Coordi-only items
mismatchDetails.coordi_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({
kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Only appears in Coordi',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
});
});
// Add KST duplicates
mismatchDetails.kst_duplicates.forEach(item => {
allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '',
reason: 'Duplicate in KST',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
});
});
// Add Coordi duplicates
mismatchDetails.coordi_duplicates.forEach(item => {
allDifferences.push({
kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Duplicate in Coordi',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
sortEpisode: parseFloat(item.episode) || 0,
isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate
});
});
@ -497,12 +492,18 @@
return a.sortEpisode - b.sortEpisode;
});
// Populate table
// Populate table with highlighting
allDifferences.forEach(diff => {
const row = tbody.insertRow();
row.insertCell(0).textContent = diff.kstData;
row.insertCell(1).textContent = diff.coordiData;
row.insertCell(2).textContent = diff.reason;
// Highlight row in yellow if it's also a duplicate
if (diff.isDuplicate) {
row.style.backgroundColor = '#fff3cd'; // Light yellow
row.title = 'This item also has duplicates in the dataset';
}
});
}

64
test_duplicates.py Normal file
View File

@ -0,0 +1,64 @@
from data_comparator import KSTCoordiComparator
def test_duplicate_detection():
    """Print a duplicate-detection report for the sample workbook.

    Loads ``data/sample-data.xlsx`` through ``KSTCoordiComparator`` and
    prints, in order:

    * unique vs. total item counts for KST and Coordi,
    * the number of duplicates found in each dataset, with up to three
      samples per dataset,
    * whether the 백라이트 "53-1x" sample entry shows up in the KST-only
      and KST-duplicate mismatch details,
    * the "total different items" figure obtained by summing the four
      mismatch counters of the comparison summary.

    If the workbook cannot be loaded, a failure line is printed and the
    function returns without running any of the checks (previously this
    case produced no output at all).
    """

    def is_backlight_sample(item):
        # str() keeps the substring test safe should an episode ever be
        # delivered as a number instead of text.
        return '백라이트' in item['title'] and '53-1x' in str(item['episode'])

    comparator = KSTCoordiComparator('data/sample-data.xlsx')
    if not comparator.load_data():
        print("✗ Failed to load data/sample-data.xlsx - duplicate detection test not run")
        return

    print("=== DUPLICATE DETECTION TEST ===")

    # Unique (set-based) vs. complete (list-based) item counts.
    data = comparator.extract_kst_coordi_items()
    print(f"Total KST items (unique): {len(data['kst_items'])}")
    print(f"Total KST items (all): {len(data['kst_all_items'])}")
    print(f"Total Coordi items (unique): {len(data['coordi_items'])}")
    print(f"Total Coordi items (all): {len(data['coordi_all_items'])}")

    # Duplicate counts per dataset.
    categorization = comparator.categorize_mismatches()
    print(f"\nKST duplicates found: {len(categorization['kst_duplicates'])}")
    print(f"Coordi duplicates found: {len(categorization['coordi_duplicates'])}")

    # Show at most three sample duplicates for each dataset that has any.
    for label, key in (("KST", 'kst_duplicates'), ("Coordi", 'coordi_duplicates')):
        duplicates = categorization[key]
        if not duplicates:
            continue
        print(f"\nSample {label} duplicates:")
        for position, dup in enumerate(duplicates[:3], start=1):
            # row_index is shown one-based.
            print(f" {position}. {dup.title} - Episode {dup.episode} (Sheet: {dup.source_sheet}, Row: {dup.row_index + 1})")

    # Check for the specific example: 백라이트 - Episode 53-1x(휴재)
    mismatch_details = comparator.generate_mismatch_details()
    print("\nLooking for '백라이트 - Episode 53-1x(휴재)':")

    backlight_kst_only = [item for item in mismatch_details['kst_only']
                          if is_backlight_sample(item)]
    backlight_kst_dup = [item for item in mismatch_details['kst_duplicates']
                         if is_backlight_sample(item)]

    print(f" Found in KST-only: {len(backlight_kst_only)}")
    print(f" Found in KST duplicates: {len(backlight_kst_dup)}")

    if backlight_kst_only:
        print(f" KST-only details: {backlight_kst_only[0]}")
    if backlight_kst_dup:
        print(f" KST duplicate details: {backlight_kst_dup[0]}")

    # Reproduce the sum the web interface displays as "different" items.
    print("\n=== Testing Web Interface Logic ===")
    summary = comparator.get_comparison_summary()
    mismatches = summary['mismatches']
    total_different = sum(
        mismatches[counter]
        for counter in (
            'kst_only_count',
            'coordi_only_count',
            'kst_duplicates_count',
            'coordi_duplicates_count',
        )
    )
    print("Web interface will show:")
    print(f" Total different items: {total_different}")

    # NOTE(review): the listing this was reconstructed from had lost its
    # indentation; these closing lines are assumed to belong to the
    # successful-load path - confirm. The port in the hint is hard-coded;
    # verify it matches where the web app is actually served.
    print("\n✓ Duplicate detection test complete!")
    print("✓ Check the web interface at http://localhost:8080 to see combined reasons")


if __name__ == "__main__":
    test_duplicate_detection()

View File

@ -0,0 +1,52 @@
import requests
def test_final_duplicate_fix():
    """Call the running web app's ``/analyze`` endpoint and print the counts.

    Posts ``data/sample-data.xlsx`` to ``http://localhost:8081/analyze``
    and, on a successful analysis, prints the matched / KST-only /
    Coordi-only / duplicate counters, their "total different" sum, and
    whether the 백라이트 "53-1x" sample entry is reported among the KST
    duplicates.

    Every failure path prints a line starting with ``✗`` and returns:
    transport errors, a non-200 status, a body that is not valid JSON, or
    a payload whose ``success`` flag is falsy.
    """
    print("=== FINAL DUPLICATE FIX TEST ===")

    base_url = 'http://localhost:8081'

    # Only the network round-trip and body decoding sit inside the try,
    # so an unrelated error further down is not mistaken for a request
    # failure.
    try:
        response = requests.post(f'{base_url}/analyze',
                                 json={'file_path': 'data/sample-data.xlsx'},
                                 timeout=30)
        if response.status_code != 200:
            print(f"✗ Request failed: {response.status_code}")
            return
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"✗ Request failed: {e}")
        return
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) when the body is not valid JSON.
        print(f"✗ Request failed: invalid JSON response ({e})")
        return

    if not data.get('success'):
        print(f"✗ Analysis failed: {data.get('error')}")
        return

    results = data['results']
    mismatches = results['mismatches']

    print("✓ Analysis successful!")
    print(f" Matched items: {results['matched_items_count']}")
    print(f" KST only: {mismatches['kst_only_count']}")
    print(f" Coordi only: {mismatches['coordi_only_count']}")
    print(f" KST duplicates: {mismatches['kst_duplicates_count']}")
    print(f" Coordi duplicates: {mismatches['coordi_duplicates_count']}")

    total_different = (mismatches['kst_only_count'] +
                       mismatches['coordi_only_count'] +
                       mismatches['kst_duplicates_count'] +
                       mismatches['coordi_duplicates_count'])
    print(f" Total different items: {total_different}")

    # Check for the specific example among the KST duplicates.
    # str() keeps the substring test safe if the JSON carries a numeric
    # episode value.
    kst_duplicates = results['mismatch_details']['kst_duplicates']
    backlight_duplicates = [item for item in kst_duplicates
                            if '백라이트' in item['title'] and '53-1x' in str(item['episode'])]

    if backlight_duplicates:
        first = backlight_duplicates[0]
        print(f"\n✓ Found 백라이트 duplicates: {len(backlight_duplicates)}")
        print(f" Example: {first['title']} - Episode {first['episode']}")

    # NOTE(review): the listing this was reconstructed from had lost its
    # indentation; these hints are assumed to print on every successful
    # analysis rather than only when the sample entry is found - confirm.
    print(f"\n✓ Web interface ready at {base_url}")
    print("✓ The 'Different' tab will now show combined reasons like:")
    print(" 백라이트 - Episode 53-1x(휴재) | (empty) | Only appears in KST + Duplicate in KST")


if __name__ == "__main__":
    test_final_duplicate_fix()

View File

@ -0,0 +1,68 @@
import requests
def test_simplified_duplicates():
    """Smoke-check the /analyze endpoint and explain the simplified display.

    Posts the sample workbook path to the analysis service on
    localhost:8081, prints the returned counts, and then prints how those
    counts map onto the "Different" tab: the badge total (one-sided items
    plus duplicates) versus the table row count (one-sided items only).
    It also reports whether the 백라이트 "53-1x" sample is present in both
    the KST-only and KST-duplicate lists. Nothing is asserted or returned;
    a non-200 status, an unsuccessful analysis or a ``requests`` error is
    reported as a "✗" line on stdout.
    """
    print("=== SIMPLIFIED DUPLICATE DISPLAY TEST ===")

    try:
        # Ask the running server to analyse the bundled sample workbook.
        reply = requests.post(
            'http://localhost:8081/analyze',
            json={'file_path': 'data/sample-data.xlsx'},
            timeout=30,
        )

        # Bail out early on transport-level or analysis-level failure.
        if reply.status_code != 200:
            print(f"✗ Request failed: {reply.status_code}")
            return

        payload = reply.json()
        if not payload.get('success'):
            print(f"✗ Analysis failed: {payload.get('error')}")
            return

        results = payload['results']
        print("✓ Analysis successful!")
        print(f" Matched items: {results['matched_items_count']}")

        counts = results['mismatches']
        print(f" KST only: {counts['kst_only_count']}")
        print(f" Coordi only: {counts['coordi_only_count']}")
        print(f" KST duplicates: {counts['kst_duplicates_count']}")
        print(f" Coordi duplicates: {counts['coordi_duplicates_count']}")

        # Badge total: one-sided items plus duplicates on both sides.
        total_count = (
            counts['kst_only_count']
            + counts['coordi_only_count']
            + counts['kst_duplicates_count']
            + counts['coordi_duplicates_count']
        )
        # Table rows: one-sided items only.
        table_rows = counts['kst_only_count'] + counts['coordi_only_count']

        print(f"\n📊 DISPLAY LOGIC:")
        print(f" Count badge shows: {total_count} items (all different items)")
        print(f" Table shows: {table_rows} rows (only KST-only + Coordi-only)")
        print(f" Yellow highlights: Items that are also duplicates")

        # The sample row should be listed both as KST-only and as a KST duplicate.
        details = results['mismatch_details']
        kst_only = details['kst_only']
        kst_duplicates = details['kst_duplicates']
        backlight_kst_only = [
            entry
            for entry in kst_only
            if '백라이트' in entry['title'] and '53-1x' in entry['episode']
        ]
        backlight_kst_dup = [
            entry
            for entry in kst_duplicates
            if '백라이트' in entry['title'] and '53-1x' in entry['episode']
        ]
        if backlight_kst_only and backlight_kst_dup:
            print(f"\n✓ 백라이트 example works:")
            print(f" - Appears in table (KST-only): YES")
            print(f" - Will be highlighted yellow: YES (also duplicate)")
            print(f" - Contributes to count: 2 items (1 KST-only + 1 duplicate)")

        print(f"\n✓ Web interface ready at http://localhost:8081")
        print("✓ Check the 'Different' tab:")
        print(" - Count shows all different items")
        print(" - Table shows only KST-only + Coordi-only")
        print(" - Yellow rows = items that also have duplicates")
    except requests.exceptions.RequestException as exc:
        print(f"✗ Request failed: {exc}")


if __name__ == "__main__":
    test_simplified_duplicates()

View File

@ -512,6 +512,7 @@ def create_templates_dir():
// Update count displays
document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString();
// Count all different items including duplicates
const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count +
results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count;
document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString();
@ -551,50 +552,44 @@ def create_templates_dir():
const tbody = document.getElementById('different-table');
tbody.innerHTML = '';
// Combine all mismatches into one array for sorting
// Create sets of duplicate items for highlighting
const kstDuplicateKeys = new Set();
const coordiDuplicateKeys = new Set();
mismatchDetails.kst_duplicates.forEach(item => {
kstDuplicateKeys.add(`${item.title}_${item.episode}`);
});
mismatchDetails.coordi_duplicates.forEach(item => {
coordiDuplicateKeys.add(`${item.title}_${item.episode}`);
});
// Combine only KST-only and Coordi-only items (like before)
const allDifferences = [];
// Add KST-only items
mismatchDetails.kst_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '',
reason: 'Only appears in KST',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
sortEpisode: parseFloat(item.episode) || 0,
isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate
});
});
// Add Coordi-only items
mismatchDetails.coordi_only.forEach(item => {
const key = `${item.title}_${item.episode}`;
allDifferences.push({
kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Only appears in Coordi',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
});
});
// Add KST duplicates
mismatchDetails.kst_duplicates.forEach(item => {
allDifferences.push({
kstData: `${item.title} - Episode ${item.episode}`,
coordiData: '',
reason: 'Duplicate in KST',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
});
});
// Add Coordi duplicates
mismatchDetails.coordi_duplicates.forEach(item => {
allDifferences.push({
kstData: '',
coordiData: `${item.title} - Episode ${item.episode}`,
reason: 'Duplicate in Coordi',
sortTitle: item.title,
sortEpisode: parseFloat(item.episode) || 0
sortEpisode: parseFloat(item.episode) || 0,
isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate
});
});
@ -605,12 +600,18 @@ def create_templates_dir():
return a.sortEpisode - b.sortEpisode;
});
// Populate table
// Populate table with highlighting
allDifferences.forEach(diff => {
const row = tbody.insertRow();
row.insertCell(0).textContent = diff.kstData;
row.insertCell(1).textContent = diff.coordiData;
row.insertCell(2).textContent = diff.reason;
// Highlight row in yellow if it's also a duplicate
if (diff.isDuplicate) {
row.style.backgroundColor = '#fff3cd'; // Light yellow
row.title = 'This item also has duplicates in the dataset';
}
});
}
@ -630,8 +631,8 @@ def main():
create_templates_dir()
print("Starting web-based GUI...")
print("Open your browser and go to: http://localhost:8080")
app.run(debug=True, host='0.0.0.0', port=8080)
print("Open your browser and go to: http://localhost:8081")
app.run(debug=True, host='0.0.0.0', port=8081)
if __name__ == "__main__":
main()