From 1f88db5fb99a1a3e898349ea193f08d1cd934ff4 Mon Sep 17 00:00:00 2001 From: arthur Date: Wed, 20 Aug 2025 14:58:30 +0700 Subject: [PATCH] compare data logic --- data_comparator.py | 20 +- gui_app.py | 484 +++++++++++++++++----------------- templates/index.html | 53 ++-- test_duplicates.py | 64 +++++ test_final_duplicate_fix.py | 52 ++++ test_simplified_duplicates.py | 68 +++++ web_gui.py | 57 ++-- 7 files changed, 495 insertions(+), 303 deletions(-) create mode 100644 test_duplicates.py create mode 100644 test_final_duplicate_fix.py create mode 100644 test_simplified_duplicates.py diff --git a/data_comparator.py b/data_comparator.py index 06cfd23..5bc2440 100644 --- a/data_comparator.py +++ b/data_comparator.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from typing import Dict, List, Tuple, Any, Set +from typing import Dict, List, Tuple, Any from dataclasses import dataclass @dataclass @@ -48,6 +48,8 @@ class KSTCoordiComparator: coordi_items = set() kst_details = [] coordi_details = [] + kst_all_items = [] # Keep all items including duplicates + coordi_all_items = [] # Keep all items including duplicates for sheet_name, df in self.data.items(): columns = df.columns.tolist() @@ -96,6 +98,7 @@ class KSTCoordiComparator: if has_kst_data: item = ComparisonItem(kst_title, kst_episode, sheet_name, idx) kst_items.add(item) + kst_all_items.append(item) # Keep all items for duplicate detection kst_details.append({ 'title': kst_title, 'episode': kst_episode, @@ -122,6 +125,7 @@ class KSTCoordiComparator: if has_coordi_data: item = ComparisonItem(coordi_title, coordi_episode, sheet_name, idx) coordi_items.add(item) + coordi_all_items.append(item) # Keep all items for duplicate detection coordi_details.append({ 'title': coordi_title, 'episode': coordi_episode, @@ -135,12 +139,16 @@ class KSTCoordiComparator: self.kst_items = kst_items self.coordi_items = coordi_items + self.kst_all_items = kst_all_items # Store for duplicate detection + self.coordi_all_items = coordi_all_items # Store for duplicate detection return { 'kst_items': kst_items, 'coordi_items': coordi_items, 'kst_details': kst_details, - 'coordi_details': coordi_details + 'coordi_details': coordi_details, + 'kst_all_items': kst_all_items, + 'coordi_all_items': coordi_all_items } def categorize_mismatches(self) -> Dict[str, Any]: @@ -154,8 +162,8 @@ class KSTCoordiComparator: coordi_only_items = self.coordi_items - self.kst_items # Find duplicates within each dataset - kst_duplicates = self._find_duplicates_in_set(self.kst_items) - coordi_duplicates = self._find_duplicates_in_set(self.coordi_items) + kst_duplicates = self._find_duplicates_in_list(self.kst_all_items) + coordi_duplicates = self._find_duplicates_in_list(self.coordi_all_items) categorization = { 'matched_items': list(matched_items), @@ -190,10 +198,8 @@ class KSTCoordiComparator: return categorization - def _find_duplicates_in_set(self, items_set: Set[ComparisonItem]) -> List[ComparisonItem]: + def _find_duplicates_in_list(self, items_list: List[ComparisonItem]) -> List[ComparisonItem]: """Find duplicate items within a dataset""" - # Convert to list to check for duplicates - items_list = list(items_set) seen = set() duplicates = [] diff --git a/gui_app.py b/gui_app.py index b3dd952..402075d 100644 --- a/gui_app.py +++ b/gui_app.py @@ -1,319 +1,319 @@ -import tkinter as tk -from tkinter import ttk, filedialog, messagebox -import pandas as pd -from pathlib import Path -from data_comparator import KSTCoordiComparator +# import tkinter as tk +# from tkinter import ttk, filedialog, messagebox +# import pandas as pd +# from pathlib import Path +# from data_comparator import KSTCoordiComparator -class DataComparisonGUI: - def __init__(self, root): - self.root = root - self.root.title("KST vs Coordi Data Comparison Tool") - self.root.geometry("1200x800") +# class DataComparisonGUI: +# def __init__(self, root): +# self.root = root +# self.root.title("KST vs Coordi Data Comparison Tool") +# self.root.geometry("1200x800") - self.comparator = None - self.comparison_data = None +# self.comparator = None +# self.comparison_data = None - self.setup_ui() +# self.setup_ui() - def setup_ui(self): - # Main container - main_frame = ttk.Frame(self.root, padding="10") - main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) +# def setup_ui(self): +# # Main container +# main_frame = ttk.Frame(self.root, padding="10") +# main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) - # Configure grid weights - self.root.columnconfigure(0, weight=1) - self.root.rowconfigure(0, weight=1) - main_frame.columnconfigure(1, weight=1) - main_frame.rowconfigure(2, weight=1) +# # Configure grid weights +# self.root.columnconfigure(0, weight=1) +# self.root.rowconfigure(0, weight=1) +# main_frame.columnconfigure(1, weight=1) +# main_frame.rowconfigure(2, weight=1) - # Title - title_label = ttk.Label(main_frame, text="KST vs Coordi Data Comparison", - font=("Arial", 16, "bold")) - title_label.grid(row=0, column=0, columnspan=3, pady=(0, 20)) +# # Title +# title_label = ttk.Label(main_frame, text="KST vs Coordi Data Comparison", +# font=("Arial", 16, "bold")) +# title_label.grid(row=0, column=0, columnspan=3, pady=(0, 20)) - # File selection frame - file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10") - file_frame.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 10)) - file_frame.columnconfigure(1, weight=1) +# # File selection frame +# file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10") +# file_frame.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 10)) +# file_frame.columnconfigure(1, weight=1) - ttk.Label(file_frame, text="Excel File:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10)) +# ttk.Label(file_frame, text="Excel File:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10)) - self.file_path_var = tk.StringVar(value="data/sample-data.xlsx") - self.file_entry = ttk.Entry(file_frame, textvariable=self.file_path_var, width=50) - self.file_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10)) +# self.file_path_var = tk.StringVar(value="data/sample-data.xlsx") +# self.file_entry = ttk.Entry(file_frame, textvariable=self.file_path_var, width=50) +# self.file_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10)) - browse_btn = ttk.Button(file_frame, text="Browse", command=self.browse_file) - browse_btn.grid(row=0, column=2) +# browse_btn = ttk.Button(file_frame, text="Browse", command=self.browse_file) +# browse_btn.grid(row=0, column=2) - analyze_btn = ttk.Button(file_frame, text="Analyze Data", command=self.analyze_data) - analyze_btn.grid(row=0, column=3, padx=(10, 0)) +# analyze_btn = ttk.Button(file_frame, text="Analyze Data", command=self.analyze_data) +# analyze_btn.grid(row=0, column=3, padx=(10, 0)) - # Results notebook (tabs) - self.notebook = ttk.Notebook(main_frame) - self.notebook.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S)) +# # Results notebook (tabs) +# self.notebook = ttk.Notebook(main_frame) +# self.notebook.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S)) - # Create tabs - self.create_summary_tab() - self.create_matched_tab() - self.create_kst_only_tab() - self.create_coordi_only_tab() +# # Create tabs +# self.create_summary_tab() +# self.create_matched_tab() +# self.create_kst_only_tab() +# self.create_coordi_only_tab() - # Status bar - self.status_var = tk.StringVar(value="Ready - Select an Excel file and click 'Analyze Data'") - status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN) - status_bar.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(10, 0)) +# # Status bar +# self.status_var = tk.StringVar(value="Ready - Select an Excel file and click 'Analyze Data'") +# status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN) +# status_bar.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(10, 0)) - def create_summary_tab(self): - # Summary tab - summary_frame = ttk.Frame(self.notebook) - self.notebook.add(summary_frame, text="Summary") +# def create_summary_tab(self): +# # Summary tab +# summary_frame = ttk.Frame(self.notebook) +# self.notebook.add(summary_frame, text="Summary") - # Configure grid - summary_frame.columnconfigure(0, weight=1) - summary_frame.rowconfigure(1, weight=1) +# # Configure grid +# summary_frame.columnconfigure(0, weight=1) +# summary_frame.rowconfigure(1, weight=1) - # Summary text widget - summary_text_frame = ttk.LabelFrame(summary_frame, text="Comparison Summary", padding="10") - summary_text_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10) - summary_text_frame.columnconfigure(0, weight=1) - summary_text_frame.rowconfigure(0, weight=1) +# # Summary text widget +# summary_text_frame = ttk.LabelFrame(summary_frame, text="Comparison Summary", padding="10") +# summary_text_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10) +# summary_text_frame.columnconfigure(0, weight=1) +# summary_text_frame.rowconfigure(0, weight=1) - self.summary_text = tk.Text(summary_text_frame, wrap=tk.WORD, height=15) - summary_scrollbar = ttk.Scrollbar(summary_text_frame, orient=tk.VERTICAL, command=self.summary_text.yview) - self.summary_text.configure(yscrollcommand=summary_scrollbar.set) +# self.summary_text = tk.Text(summary_text_frame, wrap=tk.WORD, height=15) +# summary_scrollbar = ttk.Scrollbar(summary_text_frame, orient=tk.VERTICAL, command=self.summary_text.yview) +# self.summary_text.configure(yscrollcommand=summary_scrollbar.set) - self.summary_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) - summary_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) +# self.summary_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) +# summary_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) - # Reconciliation info - reconcile_frame = ttk.LabelFrame(summary_frame, text="Reconciliation Results", padding="10") - reconcile_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), padx=10, pady=(0, 10)) +# # Reconciliation info +# reconcile_frame = ttk.LabelFrame(summary_frame, text="Reconciliation Results", padding="10") +# reconcile_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), padx=10, pady=(0, 10)) - self.reconcile_text = tk.Text(reconcile_frame, wrap=tk.WORD, height=8) - reconcile_scrollbar = ttk.Scrollbar(reconcile_frame, orient=tk.VERTICAL, command=self.reconcile_text.yview) - self.reconcile_text.configure(yscrollcommand=reconcile_scrollbar.set) +# self.reconcile_text = tk.Text(reconcile_frame, wrap=tk.WORD, height=8) +# reconcile_scrollbar = ttk.Scrollbar(reconcile_frame, orient=tk.VERTICAL, command=self.reconcile_text.yview) +# self.reconcile_text.configure(yscrollcommand=reconcile_scrollbar.set) - self.reconcile_text.grid(row=0, column=0, sticky=(tk.W, tk.E)) - reconcile_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) +# self.reconcile_text.grid(row=0, column=0, sticky=(tk.W, tk.E)) +# reconcile_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) - reconcile_frame.columnconfigure(0, weight=1) +# reconcile_frame.columnconfigure(0, weight=1) - def create_matched_tab(self): - matched_frame = ttk.Frame(self.notebook) - self.notebook.add(matched_frame, text="Matched Items") +# def create_matched_tab(self): +# matched_frame = ttk.Frame(self.notebook) +# self.notebook.add(matched_frame, text="Matched Items") - self.create_data_table(matched_frame, "matched") +# self.create_data_table(matched_frame, "matched") - def create_kst_only_tab(self): - kst_frame = ttk.Frame(self.notebook) - self.notebook.add(kst_frame, text="KST Only") +# def create_kst_only_tab(self): +# kst_frame = ttk.Frame(self.notebook) +# self.notebook.add(kst_frame, text="KST Only") - self.create_data_table(kst_frame, "kst_only") +# self.create_data_table(kst_frame, "kst_only") - def create_coordi_only_tab(self): - coordi_frame = ttk.Frame(self.notebook) - self.notebook.add(coordi_frame, text="Coordi Only") +# def create_coordi_only_tab(self): +# coordi_frame = ttk.Frame(self.notebook) +# self.notebook.add(coordi_frame, text="Coordi Only") - self.create_data_table(coordi_frame, "coordi_only") +# self.create_data_table(coordi_frame, "coordi_only") - def create_data_table(self, parent, table_type): - # Configure grid - parent.columnconfigure(0, weight=1) - parent.rowconfigure(1, weight=1) +# def create_data_table(self, parent, table_type): +# # Configure grid +# parent.columnconfigure(0, weight=1) +# parent.rowconfigure(1, weight=1) - # Info label - info_frame = ttk.Frame(parent) - info_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=10, pady=10) - info_frame.columnconfigure(1, weight=1) +# # Info label +# info_frame = ttk.Frame(parent) +# info_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=10, pady=10) +# info_frame.columnconfigure(1, weight=1) - count_label = ttk.Label(info_frame, text="Count:") - count_label.grid(row=0, column=0, padx=(0, 10)) +# count_label = ttk.Label(info_frame, text="Count:") +# count_label.grid(row=0, column=0, padx=(0, 10)) - count_var = tk.StringVar(value="0") - setattr(self, f"{table_type}_count_var", count_var) - count_display = ttk.Label(info_frame, textvariable=count_var, font=("Arial", 10, "bold")) - count_display.grid(row=0, column=1, sticky=tk.W) +# count_var = tk.StringVar(value="0") +# setattr(self, f"{table_type}_count_var", count_var) +# count_display = ttk.Label(info_frame, textvariable=count_var, font=("Arial", 10, "bold")) +# count_display.grid(row=0, column=1, sticky=tk.W) - # Table frame - table_frame = ttk.Frame(parent) - table_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=(0, 10)) - table_frame.columnconfigure(0, weight=1) - table_frame.rowconfigure(0, weight=1) +# # Table frame +# table_frame = ttk.Frame(parent) +# table_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=(0, 10)) +# table_frame.columnconfigure(0, weight=1) +# table_frame.rowconfigure(0, weight=1) - # Create treeview - columns = ("Title", "Episode", "Sheet", "Row", "Reason") - tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20) +# # Create treeview +# columns = ("Title", "Episode", "Sheet", "Row", "Reason") +# tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20) - # Configure columns - tree.heading("Title", text="Title") - tree.heading("Episode", text="Episode") - tree.heading("Sheet", text="Sheet") - tree.heading("Row", text="Row") - tree.heading("Reason", text="Reason") +# # Configure columns +# tree.heading("Title", text="Title") +# tree.heading("Episode", text="Episode") +# tree.heading("Sheet", text="Sheet") +# tree.heading("Row", text="Row") +# tree.heading("Reason", text="Reason") - tree.column("Title", width=300) - tree.column("Episode", width=100) - tree.column("Sheet", width=120) - tree.column("Row", width=80) - tree.column("Reason", width=300) +# tree.column("Title", width=300) +# tree.column("Episode", width=100) +# tree.column("Sheet", width=120) +# tree.column("Row", width=80) +# tree.column("Reason", width=300) - # Scrollbars - v_scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=tree.yview) - h_scrollbar = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=tree.xview) - tree.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set) +# # Scrollbars +# v_scrollbar = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=tree.yview) +# h_scrollbar = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=tree.xview) +# tree.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set) - tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) - v_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) - h_scrollbar.grid(row=1, column=0, sticky=(tk.W, tk.E)) +# tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) +# v_scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S)) +# h_scrollbar.grid(row=1, column=0, sticky=(tk.W, tk.E)) - # Store tree widget - setattr(self, f"{table_type}_tree", tree) +# # Store tree widget +# setattr(self, f"{table_type}_tree", tree) - def browse_file(self): - file_path = filedialog.askopenfilename( - title="Select Excel File", - filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")] - ) - if file_path: - self.file_path_var.set(file_path) +# def browse_file(self): +# file_path = filedialog.askopenfilename( +# title="Select Excel File", +# filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")] +# ) +# if file_path: +# self.file_path_var.set(file_path) - def analyze_data(self): - file_path = self.file_path_var.get().strip() +# def analyze_data(self): +# file_path = self.file_path_var.get().strip() - if not file_path: - messagebox.showerror("Error", "Please select an Excel file") - return +# if not file_path: +# messagebox.showerror("Error", "Please select an Excel file") +# return - if not Path(file_path).exists(): - messagebox.showerror("Error", f"File not found: {file_path}") - return +# if not Path(file_path).exists(): +# messagebox.showerror("Error", f"File not found: {file_path}") +# return - try: - self.status_var.set("Analyzing data...") - self.root.update() +# try: +# self.status_var.set("Analyzing data...") +# self.root.update() - # Create comparator and analyze - self.comparator = KSTCoordiComparator(file_path) - if not self.comparator.load_data(): - messagebox.showerror("Error", "Failed to load Excel data") - return +# # Create comparator and analyze +# self.comparator = KSTCoordiComparator(file_path) +# if not self.comparator.load_data(): +# messagebox.showerror("Error", "Failed to load Excel data") +# return - # Get comparison results - self.comparison_data = self.comparator.get_comparison_summary() +# # Get comparison results +# self.comparison_data = self.comparator.get_comparison_summary() - # Update GUI - self.update_summary() - self.update_data_tables() +# # Update GUI +# self.update_summary() +# self.update_data_tables() - self.status_var.set("Analysis complete!") +# self.status_var.set("Analysis complete!") - except Exception as e: - messagebox.showerror("Error", f"Analysis failed: {str(e)}") - self.status_var.set("Analysis failed") +# except Exception as e: +# messagebox.showerror("Error", f"Analysis failed: {str(e)}") +# self.status_var.set("Analysis failed") - def update_summary(self): - if not self.comparison_data: - return +# def update_summary(self): +# if not self.comparison_data: +# return - # Clear previous content - self.summary_text.delete(1.0, tk.END) - self.reconcile_text.delete(1.0, tk.END) +# # Clear previous content +# self.summary_text.delete(1.0, tk.END) +# self.reconcile_text.delete(1.0, tk.END) - data = self.comparison_data +# data = self.comparison_data - # Summary text - summary = f"""COMPARISON SUMMARY -{'='*50} +# # Summary text +# summary = f"""COMPARISON SUMMARY +# {'='*50} -Original Counts: - KST Total: {data['original_counts']['kst_total']:,} - Coordi Total: {data['original_counts']['coordi_total']:,} +# Original Counts: +# KST Total: {data['original_counts']['kst_total']:,} +# Coordi Total: {data['original_counts']['coordi_total']:,} -Matched Items: {data['matched_items_count']:,} +# Matched Items: {data['matched_items_count']:,} -Mismatches: - KST Only: {data['mismatches']['kst_only_count']:,} - Coordi Only: {data['mismatches']['coordi_only_count']:,} - KST Duplicates: {data['mismatches']['kst_duplicates_count']:,} - Coordi Duplicates: {data['mismatches']['coordi_duplicates_count']:,} +# Mismatches: +# KST Only: {data['mismatches']['kst_only_count']:,} +# Coordi Only: {data['mismatches']['coordi_only_count']:,} +# KST Duplicates: {data['mismatches']['kst_duplicates_count']:,} +# Coordi Duplicates: {data['mismatches']['coordi_duplicates_count']:,} -Total Mismatches: {data['mismatches']['kst_only_count'] + data['mismatches']['coordi_only_count'] + data['mismatches']['kst_duplicates_count'] + data['mismatches']['coordi_duplicates_count']:,} -""" +# Total Mismatches: {data['mismatches']['kst_only_count'] + data['mismatches']['coordi_only_count'] + data['mismatches']['kst_duplicates_count'] + data['mismatches']['coordi_duplicates_count']:,} +# """ - self.summary_text.insert(tk.END, summary) +# self.summary_text.insert(tk.END, summary) - # Reconciliation text - reconcile = data['reconciliation'] - reconcile_info = f"""RECONCILIATION RESULTS -{'='*40} +# # Reconciliation text +# reconcile = data['reconciliation'] +# reconcile_info = f"""RECONCILIATION RESULTS +# {'='*40} -After excluding mismatches: - KST Count: {reconcile['reconciled_kst_count']:,} - Coordi Count: {reconcile['reconciled_coordi_count']:,} - Counts Match: {'✅ YES' if reconcile['counts_match_after_reconciliation'] else '❌ NO'} +# After excluding mismatches: +# KST Count: {reconcile['reconciled_kst_count']:,} +# Coordi Count: {reconcile['reconciled_coordi_count']:,} +# Counts Match: {'✅ YES' if reconcile['counts_match_after_reconciliation'] else '❌ NO'} -Items to exclude: - From KST: {reconcile['items_to_exclude_from_kst']:,} - From Coordi: {reconcile['items_to_exclude_from_coordi']:,} +# Items to exclude: +# From KST: {reconcile['items_to_exclude_from_kst']:,} +# From Coordi: {reconcile['items_to_exclude_from_coordi']:,} -Final Result: Both datasets will have {reconcile['reconciled_kst_count']:,} matching items after reconciliation. -""" +# Final Result: Both datasets will have {reconcile['reconciled_kst_count']:,} matching items after reconciliation. +# """ - self.reconcile_text.insert(tk.END, reconcile_info) +# self.reconcile_text.insert(tk.END, reconcile_info) - def update_data_tables(self): - if not self.comparison_data: - return +# def update_data_tables(self): +# if not self.comparison_data: +# return - mismatches = self.comparison_data['mismatch_details'] +# mismatches = self.comparison_data['mismatch_details'] - # Update matched items (create from intersection) - matched_count = self.comparison_data['matched_items_count'] - self.matched_count_var.set(f"{matched_count:,}") +# # Update matched items (create from intersection) +# matched_count = self.comparison_data['matched_items_count'] +# self.matched_count_var.set(f"{matched_count:,}") - # Clear matched tree - for item in self.matched_tree.get_children(): - self.matched_tree.delete(item) +# # Clear matched tree +# for item in self.matched_tree.get_children(): +# self.matched_tree.delete(item) - # Add matched items (we'll show the first few as examples) - if self.comparator: - categorization = self.comparator.categorize_mismatches() - matched_items = categorization['matched_items'] - for i, item in enumerate(list(matched_items)[:100]): # Show first 100 - self.matched_tree.insert("", tk.END, values=( - item.title, item.episode, item.source_sheet, item.row_index + 1, "Perfect match" - )) +# # Add matched items (we'll show the first few as examples) +# if self.comparator: +# categorization = self.comparator.categorize_mismatches() +# matched_items = categorization['matched_items'] +# for i, item in enumerate(list(matched_items)[:100]): # Show first 100 +# self.matched_tree.insert("", tk.END, values=( +# item.title, item.episode, item.source_sheet, item.row_index + 1, "Perfect match" +# )) - # Update KST only - kst_only = mismatches['kst_only'] - self.kst_only_count_var.set(f"{len(kst_only):,}") +# # Update KST only +# kst_only = mismatches['kst_only'] +# self.kst_only_count_var.set(f"{len(kst_only):,}") - for item in self.kst_only_tree.get_children(): - self.kst_only_tree.delete(item) +# for item in self.kst_only_tree.get_children(): +# self.kst_only_tree.delete(item) - for mismatch in kst_only: - self.kst_only_tree.insert("", tk.END, values=( - mismatch['title'], mismatch['episode'], mismatch['sheet'], - mismatch['row_index'] + 1, mismatch['reason'] - )) +# for mismatch in kst_only: +# self.kst_only_tree.insert("", tk.END, values=( +# mismatch['title'], mismatch['episode'], mismatch['sheet'], +# mismatch['row_index'] + 1, mismatch['reason'] +# )) - # Update Coordi only - coordi_only = mismatches['coordi_only'] - self.coordi_only_count_var.set(f"{len(coordi_only):,}") +# # Update Coordi only +# coordi_only = mismatches['coordi_only'] +# self.coordi_only_count_var.set(f"{len(coordi_only):,}") - for item in self.coordi_only_tree.get_children(): - self.coordi_only_tree.delete(item) +# for item in self.coordi_only_tree.get_children(): +# self.coordi_only_tree.delete(item) - for mismatch in coordi_only: - self.coordi_only_tree.insert("", tk.END, values=( - mismatch['title'], mismatch['episode'], mismatch['sheet'], - mismatch['row_index'] + 1, mismatch['reason'] - )) +# for mismatch in coordi_only: +# self.coordi_only_tree.insert("", tk.END, values=( +# mismatch['title'], mismatch['episode'], mismatch['sheet'], +# mismatch['row_index'] + 1, mismatch['reason'] +# )) -def main(): - root = tk.Tk() - app = DataComparisonGUI(root) - root.mainloop() +# def main(): +# root = tk.Tk() +# app = DataComparisonGUI(root) +# root.mainloop() -if __name__ == "__main__": - main() \ No newline at end of file +# if __name__ == "__main__": +# main() \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index f916ec0..4b99942 100644 --- a/templates/index.html +++ b/templates/index.html @@ -404,6 +404,7 @@ // Update count displays document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString(); + // Count all different items including duplicates const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count + results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count; document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString(); @@ -443,50 +444,44 @@ const tbody = document.getElementById('different-table'); tbody.innerHTML = ''; - // Combine all mismatches into one array for sorting + // Create sets of duplicate items for highlighting + const kstDuplicateKeys = new Set(); + const coordiDuplicateKeys = new Set(); + + mismatchDetails.kst_duplicates.forEach(item => { + kstDuplicateKeys.add(`${item.title}_${item.episode}`); + }); + + mismatchDetails.coordi_duplicates.forEach(item => { + coordiDuplicateKeys.add(`${item.title}_${item.episode}`); + }); + + // Combine only KST-only and Coordi-only items (like before) const allDifferences = []; // Add KST-only items mismatchDetails.kst_only.forEach(item => { + const key = `${item.title}_${item.episode}`; allDifferences.push({ kstData: `${item.title} - Episode ${item.episode}`, coordiData: '', reason: 'Only appears in KST', sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 + sortEpisode: parseFloat(item.episode) || 0, + isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate }); }); // Add Coordi-only items mismatchDetails.coordi_only.forEach(item => { + const key = `${item.title}_${item.episode}`; allDifferences.push({ kstData: '', coordiData: `${item.title} - Episode ${item.episode}`, reason: 'Only appears in Coordi', sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 - }); - }); - - // Add KST duplicates - mismatchDetails.kst_duplicates.forEach(item => { - allDifferences.push({ - kstData: `${item.title} - Episode ${item.episode}`, - coordiData: '', - reason: 'Duplicate in KST', - sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 - }); - }); - - // Add Coordi duplicates - mismatchDetails.coordi_duplicates.forEach(item => { - allDifferences.push({ - kstData: '', - coordiData: `${item.title} - Episode ${item.episode}`, - reason: 'Duplicate in Coordi', - sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 + sortEpisode: parseFloat(item.episode) || 0, + isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate }); }); @@ -497,12 +492,18 @@ return a.sortEpisode - b.sortEpisode; }); - // Populate table + // Populate table with highlighting allDifferences.forEach(diff => { const row = tbody.insertRow(); row.insertCell(0).textContent = diff.kstData; row.insertCell(1).textContent = diff.coordiData; row.insertCell(2).textContent = diff.reason; + + // Highlight row in yellow if it's also a duplicate + if (diff.isDuplicate) { + row.style.backgroundColor = '#fff3cd'; // Light yellow + row.title = 'This item also has duplicates in the dataset'; + } }); } diff --git a/test_duplicates.py b/test_duplicates.py new file mode 100644 index 0000000..d239999 --- /dev/null +++ b/test_duplicates.py @@ -0,0 +1,64 @@ +from data_comparator import KSTCoordiComparator + +def test_duplicate_detection(): + comparator = KSTCoordiComparator('data/sample-data.xlsx') + if comparator.load_data(): + print("=== DUPLICATE DETECTION TEST ===") + + # Get the data extraction results + data = comparator.extract_kst_coordi_items() + + print(f"Total KST items (unique): {len(data['kst_items'])}") + print(f"Total KST items (all): {len(data['kst_all_items'])}") + print(f"Total Coordi items (unique): {len(data['coordi_items'])}") + print(f"Total Coordi items (all): {len(data['coordi_all_items'])}") + + # Check for duplicates + categorization = comparator.categorize_mismatches() + + print(f"\nKST duplicates found: {len(categorization['kst_duplicates'])}") + print(f"Coordi duplicates found: {len(categorization['coordi_duplicates'])}") + + # Show sample duplicates + if categorization['kst_duplicates']: + print("\nSample KST duplicates:") + for i, dup in enumerate(categorization['kst_duplicates'][:3]): + print(f" {i+1}. {dup.title} - Episode {dup.episode} (Sheet: {dup.source_sheet}, Row: {dup.row_index + 1})") + + if categorization['coordi_duplicates']: + print("\nSample Coordi duplicates:") + for i, dup in enumerate(categorization['coordi_duplicates'][:3]): + print(f" {i+1}. {dup.title} - Episode {dup.episode} (Sheet: {dup.source_sheet}, Row: {dup.row_index + 1})") + + # Check for the specific example: 백라이트 - Episode 53-1x(휴재) + mismatch_details = comparator.generate_mismatch_details() + + print(f"\nLooking for '백라이트 - Episode 53-1x(휴재)':") + + # Check in KST-only + backlight_kst_only = [item for item in mismatch_details['kst_only'] + if '백라이트' in item['title'] and '53-1x' in item['episode']] + + # Check in KST duplicates + backlight_kst_dup = [item for item in mismatch_details['kst_duplicates'] + if '백라이트' in item['title'] and '53-1x' in item['episode']] + + print(f" Found in KST-only: {len(backlight_kst_only)}") + print(f" Found in KST duplicates: {len(backlight_kst_dup)}") + + if backlight_kst_only: + print(f" KST-only details: {backlight_kst_only[0]}") + if backlight_kst_dup: + print(f" KST duplicate details: {backlight_kst_dup[0]}") + + # Test the web interface logic + print(f"\n=== Testing Web Interface Logic ===") + summary = comparator.get_comparison_summary() + print(f"Web interface will show:") + print(f" Total different items: {summary['mismatches']['kst_only_count'] + summary['mismatches']['coordi_only_count'] + summary['mismatches']['kst_duplicates_count'] + summary['mismatches']['coordi_duplicates_count']}") + + print("\n✓ Duplicate detection test complete!") + print("✓ Check the web interface at http://localhost:8080 to see combined reasons") + +if __name__ == "__main__": + test_duplicate_detection() \ No newline at end of file diff --git a/test_final_duplicate_fix.py b/test_final_duplicate_fix.py new file mode 100644 index 0000000..52d02f9 --- /dev/null +++ b/test_final_duplicate_fix.py @@ -0,0 +1,52 @@ +import requests + +def test_final_duplicate_fix(): + print("=== FINAL DUPLICATE FIX TEST ===") + + try: + # Test the analyze endpoint + response = requests.post('http://localhost:8081/analyze', + json={'file_path': 'data/sample-data.xlsx'}, + timeout=30) + + if response.status_code == 200: + data = response.json() + if data.get('success'): + results = data['results'] + + print("✓ Analysis successful!") + print(f" Matched items: {results['matched_items_count']}") + print(f" KST only: {results['mismatches']['kst_only_count']}") + print(f" Coordi only: {results['mismatches']['coordi_only_count']}") + print(f" KST duplicates: {results['mismatches']['kst_duplicates_count']}") + print(f" Coordi duplicates: {results['mismatches']['coordi_duplicates_count']}") + + total_different = (results['mismatches']['kst_only_count'] + + results['mismatches']['coordi_only_count'] + + results['mismatches']['kst_duplicates_count'] + + results['mismatches']['coordi_duplicates_count']) + print(f" Total different items: {total_different}") + + # Check for the specific example + kst_duplicates = results['mismatch_details']['kst_duplicates'] + backlight_duplicates = [item for item in kst_duplicates + if '백라이트' in item['title'] and '53-1x' in item['episode']] + + if backlight_duplicates: + print(f"\n✓ Found 백라이트 duplicates: {len(backlight_duplicates)}") + print(f" Example: {backlight_duplicates[0]['title']} - Episode {backlight_duplicates[0]['episode']}") + + print(f"\n✓ Web interface ready at http://localhost:8081") + print("✓ The 'Different' tab will now show combined reasons like:") + print(" 백라이트 - Episode 53-1x(휴재) | (empty) | Only appears in KST + Duplicate in KST") + + else: + print(f"✗ Analysis failed: {data.get('error')}") + else: + print(f"✗ Request failed: {response.status_code}") + + except requests.exceptions.RequestException as e: + print(f"✗ Request failed: {e}") + +if __name__ == "__main__": + test_final_duplicate_fix() \ No newline at end of file diff --git a/test_simplified_duplicates.py b/test_simplified_duplicates.py new file mode 100644 index 0000000..08aa534 --- /dev/null +++ b/test_simplified_duplicates.py @@ -0,0 +1,68 @@ +import requests + +def test_simplified_duplicates(): + print("=== SIMPLIFIED DUPLICATE DISPLAY TEST ===") + + try: + # Test the analyze endpoint + response = requests.post('http://localhost:8081/analyze', + json={'file_path': 'data/sample-data.xlsx'}, + timeout=30) + + if response.status_code == 200: + data = response.json() + if data.get('success'): + results = data['results'] + + print("✓ Analysis successful!") + print(f" Matched items: {results['matched_items_count']}") + print(f" KST only: {results['mismatches']['kst_only_count']}") + print(f" Coordi only: {results['mismatches']['coordi_only_count']}") + print(f" KST duplicates: {results['mismatches']['kst_duplicates_count']}") + print(f" Coordi duplicates: {results['mismatches']['coordi_duplicates_count']}") + + # What the count will show + total_count = (results['mismatches']['kst_only_count'] + + results['mismatches']['coordi_only_count'] + + results['mismatches']['kst_duplicates_count'] + + results['mismatches']['coordi_duplicates_count']) + + # What the table will show + table_rows = results['mismatches']['kst_only_count'] + results['mismatches']['coordi_only_count'] + + print(f"\n📊 DISPLAY LOGIC:") + print(f" Count badge shows: {total_count} items (all different items)") + print(f" Table shows: {table_rows} rows (only KST-only + Coordi-only)") + print(f" Yellow highlights: Items that are also duplicates") + + # Check for 백라이트 example + kst_only = results['mismatch_details']['kst_only'] + kst_duplicates = results['mismatch_details']['kst_duplicates'] + + backlight_kst_only = [item for item in kst_only + if '백라이트' in item['title'] and '53-1x' in item['episode']] + backlight_kst_dup = [item for item in kst_duplicates + if '백라이트' in item['title'] and '53-1x' in item['episode']] + + if backlight_kst_only and backlight_kst_dup: + print(f"\n✓ 백라이트 example works:") + print(f" - Appears in table (KST-only): YES") + print(f" - Will be highlighted yellow: YES (also duplicate)") + print(f" - Contributes to count: 2 items (1 KST-only + 1 duplicate)") + + print(f"\n✓ Web interface ready at http://localhost:8081") + print("✓ Check the 'Different' tab:") + print(" - Count shows all different items") + print(" - Table shows only KST-only + Coordi-only") + print(" - Yellow rows = items that also have duplicates") + + else: + print(f"✗ Analysis failed: {data.get('error')}") + else: + print(f"✗ Request failed: {response.status_code}") + + except requests.exceptions.RequestException as e: + print(f"✗ Request failed: {e}") + +if __name__ == "__main__": + test_simplified_duplicates() \ No newline at end of file diff --git a/web_gui.py b/web_gui.py index eabea27..9783fa4 100644 --- a/web_gui.py +++ b/web_gui.py @@ -512,6 +512,7 @@ def create_templates_dir(): // Update count displays document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString(); + // Count all different items including duplicates const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count + results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count; document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString(); @@ -551,50 +552,44 @@ def create_templates_dir(): const tbody = document.getElementById('different-table'); tbody.innerHTML = ''; - // Combine all mismatches into one array for sorting + // Create sets of duplicate items for highlighting + const kstDuplicateKeys = new Set(); + const coordiDuplicateKeys = new Set(); + + mismatchDetails.kst_duplicates.forEach(item => { + kstDuplicateKeys.add(`${item.title}_${item.episode}`); + }); + + mismatchDetails.coordi_duplicates.forEach(item => { + coordiDuplicateKeys.add(`${item.title}_${item.episode}`); + }); + + // Combine only KST-only and Coordi-only items (like before) const allDifferences = []; // Add KST-only items mismatchDetails.kst_only.forEach(item => { + const key = `${item.title}_${item.episode}`; allDifferences.push({ kstData: `${item.title} - Episode ${item.episode}`, coordiData: '', reason: 'Only appears in KST', sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 + sortEpisode: parseFloat(item.episode) || 0, + isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate }); }); // Add Coordi-only items mismatchDetails.coordi_only.forEach(item => { + const key = `${item.title}_${item.episode}`; allDifferences.push({ kstData: '', coordiData: `${item.title} - Episode ${item.episode}`, reason: 'Only appears in Coordi', sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 - }); - }); - - // Add KST duplicates - mismatchDetails.kst_duplicates.forEach(item => { - allDifferences.push({ - kstData: `${item.title} - Episode ${item.episode}`, - coordiData: '', - reason: 'Duplicate in KST', - sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 - }); - }); - - // Add Coordi duplicates - mismatchDetails.coordi_duplicates.forEach(item => { - allDifferences.push({ - kstData: '', - coordiData: `${item.title} - Episode ${item.episode}`, - reason: 'Duplicate in Coordi', - sortTitle: item.title, - sortEpisode: parseFloat(item.episode) || 0 + sortEpisode: parseFloat(item.episode) || 0, + isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate }); }); @@ -605,12 +600,18 @@ def create_templates_dir(): return a.sortEpisode - b.sortEpisode; }); - // Populate table + // Populate table with highlighting allDifferences.forEach(diff => { const row = tbody.insertRow(); row.insertCell(0).textContent = diff.kstData; row.insertCell(1).textContent = diff.coordiData; row.insertCell(2).textContent = diff.reason; + + // Highlight row in yellow if it's also a duplicate + if (diff.isDuplicate) { + row.style.backgroundColor = '#fff3cd'; // Light yellow + row.title = 'This item also has duplicates in the dataset'; + } }); } @@ -630,8 +631,8 @@ def main(): create_templates_dir() print("Starting web-based GUI...") - print("Open your browser and go to: http://localhost:8080") - app.run(debug=True, host='0.0.0.0', port=8080) + print("Open your browser and go to: http://localhost:8081") + app.run(debug=True, host='0.0.0.0', port=8081) if __name__ == "__main__": main() \ No newline at end of file