fix logic

parent 1f88db5fb9
commit ed3655d1c9
@@ -212,6 +212,59 @@ class KSTCoordiComparator:
         return duplicates
 
+    def _find_sheet_specific_mixed_duplicates(self, sheet_filter: str) -> List[Dict]:
+        """Find mixed duplicates within a specific sheet only"""
+        if not sheet_filter:
+            return []
+
+        mixed_duplicates = []
+
+        # Extract items specific to this sheet
+        extract_results = self.extract_kst_coordi_items()
+        kst_sheet_items = [item for item in extract_results['kst_all_items'] if item.source_sheet == sheet_filter]
+        coordi_sheet_items = [item for item in extract_results['coordi_all_items'] if item.source_sheet == sheet_filter]
+
+        # Find duplicates within this sheet
+        kst_sheet_duplicates = self._find_duplicates_in_list(kst_sheet_items)
+        coordi_sheet_duplicates = self._find_duplicates_in_list(coordi_sheet_items)
+
+        # Create sets for items that exist in both KST and Coordi within this sheet
+        kst_sheet_set = {(item.title, item.episode) for item in kst_sheet_items}
+        coordi_sheet_set = {(item.title, item.episode) for item in coordi_sheet_items}
+        matched_in_sheet = kst_sheet_set.intersection(coordi_sheet_set)
+
+        # Create sets of duplicate keys within this sheet
+        kst_duplicate_keys = {(item.title, item.episode) for item in kst_sheet_duplicates}
+        coordi_duplicate_keys = {(item.title, item.episode) for item in coordi_sheet_duplicates}
+
+        # Find matched items that also have duplicates within the same sheet
+        for title, episode in matched_in_sheet:
+            # Check if this matched item has duplicates in KST within this sheet
+            if (title, episode) in kst_duplicate_keys:
+                mixed_duplicates.append({
+                    'title': title,
+                    'episode': episode,
+                    'sheet': sheet_filter,
+                    'row_index': None,  # Could get from items if needed
+                    'reason': f'Item exists in both datasets but has duplicates in KST within {sheet_filter}',
+                    'mismatch_type': 'MIXED_DUPLICATE_KST',
+                    'duplicate_side': 'KST'
+                })
+
+            # Check if this matched item has duplicates in Coordi within this sheet
+            if (title, episode) in coordi_duplicate_keys:
+                mixed_duplicates.append({
+                    'title': title,
+                    'episode': episode,
+                    'sheet': sheet_filter,
+                    'row_index': None,  # Could get from items if needed
+                    'reason': f'Item exists in both datasets but has duplicates in Coordi within {sheet_filter}',
+                    'mismatch_type': 'MIXED_DUPLICATE_COORDI',
+                    'duplicate_side': 'COORDI'
+                })
+
+        return mixed_duplicates
+
     def generate_mismatch_details(self) -> Dict[str, List[Dict]]:
         """Generate detailed information about each type of mismatch with reasons"""
         categorization = self.categorize_mismatches()
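For context, a minimal usage sketch of the new helper (not part of the commit itself); it assumes data/sample-data.xlsx loads successfully and has at least one sheet:

    # Illustrative only: exercise the new per-sheet mixed-duplicate detection.
    from data_comparator import KSTCoordiComparator

    comparator = KSTCoordiComparator('data/sample-data.xlsx')
    if comparator.load_data():
        first_sheet = list(comparator.data.keys())[0]
        mixed = comparator._find_sheet_specific_mixed_duplicates(first_sheet)
        # Each entry is a dict shaped like:
        # {'title': ..., 'episode': ..., 'sheet': first_sheet, 'row_index': None,
        #  'reason': '...', 'mismatch_type': 'MIXED_DUPLICATE_KST', 'duplicate_side': 'KST'}
        print(f"{len(mixed)} mixed duplicates in '{first_sheet}'")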
@@ -220,7 +273,8 @@ class KSTCoordiComparator:
             'kst_only': [],
             'coordi_only': [],
             'kst_duplicates': [],
-            'coordi_duplicates': []
+            'coordi_duplicates': [],
+            'mixed_duplicates': []
         }
 
         # KST-only items
@@ -267,38 +321,40 @@ class KSTCoordiComparator:
                 'mismatch_type': 'COORDI_DUPLICATE'
             })
 
+        # Mixed duplicates will be calculated per sheet in get_comparison_summary
+        mismatch_details['mixed_duplicates'] = []
+
         return mismatch_details
 
     def get_comparison_summary(self, sheet_filter: str = None) -> Dict[str, Any]:
-        """Get a comprehensive summary of the comparison, optionally filtered by sheet"""
+        """Get a comprehensive summary of the comparison, filtered by a specific sheet"""
+        # Get sheet names for filtering options
+        sheet_names = list(self.data.keys()) if self.data else []
+
+        # If no sheet filter provided, default to first sheet
+        if not sheet_filter:
+            sheet_filter = sheet_names[0] if sheet_names else None
+
+        if not sheet_filter:
+            raise ValueError("No sheets available or sheet filter not specified")
+
         categorization = self.categorize_mismatches()
         mismatch_details = self.generate_mismatch_details()
         grouped_data = self.group_by_title()
 
-        # Get sheet names for filtering options
-        sheet_names = list(self.data.keys()) if self.data else []
-
-        # Apply sheet filtering if specified
-        if sheet_filter and sheet_filter != 'All Sheets':
-            mismatch_details = self.filter_by_sheet(mismatch_details, sheet_filter)
-            grouped_data = self.filter_grouped_data_by_sheet(grouped_data, sheet_filter)
-
-            # Recalculate counts for filtered data
-            filtered_counts = self.calculate_filtered_counts(mismatch_details)
-        else:
-            filtered_counts = {
-                'kst_total': categorization['counts']['total_kst'],
-                'coordi_total': categorization['counts']['total_coordi'],
-                'matched': categorization['counts']['matched'],
-                'kst_only_count': categorization['counts']['kst_only'],
-                'coordi_only_count': categorization['counts']['coordi_only'],
-                'kst_duplicates_count': categorization['counts']['kst_duplicates_count'],
-                'coordi_duplicates_count': categorization['counts']['coordi_duplicates_count']
-            }
+        # Always apply sheet filtering (no more "All Sheets" option)
+        mismatch_details = self.filter_by_sheet(mismatch_details, sheet_filter)
+        grouped_data = self.filter_grouped_data_by_sheet(grouped_data, sheet_filter)
+
+        # Calculate mixed duplicates specific to this sheet
+        mismatch_details['mixed_duplicates'] = self._find_sheet_specific_mixed_duplicates(sheet_filter)
+
+        # Recalculate counts for filtered data
+        filtered_counts = self.calculate_filtered_counts(mismatch_details)
 
         summary = {
             'sheet_names': sheet_names,
-            'current_sheet_filter': sheet_filter or 'All Sheets',
+            'current_sheet_filter': sheet_filter,
             'original_counts': {
                 'kst_total': filtered_counts['kst_total'],
                 'coordi_total': filtered_counts['coordi_total']
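A short sketch of the changed call contract (illustrative only; 'Sheet2' is a hypothetical sheet name, not one confirmed by this commit):

    # get_comparison_summary now always resolves to a concrete sheet.
    summary = comparator.get_comparison_summary()           # defaults to the first sheet
    summary = comparator.get_comparison_summary('Sheet2')   # explicit sheet name
    print(summary['current_sheet_filter'])                  # never 'All Sheets' any more
    # With no sheets loaded, the call raises ValueError instead of
    # silently falling back to unfiltered totals.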
@@ -372,7 +428,8 @@ class KSTCoordiComparator:
                 'kst_only_count': len(filtered_mismatch_details['kst_only']),
                 'coordi_only_count': len(filtered_mismatch_details['coordi_only']),
                 'kst_duplicates_count': len(filtered_mismatch_details['kst_duplicates']),
-                'coordi_duplicates_count': len(filtered_mismatch_details['coordi_duplicates'])
+                'coordi_duplicates_count': len(filtered_mismatch_details['coordi_duplicates']),
+                'mixed_duplicates_count': len(filtered_mismatch_details.get('mixed_duplicates', []))
             }
 
     def group_by_title(self) -> Dict[str, Any]:
@@ -183,7 +183,7 @@
         <div class="file-input" style="margin-top: 10px;">
             <label for="sheetFilter">Sheet Filter:</label>
             <select id="sheetFilter" onchange="filterBySheet()" disabled>
-                <option value="All Sheets">All Sheets</option>
+                <!-- Options will be populated dynamically -->
             </select>
         </div>
         <div id="status"></div>
@@ -275,7 +275,7 @@
             },
             body: JSON.stringify({
                 file_path: filePath,
-                sheet_filter: sheetFilter === 'All Sheets' ? null : sheetFilter
+                sheet_filter: sheetFilter
             })
         })
         .then(response => response.json())
@@ -300,13 +300,14 @@
 
         function updateSheetFilter(sheetNames, currentFilter) {
             const select = document.getElementById('sheetFilter');
-            select.innerHTML = '<option value="All Sheets">All Sheets</option>';
+            select.innerHTML = '';
 
-            sheetNames.forEach(sheetName => {
+            sheetNames.forEach((sheetName, index) => {
                 const option = document.createElement('option');
                 option.value = sheetName;
                 option.textContent = sheetName;
-                if (sheetName === currentFilter) {
+                // Select the first sheet by default, or the current filter if specified
+                if (sheetName === currentFilter || (!currentFilter && index === 0)) {
                     option.selected = true;
                 }
                 select.appendChild(option);
@@ -372,7 +373,7 @@
             },
             body: JSON.stringify({
                 file_path: data.file_path,
-                sheet_filter: sheetFilter === 'All Sheets' ? null : sheetFilter
+                sheet_filter: sheetFilter
             })
         });
     } else {
@@ -404,9 +405,10 @@
             // Update count displays
             document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString();
 
-            // Count all different items including duplicates
+            // Count all different items including duplicates and mixed duplicates
             const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count +
-                                   results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count;
+                                   results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count +
+                                   (results.mismatches.mixed_duplicates_count || 0);
             document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString();
 
             // Update Summary tab (matched items)
@@ -444,47 +446,70 @@
             const tbody = document.getElementById('different-table');
             tbody.innerHTML = '';
 
-            // Create sets of duplicate items for highlighting
-            const kstDuplicateKeys = new Set();
-            const coordiDuplicateKeys = new Set();
-
-            mismatchDetails.kst_duplicates.forEach(item => {
-                kstDuplicateKeys.add(`${item.title}_${item.episode}`);
-            });
-
-            mismatchDetails.coordi_duplicates.forEach(item => {
-                coordiDuplicateKeys.add(`${item.title}_${item.episode}`);
-            });
-
-            // Combine only KST-only and Coordi-only items (like before)
             const allDifferences = [];
 
-            // Add KST-only items
+            // Add KST-only items (no special highlighting)
             mismatchDetails.kst_only.forEach(item => {
-                const key = `${item.title}_${item.episode}`;
                 allDifferences.push({
                     kstData: `${item.title} - Episode ${item.episode}`,
                     coordiData: '',
                     reason: 'Only appears in KST',
                     sortTitle: item.title,
                     sortEpisode: parseFloat(item.episode) || 0,
-                    isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate
+                    highlightType: 'none'
                 });
             });
 
-            // Add Coordi-only items
+            // Add Coordi-only items (no special highlighting)
             mismatchDetails.coordi_only.forEach(item => {
-                const key = `${item.title}_${item.episode}`;
                 allDifferences.push({
                     kstData: '',
                     coordiData: `${item.title} - Episode ${item.episode}`,
                     reason: 'Only appears in Coordi',
                     sortTitle: item.title,
                     sortEpisode: parseFloat(item.episode) || 0,
-                    isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate
+                    highlightType: 'none'
                 });
             });
 
+            // Add KST duplicates (red highlighting)
+            mismatchDetails.kst_duplicates.forEach(item => {
+                allDifferences.push({
+                    kstData: `${item.title} - Episode ${item.episode}`,
+                    coordiData: '',
+                    reason: 'Duplicate entry in KST data',
+                    sortTitle: item.title,
+                    sortEpisode: parseFloat(item.episode) || 0,
+                    highlightType: 'red'
+                });
+            });
+
+            // Add Coordi duplicates (red highlighting)
+            mismatchDetails.coordi_duplicates.forEach(item => {
+                allDifferences.push({
+                    kstData: '',
+                    coordiData: `${item.title} - Episode ${item.episode}`,
+                    reason: 'Duplicate entry in Coordi data',
+                    sortTitle: item.title,
+                    sortEpisode: parseFloat(item.episode) || 0,
+                    highlightType: 'red'
+                });
+            });
+
+            // Add mixed duplicates (yellow highlighting)
+            if (mismatchDetails.mixed_duplicates) {
+                mismatchDetails.mixed_duplicates.forEach(item => {
+                    allDifferences.push({
+                        kstData: item.duplicate_side === 'KST' ? `${item.title} - Episode ${item.episode}` : `${item.title} - Episode ${item.episode}`,
+                        coordiData: item.duplicate_side === 'COORDI' ? `${item.title} - Episode ${item.episode}` : `${item.title} - Episode ${item.episode}`,
+                        reason: item.reason,
+                        sortTitle: item.title,
+                        sortEpisode: parseFloat(item.episode) || 0,
+                        highlightType: 'yellow'
+                    });
+                });
+            }
+
             // Sort by Korean title + episode
             allDifferences.sort((a, b) => {
                 const titleCompare = a.sortTitle.localeCompare(b.sortTitle, 'ko');
@@ -499,10 +524,13 @@
                 row.insertCell(1).textContent = diff.coordiData;
                 row.insertCell(2).textContent = diff.reason;
 
-                // Highlight row in yellow if it's also a duplicate
-                if (diff.isDuplicate) {
+                // Apply highlighting based on type
+                if (diff.highlightType === 'red') {
+                    row.style.backgroundColor = '#f8d7da'; // Light red
+                    row.title = 'Pure duplicate entry';
+                } else if (diff.highlightType === 'yellow') {
                     row.style.backgroundColor = '#fff3cd'; // Light yellow
-                    row.title = 'This item also has duplicates in the dataset';
+                    row.title = 'Item exists in both datasets but has duplicates on one side';
                 }
             });
         }
@@ -1,64 +0,0 @@
-from data_comparator import KSTCoordiComparator
-
-
-def test_duplicate_detection():
-    comparator = KSTCoordiComparator('data/sample-data.xlsx')
-    if comparator.load_data():
-        print("=== DUPLICATE DETECTION TEST ===")
-
-        # Get the data extraction results
-        data = comparator.extract_kst_coordi_items()
-
-        print(f"Total KST items (unique): {len(data['kst_items'])}")
-        print(f"Total KST items (all): {len(data['kst_all_items'])}")
-        print(f"Total Coordi items (unique): {len(data['coordi_items'])}")
-        print(f"Total Coordi items (all): {len(data['coordi_all_items'])}")
-
-        # Check for duplicates
-        categorization = comparator.categorize_mismatches()
-
-        print(f"\nKST duplicates found: {len(categorization['kst_duplicates'])}")
-        print(f"Coordi duplicates found: {len(categorization['coordi_duplicates'])}")
-
-        # Show sample duplicates
-        if categorization['kst_duplicates']:
-            print("\nSample KST duplicates:")
-            for i, dup in enumerate(categorization['kst_duplicates'][:3]):
-                print(f"  {i+1}. {dup.title} - Episode {dup.episode} (Sheet: {dup.source_sheet}, Row: {dup.row_index + 1})")
-
-        if categorization['coordi_duplicates']:
-            print("\nSample Coordi duplicates:")
-            for i, dup in enumerate(categorization['coordi_duplicates'][:3]):
-                print(f"  {i+1}. {dup.title} - Episode {dup.episode} (Sheet: {dup.source_sheet}, Row: {dup.row_index + 1})")
-
-        # Check for the specific example: 백라이트 - Episode 53-1x(휴재)
-        mismatch_details = comparator.generate_mismatch_details()
-
-        print(f"\nLooking for '백라이트 - Episode 53-1x(휴재)':")
-
-        # Check in KST-only
-        backlight_kst_only = [item for item in mismatch_details['kst_only']
-                              if '백라이트' in item['title'] and '53-1x' in item['episode']]
-
-        # Check in KST duplicates
-        backlight_kst_dup = [item for item in mismatch_details['kst_duplicates']
-                             if '백라이트' in item['title'] and '53-1x' in item['episode']]
-
-        print(f"  Found in KST-only: {len(backlight_kst_only)}")
-        print(f"  Found in KST duplicates: {len(backlight_kst_dup)}")
-
-        if backlight_kst_only:
-            print(f"  KST-only details: {backlight_kst_only[0]}")
-        if backlight_kst_dup:
-            print(f"  KST duplicate details: {backlight_kst_dup[0]}")
-
-        # Test the web interface logic
-        print(f"\n=== Testing Web Interface Logic ===")
-        summary = comparator.get_comparison_summary()
-        print(f"Web interface will show:")
-        print(f"  Total different items: {summary['mismatches']['kst_only_count'] + summary['mismatches']['coordi_only_count'] + summary['mismatches']['kst_duplicates_count'] + summary['mismatches']['coordi_duplicates_count']}")
-
-        print("\n✓ Duplicate detection test complete!")
-        print("✓ Check the web interface at http://localhost:8080 to see combined reasons")
-
-
-if __name__ == "__main__":
-    test_duplicate_detection()
@@ -1,52 +0,0 @@
-import requests
-
-
-def test_final_duplicate_fix():
-    print("=== FINAL DUPLICATE FIX TEST ===")
-
-    try:
-        # Test the analyze endpoint
-        response = requests.post('http://localhost:8081/analyze',
-                                 json={'file_path': 'data/sample-data.xlsx'},
-                                 timeout=30)
-
-        if response.status_code == 200:
-            data = response.json()
-            if data.get('success'):
-                results = data['results']
-
-                print("✓ Analysis successful!")
-                print(f"  Matched items: {results['matched_items_count']}")
-                print(f"  KST only: {results['mismatches']['kst_only_count']}")
-                print(f"  Coordi only: {results['mismatches']['coordi_only_count']}")
-                print(f"  KST duplicates: {results['mismatches']['kst_duplicates_count']}")
-                print(f"  Coordi duplicates: {results['mismatches']['coordi_duplicates_count']}")
-
-                total_different = (results['mismatches']['kst_only_count'] +
-                                   results['mismatches']['coordi_only_count'] +
-                                   results['mismatches']['kst_duplicates_count'] +
-                                   results['mismatches']['coordi_duplicates_count'])
-                print(f"  Total different items: {total_different}")
-
-                # Check for the specific example
-                kst_duplicates = results['mismatch_details']['kst_duplicates']
-                backlight_duplicates = [item for item in kst_duplicates
-                                        if '백라이트' in item['title'] and '53-1x' in item['episode']]
-
-                if backlight_duplicates:
-                    print(f"\n✓ Found 백라이트 duplicates: {len(backlight_duplicates)}")
-                    print(f"  Example: {backlight_duplicates[0]['title']} - Episode {backlight_duplicates[0]['episode']}")
-
-                print(f"\n✓ Web interface ready at http://localhost:8081")
-                print("✓ The 'Different' tab will now show combined reasons like:")
-                print("  백라이트 - Episode 53-1x(휴재) | (empty) | Only appears in KST + Duplicate in KST")
-
-            else:
-                print(f"✗ Analysis failed: {data.get('error')}")
-        else:
-            print(f"✗ Request failed: {response.status_code}")
-
-    except requests.exceptions.RequestException as e:
-        print(f"✗ Request failed: {e}")
-
-
-if __name__ == "__main__":
-    test_final_duplicate_fix()
test_sheet_filtering.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+from data_comparator import KSTCoordiComparator
+
+
+def test_sheet_filtering():
+    """Test that sheet filtering works correctly and defaults to first sheet"""
+    print("Testing sheet filtering functionality...")
+
+    # Create comparator and load data
+    comparator = KSTCoordiComparator("data/sample-data.xlsx")
+    if not comparator.load_data():
+        print("Failed to load data!")
+        return
+
+    print(f"Available sheets: {list(comparator.data.keys())}")
+
+    # Test 1: No sheet filter provided (should default to first sheet)
+    print("\n=== TEST 1: No sheet filter (should default to first sheet) ===")
+    try:
+        summary1 = comparator.get_comparison_summary()
+        print(f"Default sheet selected: {summary1['current_sheet_filter']}")
+        print(f"KST total: {summary1['original_counts']['kst_total']}")
+        print(f"Coordi total: {summary1['original_counts']['coordi_total']}")
+        print(f"Matched: {summary1['matched_items_count']}")
+        print("✓ Test 1 passed")
+    except Exception as e:
+        print(f"✗ Test 1 failed: {e}")
+
+    # Test 2: Specific sheet filter
+    sheet_names = list(comparator.data.keys())
+    if len(sheet_names) > 1:
+        second_sheet = sheet_names[1]
+        print(f"\n=== TEST 2: Specific sheet filter ({second_sheet}) ===")
+        try:
+            summary2 = comparator.get_comparison_summary(second_sheet)
+            print(f"Selected sheet: {summary2['current_sheet_filter']}")
+            print(f"KST total: {summary2['original_counts']['kst_total']}")
+            print(f"Coordi total: {summary2['original_counts']['coordi_total']}")
+            print(f"Matched: {summary2['matched_items_count']}")
+            print("✓ Test 2 passed")
+        except Exception as e:
+            print(f"✗ Test 2 failed: {e}")
+    else:
+        print("\n=== TEST 2: Skipped (only one sheet available) ===")
+
+    # Test 3: Verify no duplicates across sheets (this was the original problem)
+    print(f"\n=== TEST 3: Verify duplicate detection within single sheets only ===")
+    for sheet_name in sheet_names:
+        summary = comparator.get_comparison_summary(sheet_name)
+        print(f"Sheet '{sheet_name}':")
+        print(f"  KST duplicates: {summary['mismatches']['kst_duplicates_count']}")
+        print(f"  Coordi duplicates: {summary['mismatches']['coordi_duplicates_count']}")
+
+    print("\n✓ All tests completed!")
+
+
+if __name__ == "__main__":
+    test_sheet_filtering()
@@ -1,68 +0,0 @@
-import requests
-
-
-def test_simplified_duplicates():
-    print("=== SIMPLIFIED DUPLICATE DISPLAY TEST ===")
-
-    try:
-        # Test the analyze endpoint
-        response = requests.post('http://localhost:8081/analyze',
-                                 json={'file_path': 'data/sample-data.xlsx'},
-                                 timeout=30)
-
-        if response.status_code == 200:
-            data = response.json()
-            if data.get('success'):
-                results = data['results']
-
-                print("✓ Analysis successful!")
-                print(f"  Matched items: {results['matched_items_count']}")
-                print(f"  KST only: {results['mismatches']['kst_only_count']}")
-                print(f"  Coordi only: {results['mismatches']['coordi_only_count']}")
-                print(f"  KST duplicates: {results['mismatches']['kst_duplicates_count']}")
-                print(f"  Coordi duplicates: {results['mismatches']['coordi_duplicates_count']}")
-
-                # What the count will show
-                total_count = (results['mismatches']['kst_only_count'] +
-                               results['mismatches']['coordi_only_count'] +
-                               results['mismatches']['kst_duplicates_count'] +
-                               results['mismatches']['coordi_duplicates_count'])
-
-                # What the table will show
-                table_rows = results['mismatches']['kst_only_count'] + results['mismatches']['coordi_only_count']
-
-                print(f"\n📊 DISPLAY LOGIC:")
-                print(f"  Count badge shows: {total_count} items (all different items)")
-                print(f"  Table shows: {table_rows} rows (only KST-only + Coordi-only)")
-                print(f"  Yellow highlights: Items that are also duplicates")
-
-                # Check for 백라이트 example
-                kst_only = results['mismatch_details']['kst_only']
-                kst_duplicates = results['mismatch_details']['kst_duplicates']
-
-                backlight_kst_only = [item for item in kst_only
-                                      if '백라이트' in item['title'] and '53-1x' in item['episode']]
-                backlight_kst_dup = [item for item in kst_duplicates
-                                     if '백라이트' in item['title'] and '53-1x' in item['episode']]
-
-                if backlight_kst_only and backlight_kst_dup:
-                    print(f"\n✓ 백라이트 example works:")
-                    print(f"  - Appears in table (KST-only): YES")
-                    print(f"  - Will be highlighted yellow: YES (also duplicate)")
-                    print(f"  - Contributes to count: 2 items (1 KST-only + 1 duplicate)")
-
-                print(f"\n✓ Web interface ready at http://localhost:8081")
-                print("✓ Check the 'Different' tab:")
-                print("  - Count shows all different items")
-                print("  - Table shows only KST-only + Coordi-only")
-                print("  - Yellow rows = items that also have duplicates")
-
-            else:
-                print(f"✗ Analysis failed: {data.get('error')}")
-        else:
-            print(f"✗ Request failed: {response.status_code}")
-
-    except requests.exceptions.RequestException as e:
-        print(f"✗ Request failed: {e}")
-
-
-if __name__ == "__main__":
-    test_simplified_duplicates()
web_gui.py (90 changed lines)
@@ -42,7 +42,7 @@ def analyze_data():
     matched_items = list(categorization['matched_items'])
 
     # Filter matched items by sheet if specified
-    if sheet_filter and sheet_filter != 'All Sheets':
+    if sheet_filter:
        matched_items = [item for item in matched_items if item.source_sheet == sheet_filter]
 
     # Format matched items for JSON (limit to first 500 for performance)
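For reference, a hedged sketch of how a client might now call the /analyze endpoint with the simplified sheet_filter payload (the host/port follow the deleted test scripts above; the sheet name 'KST' is a placeholder, not a value from this commit):

    import requests

    # Illustrative only: sheet_filter is now sent as a plain sheet name; if it is
    # omitted or empty, the backend falls back to the first sheet rather than an
    # "All Sheets" view.
    response = requests.post('http://localhost:8081/analyze',
                             json={'file_path': 'data/sample-data.xlsx',
                                   'sheet_filter': 'KST'},
                             timeout=30)
    print(response.json().get('success'))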
@@ -291,7 +291,7 @@ def create_templates_dir():
         <div class="file-input" style="margin-top: 10px;">
             <label for="sheetFilter">Sheet Filter:</label>
             <select id="sheetFilter" onchange="filterBySheet()" disabled>
-                <option value="All Sheets">All Sheets</option>
+                <!-- Options will be populated dynamically -->
             </select>
         </div>
         <div id="status"></div>
@@ -383,7 +383,7 @@ def create_templates_dir():
             },
             body: JSON.stringify({
                 file_path: filePath,
-                sheet_filter: sheetFilter === 'All Sheets' ? null : sheetFilter
+                sheet_filter: sheetFilter
             })
         })
         .then(response => response.json())
@@ -408,13 +408,14 @@ def create_templates_dir():
 
         function updateSheetFilter(sheetNames, currentFilter) {
             const select = document.getElementById('sheetFilter');
-            select.innerHTML = '<option value="All Sheets">All Sheets</option>';
+            select.innerHTML = '';
 
-            sheetNames.forEach(sheetName => {
+            sheetNames.forEach((sheetName, index) => {
                 const option = document.createElement('option');
                 option.value = sheetName;
                 option.textContent = sheetName;
-                if (sheetName === currentFilter) {
+                // Select the first sheet by default, or the current filter if specified
+                if (sheetName === currentFilter || (!currentFilter && index === 0)) {
                     option.selected = true;
                 }
                 select.appendChild(option);
@@ -480,7 +481,7 @@ def create_templates_dir():
             },
             body: JSON.stringify({
                 file_path: data.file_path,
-                sheet_filter: sheetFilter === 'All Sheets' ? null : sheetFilter
+                sheet_filter: sheetFilter
             })
         });
     } else {
@@ -512,9 +513,10 @@ def create_templates_dir():
             // Update count displays
             document.getElementById('matched-count-display').textContent = results.matched_items_count.toLocaleString();
 
-            // Count all different items including duplicates
+            // Count all different items including duplicates and mixed duplicates
             const totalDifferent = results.mismatches.kst_only_count + results.mismatches.coordi_only_count +
-                                   results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count;
+                                   results.mismatches.kst_duplicates_count + results.mismatches.coordi_duplicates_count +
+                                   (results.mismatches.mixed_duplicates_count || 0);
             document.getElementById('different-count-display').textContent = totalDifferent.toLocaleString();
 
             // Update Summary tab (matched items)
@@ -552,47 +554,70 @@ def create_templates_dir():
             const tbody = document.getElementById('different-table');
             tbody.innerHTML = '';
 
-            // Create sets of duplicate items for highlighting
-            const kstDuplicateKeys = new Set();
-            const coordiDuplicateKeys = new Set();
-
-            mismatchDetails.kst_duplicates.forEach(item => {
-                kstDuplicateKeys.add(`${item.title}_${item.episode}`);
-            });
-
-            mismatchDetails.coordi_duplicates.forEach(item => {
-                coordiDuplicateKeys.add(`${item.title}_${item.episode}`);
-            });
-
-            // Combine only KST-only and Coordi-only items (like before)
             const allDifferences = [];
 
-            // Add KST-only items
+            // Add KST-only items (no special highlighting)
             mismatchDetails.kst_only.forEach(item => {
-                const key = `${item.title}_${item.episode}`;
                 allDifferences.push({
                     kstData: `${item.title} - Episode ${item.episode}`,
                     coordiData: '',
                     reason: 'Only appears in KST',
                     sortTitle: item.title,
                     sortEpisode: parseFloat(item.episode) || 0,
-                    isDuplicate: kstDuplicateKeys.has(key) // Check if this item is also a duplicate
+                    highlightType: 'none'
                 });
             });
 
-            // Add Coordi-only items
+            // Add Coordi-only items (no special highlighting)
             mismatchDetails.coordi_only.forEach(item => {
-                const key = `${item.title}_${item.episode}`;
                 allDifferences.push({
                     kstData: '',
                     coordiData: `${item.title} - Episode ${item.episode}`,
                     reason: 'Only appears in Coordi',
                     sortTitle: item.title,
                     sortEpisode: parseFloat(item.episode) || 0,
-                    isDuplicate: coordiDuplicateKeys.has(key) // Check if this item is also a duplicate
+                    highlightType: 'none'
                 });
             });
 
+            // Add KST duplicates (red highlighting)
+            mismatchDetails.kst_duplicates.forEach(item => {
+                allDifferences.push({
+                    kstData: `${item.title} - Episode ${item.episode}`,
+                    coordiData: '',
+                    reason: 'Duplicate entry in KST data',
+                    sortTitle: item.title,
+                    sortEpisode: parseFloat(item.episode) || 0,
+                    highlightType: 'red'
+                });
+            });
+
+            // Add Coordi duplicates (red highlighting)
+            mismatchDetails.coordi_duplicates.forEach(item => {
+                allDifferences.push({
+                    kstData: '',
+                    coordiData: `${item.title} - Episode ${item.episode}`,
+                    reason: 'Duplicate entry in Coordi data',
+                    sortTitle: item.title,
+                    sortEpisode: parseFloat(item.episode) || 0,
+                    highlightType: 'red'
+                });
+            });
+
+            // Add mixed duplicates (yellow highlighting)
+            if (mismatchDetails.mixed_duplicates) {
+                mismatchDetails.mixed_duplicates.forEach(item => {
+                    allDifferences.push({
+                        kstData: item.duplicate_side === 'KST' ? `${item.title} - Episode ${item.episode}` : `${item.title} - Episode ${item.episode}`,
+                        coordiData: item.duplicate_side === 'COORDI' ? `${item.title} - Episode ${item.episode}` : `${item.title} - Episode ${item.episode}`,
+                        reason: item.reason,
+                        sortTitle: item.title,
+                        sortEpisode: parseFloat(item.episode) || 0,
+                        highlightType: 'yellow'
+                    });
+                });
+            }
+
             // Sort by Korean title + episode
             allDifferences.sort((a, b) => {
                 const titleCompare = a.sortTitle.localeCompare(b.sortTitle, 'ko');
@@ -607,10 +632,13 @@ def create_templates_dir():
                 row.insertCell(1).textContent = diff.coordiData;
                 row.insertCell(2).textContent = diff.reason;
 
-                // Highlight row in yellow if it's also a duplicate
-                if (diff.isDuplicate) {
+                // Apply highlighting based on type
+                if (diff.highlightType === 'red') {
+                    row.style.backgroundColor = '#f8d7da'; // Light red
+                    row.title = 'Pure duplicate entry';
+                } else if (diff.highlightType === 'yellow') {
                     row.style.backgroundColor = '#fff3cd'; // Light yellow
-                    row.title = 'This item also has duplicates in the dataset';
+                    row.title = 'Item exists in both datasets but has duplicates on one side';
                 }
             });
         }