data-comparison/analyze_excel.py
2025-08-20 14:03:31 +07:00

92 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import os
from pathlib import Path
def analyze_excel_structure(file_path):
"""
Analyze the structure of an Excel file and return detailed information.
"""
if not os.path.exists(file_path):
return f"Error: File {file_path} does not exist"
try:
# Read Excel file to get sheet names
excel_file = pd.ExcelFile(file_path)
analysis = {
'file_path': file_path,
'file_size': os.path.getsize(file_path),
'sheet_names': excel_file.sheet_names,
'total_sheets': len(excel_file.sheet_names),
'sheets_analysis': {}
}
# Analyze each sheet
for sheet_name in excel_file.sheet_names:
df = pd.read_excel(file_path, sheet_name=sheet_name)
sheet_info = {
'dimensions': df.shape,
'columns': list(df.columns),
'column_count': len(df.columns),
'row_count': len(df),
'data_types': df.dtypes.to_dict(),
'missing_values': df.isnull().sum().to_dict(),
'sample_data': df.head(3).to_dict('records') if not df.empty else []
}
analysis['sheets_analysis'][sheet_name] = sheet_info
return analysis
except Exception as e:
return f"Error analyzing file: {str(e)}"
def print_analysis_report(analysis):
"""
Print a formatted report of the Excel file analysis.
"""
if isinstance(analysis, str):
print(analysis)
return
print("=" * 60)
print("EXCEL FILE STRUCTURE ANALYSIS")
print("=" * 60)
print(f"File: {analysis['file_path']}")
print(f"Size: {analysis['file_size']:,} bytes")
print(f"Total Sheets: {analysis['total_sheets']}")
print(f"Sheet Names: {', '.join(analysis['sheet_names'])}")
print()
for sheet_name, sheet_info in analysis['sheets_analysis'].items():
print(f"--- SHEET: {sheet_name} ---")
print(f"Dimensions: {sheet_info['dimensions'][0]} rows × {sheet_info['dimensions'][1]} columns")
print(f"Columns: {', '.join(sheet_info['columns'])}")
print()
print("Data Types:")
for col, dtype in sheet_info['data_types'].items():
print(f" {col}: {dtype}")
print()
print("Missing Values:")
for col, missing in sheet_info['missing_values'].items():
if missing > 0:
print(f" {col}: {missing} missing values")
print()
if sheet_info['sample_data']:
print("Sample Data (first 3 rows):")
for i, row in enumerate(sheet_info['sample_data'], 1):
print(f" Row {i}: {row}")
print()
if __name__ == "__main__":
# Analyze the sample Excel file
file_path = "data/sample-data.xlsx"
print("Starting Excel file analysis...")
analysis = analyze_excel_structure(file_path)
print_analysis_report(analysis)