data-comparison/analyze_excel.py

92 lines
3.0 KiB
Python
Raw Normal View History

2025-08-20 07:03:31 +00:00
import pandas as pd
import os
from pathlib import Path
def analyze_excel_structure(file_path):
"""
Analyze the structure of an Excel file and return detailed information.
"""
if not os.path.exists(file_path):
return f"Error: File {file_path} does not exist"
try:
# Read Excel file to get sheet names
excel_file = pd.ExcelFile(file_path)
analysis = {
'file_path': file_path,
'file_size': os.path.getsize(file_path),
'sheet_names': excel_file.sheet_names,
'total_sheets': len(excel_file.sheet_names),
'sheets_analysis': {}
}
# Analyze each sheet
for sheet_name in excel_file.sheet_names:
df = pd.read_excel(file_path, sheet_name=sheet_name)
sheet_info = {
'dimensions': df.shape,
'columns': list(df.columns),
'column_count': len(df.columns),
'row_count': len(df),
'data_types': df.dtypes.to_dict(),
'missing_values': df.isnull().sum().to_dict(),
'sample_data': df.head(3).to_dict('records') if not df.empty else []
}
analysis['sheets_analysis'][sheet_name] = sheet_info
return analysis
except Exception as e:
return f"Error analyzing file: {str(e)}"
def print_analysis_report(analysis):
"""
Print a formatted report of the Excel file analysis.
"""
if isinstance(analysis, str):
print(analysis)
return
print("=" * 60)
print("EXCEL FILE STRUCTURE ANALYSIS")
print("=" * 60)
print(f"File: {analysis['file_path']}")
print(f"Size: {analysis['file_size']:,} bytes")
print(f"Total Sheets: {analysis['total_sheets']}")
print(f"Sheet Names: {', '.join(analysis['sheet_names'])}")
print()
for sheet_name, sheet_info in analysis['sheets_analysis'].items():
print(f"--- SHEET: {sheet_name} ---")
print(f"Dimensions: {sheet_info['dimensions'][0]} rows × {sheet_info['dimensions'][1]} columns")
print(f"Columns: {', '.join(sheet_info['columns'])}")
print()
print("Data Types:")
for col, dtype in sheet_info['data_types'].items():
print(f" {col}: {dtype}")
print()
print("Missing Values:")
for col, missing in sheet_info['missing_values'].items():
if missing > 0:
print(f" {col}: {missing} missing values")
print()
if sheet_info['sample_data']:
print("Sample Data (first 3 rows):")
for i, row in enumerate(sheet_info['sample_data'], 1):
print(f" Row {i}: {row}")
print()
if __name__ == "__main__":
# Analyze the sample Excel file
file_path = "data/sample-data.xlsx"
print("Starting Excel file analysis...")
analysis = analyze_excel_structure(file_path)
print_analysis_report(analysis)