92 lines
3.0 KiB
Python
92 lines
3.0 KiB
Python
import pandas as pd
|
||
import os
|
||
from pathlib import Path
|
||
|
||
def analyze_excel_structure(file_path):
|
||
"""
|
||
Analyze the structure of an Excel file and return detailed information.
|
||
"""
|
||
if not os.path.exists(file_path):
|
||
return f"Error: File {file_path} does not exist"
|
||
|
||
try:
|
||
# Read Excel file to get sheet names
|
||
excel_file = pd.ExcelFile(file_path)
|
||
|
||
analysis = {
|
||
'file_path': file_path,
|
||
'file_size': os.path.getsize(file_path),
|
||
'sheet_names': excel_file.sheet_names,
|
||
'total_sheets': len(excel_file.sheet_names),
|
||
'sheets_analysis': {}
|
||
}
|
||
|
||
# Analyze each sheet
|
||
for sheet_name in excel_file.sheet_names:
|
||
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
||
|
||
sheet_info = {
|
||
'dimensions': df.shape,
|
||
'columns': list(df.columns),
|
||
'column_count': len(df.columns),
|
||
'row_count': len(df),
|
||
'data_types': df.dtypes.to_dict(),
|
||
'missing_values': df.isnull().sum().to_dict(),
|
||
'sample_data': df.head(3).to_dict('records') if not df.empty else []
|
||
}
|
||
|
||
analysis['sheets_analysis'][sheet_name] = sheet_info
|
||
|
||
return analysis
|
||
|
||
except Exception as e:
|
||
return f"Error analyzing file: {str(e)}"
|
||
|
||
def print_analysis_report(analysis):
|
||
"""
|
||
Print a formatted report of the Excel file analysis.
|
||
"""
|
||
if isinstance(analysis, str):
|
||
print(analysis)
|
||
return
|
||
|
||
print("=" * 60)
|
||
print("EXCEL FILE STRUCTURE ANALYSIS")
|
||
print("=" * 60)
|
||
|
||
print(f"File: {analysis['file_path']}")
|
||
print(f"Size: {analysis['file_size']:,} bytes")
|
||
print(f"Total Sheets: {analysis['total_sheets']}")
|
||
print(f"Sheet Names: {', '.join(analysis['sheet_names'])}")
|
||
print()
|
||
|
||
for sheet_name, sheet_info in analysis['sheets_analysis'].items():
|
||
print(f"--- SHEET: {sheet_name} ---")
|
||
print(f"Dimensions: {sheet_info['dimensions'][0]} rows × {sheet_info['dimensions'][1]} columns")
|
||
print(f"Columns: {', '.join(sheet_info['columns'])}")
|
||
print()
|
||
|
||
print("Data Types:")
|
||
for col, dtype in sheet_info['data_types'].items():
|
||
print(f" {col}: {dtype}")
|
||
print()
|
||
|
||
print("Missing Values:")
|
||
for col, missing in sheet_info['missing_values'].items():
|
||
if missing > 0:
|
||
print(f" {col}: {missing} missing values")
|
||
print()
|
||
|
||
if sheet_info['sample_data']:
|
||
print("Sample Data (first 3 rows):")
|
||
for i, row in enumerate(sheet_info['sample_data'], 1):
|
||
print(f" Row {i}: {row}")
|
||
print()
|
||
|
||
if __name__ == "__main__":
|
||
# Analyze the sample Excel file
|
||
file_path = "data/sample-data.xlsx"
|
||
|
||
print("Starting Excel file analysis...")
|
||
analysis = analyze_excel_structure(file_path)
|
||
print_analysis_report(analysis) |