92 lines
3.0 KiB
Python
92 lines
3.0 KiB
Python
|
|
import pandas as pd
|
|||
|
|
import os
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
def analyze_excel_structure(file_path):
|
|||
|
|
"""
|
|||
|
|
Analyze the structure of an Excel file and return detailed information.
|
|||
|
|
"""
|
|||
|
|
if not os.path.exists(file_path):
|
|||
|
|
return f"Error: File {file_path} does not exist"
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
# Read Excel file to get sheet names
|
|||
|
|
excel_file = pd.ExcelFile(file_path)
|
|||
|
|
|
|||
|
|
analysis = {
|
|||
|
|
'file_path': file_path,
|
|||
|
|
'file_size': os.path.getsize(file_path),
|
|||
|
|
'sheet_names': excel_file.sheet_names,
|
|||
|
|
'total_sheets': len(excel_file.sheet_names),
|
|||
|
|
'sheets_analysis': {}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Analyze each sheet
|
|||
|
|
for sheet_name in excel_file.sheet_names:
|
|||
|
|
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
|||
|
|
|
|||
|
|
sheet_info = {
|
|||
|
|
'dimensions': df.shape,
|
|||
|
|
'columns': list(df.columns),
|
|||
|
|
'column_count': len(df.columns),
|
|||
|
|
'row_count': len(df),
|
|||
|
|
'data_types': df.dtypes.to_dict(),
|
|||
|
|
'missing_values': df.isnull().sum().to_dict(),
|
|||
|
|
'sample_data': df.head(3).to_dict('records') if not df.empty else []
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
analysis['sheets_analysis'][sheet_name] = sheet_info
|
|||
|
|
|
|||
|
|
return analysis
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
return f"Error analyzing file: {str(e)}"
|
|||
|
|
|
|||
|
|
def print_analysis_report(analysis):
|
|||
|
|
"""
|
|||
|
|
Print a formatted report of the Excel file analysis.
|
|||
|
|
"""
|
|||
|
|
if isinstance(analysis, str):
|
|||
|
|
print(analysis)
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
print("=" * 60)
|
|||
|
|
print("EXCEL FILE STRUCTURE ANALYSIS")
|
|||
|
|
print("=" * 60)
|
|||
|
|
|
|||
|
|
print(f"File: {analysis['file_path']}")
|
|||
|
|
print(f"Size: {analysis['file_size']:,} bytes")
|
|||
|
|
print(f"Total Sheets: {analysis['total_sheets']}")
|
|||
|
|
print(f"Sheet Names: {', '.join(analysis['sheet_names'])}")
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
for sheet_name, sheet_info in analysis['sheets_analysis'].items():
|
|||
|
|
print(f"--- SHEET: {sheet_name} ---")
|
|||
|
|
print(f"Dimensions: {sheet_info['dimensions'][0]} rows × {sheet_info['dimensions'][1]} columns")
|
|||
|
|
print(f"Columns: {', '.join(sheet_info['columns'])}")
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
print("Data Types:")
|
|||
|
|
for col, dtype in sheet_info['data_types'].items():
|
|||
|
|
print(f" {col}: {dtype}")
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
print("Missing Values:")
|
|||
|
|
for col, missing in sheet_info['missing_values'].items():
|
|||
|
|
if missing > 0:
|
|||
|
|
print(f" {col}: {missing} missing values")
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
if sheet_info['sample_data']:
|
|||
|
|
print("Sample Data (first 3 rows):")
|
|||
|
|
for i, row in enumerate(sheet_info['sample_data'], 1):
|
|||
|
|
print(f" Row {i}: {row}")
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
# Analyze the sample Excel file
|
|||
|
|
file_path = "data/sample-data.xlsx"
|
|||
|
|
|
|||
|
|
print("Starting Excel file analysis...")
|
|||
|
|
analysis = analyze_excel_structure(file_path)
|
|||
|
|
print_analysis_report(analysis)
|