import pandas as pd import os from pathlib import Path def analyze_excel_structure(file_path): """ Analyze the structure of an Excel file and return detailed information. """ if not os.path.exists(file_path): return f"Error: File {file_path} does not exist" try: # Read Excel file to get sheet names excel_file = pd.ExcelFile(file_path) analysis = { 'file_path': file_path, 'file_size': os.path.getsize(file_path), 'sheet_names': excel_file.sheet_names, 'total_sheets': len(excel_file.sheet_names), 'sheets_analysis': {} } # Analyze each sheet for sheet_name in excel_file.sheet_names: df = pd.read_excel(file_path, sheet_name=sheet_name) sheet_info = { 'dimensions': df.shape, 'columns': list(df.columns), 'column_count': len(df.columns), 'row_count': len(df), 'data_types': df.dtypes.to_dict(), 'missing_values': df.isnull().sum().to_dict(), 'sample_data': df.head(3).to_dict('records') if not df.empty else [] } analysis['sheets_analysis'][sheet_name] = sheet_info return analysis except Exception as e: return f"Error analyzing file: {str(e)}" def print_analysis_report(analysis): """ Print a formatted report of the Excel file analysis. """ if isinstance(analysis, str): print(analysis) return print("=" * 60) print("EXCEL FILE STRUCTURE ANALYSIS") print("=" * 60) print(f"File: {analysis['file_path']}") print(f"Size: {analysis['file_size']:,} bytes") print(f"Total Sheets: {analysis['total_sheets']}") print(f"Sheet Names: {', '.join(analysis['sheet_names'])}") print() for sheet_name, sheet_info in analysis['sheets_analysis'].items(): print(f"--- SHEET: {sheet_name} ---") print(f"Dimensions: {sheet_info['dimensions'][0]} rows × {sheet_info['dimensions'][1]} columns") print(f"Columns: {', '.join(sheet_info['columns'])}") print() print("Data Types:") for col, dtype in sheet_info['data_types'].items(): print(f" {col}: {dtype}") print() print("Missing Values:") for col, missing in sheet_info['missing_values'].items(): if missing > 0: print(f" {col}: {missing} missing values") print() if sheet_info['sample_data']: print("Sample Data (first 3 rows):") for i, row in enumerate(sheet_info['sample_data'], 1): print(f" Row {i}: {row}") print() if __name__ == "__main__": # Analyze the sample Excel file file_path = "data/sample-data.xlsx" print("Starting Excel file analysis...") analysis = analyze_excel_structure(file_path) print_analysis_report(analysis)