| import pandas as pd | |
| import numpy as np | |
| from scipy import stats | |
class DataAnalyzer:
    """Collect descriptive statistics about a pandas DataFrame."""

    # scipy.stats.normaltest combines skewtest and kurtosistest; skewtest is
    # not defined for fewer than 8 observations.
    _MIN_NORMALTEST_SAMPLES = 8
    # Significance level above which a column is reported as normal.
    _NORMALITY_ALPHA = 0.05

    def analyze(self, data):
        """Return a dictionary of summary insights for ``data``.

        Args:
            data: A pandas DataFrame.

        Returns:
            A dict with these entries:

            * ``'basic_stats'``: ``data.describe()`` as a nested dict.
            * ``'correlations'``: correlation matrix of the numeric columns
              as a nested dict; only present when there are at least two
              numeric columns.
            * ``'distribution'``: ``{'skewness': ..., 'kurtosis': ...}``
              keyed by numeric column name.
            * ``'<column>_distribution'``: value counts for each
              object-dtype column.
            * ``'normality_tests'``: for each numeric column, a dict with
              ``'is_normal'`` (built-in ``bool``) and ``'p_value'``
              (built-in ``float``). When a column has too few non-null
              values for the test, ``'p_value'`` is NaN and
              ``'is_normal'`` is ``False``.
        """
        insights = {}

        # Basic statistics
        insights['basic_stats'] = data.describe().to_dict()

        # Correlation analysis (a correlation matrix needs two or more columns)
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        if len(numeric_columns) > 1:
            insights['correlations'] = data[numeric_columns].corr().to_dict()

        # Skewness and kurtosis
        numeric_data = data[numeric_columns]
        insights['distribution'] = {
            'skewness': numeric_data.skew().to_dict(),
            'kurtosis': numeric_data.kurtosis().to_dict(),
        }

        # Categorical data analysis
        categorical_columns = data.select_dtypes(include=['object']).columns
        for column in categorical_columns:
            insights[f'{column}_distribution'] = (
                data[column].value_counts().to_dict()
            )

        # Check for normality
        insights['normality_tests'] = {
            column: self._normality_test(data[column].dropna())
            for column in numeric_columns
        }

        return insights

    def _normality_test(self, values):
        """Run ``scipy.stats.normaltest`` on ``values`` (NaNs already dropped).

        The test is skipped when fewer than ``_MIN_NORMALTEST_SAMPLES``
        values are available: depending on the SciPy version, calling it on
        such a sample either raises ``ValueError`` or warns and yields NaN.
        Skipping reports the NaN outcome uniformly instead of aborting the
        whole analysis.
        """
        if len(values) < self._MIN_NORMALTEST_SAMPLES:
            p_value = float('nan')
        else:
            _, raw_p_value = stats.normaltest(values)
            # Convert the NumPy scalar so the result holds plain Python types.
            p_value = float(raw_p_value)

        return {
            # Any comparison with NaN is False, so an untestable column is
            # never reported as normal.
            'is_normal': bool(p_value > self._NORMALITY_ALPHA),
            'p_value': p_value,
        }