CCockrum committed
Commit ae9e6ae · verified · 1 Parent(s): 284f32f

Create app.py

Files changed (1)
  1. app.py +1293 -0
app.py ADDED
@@ -0,0 +1,1293 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import re
import json
import os
import sqlite3
from datetime import datetime


class DataAnalysisChatbot:
    def __init__(self):
        self.data = None
        self.data_source = None
        self.conversation_history = []
        self.available_commands = {
            "load": self.load_data,
            "info": self.get_data_info,
            "describe": self.describe_data,
            "missing": self.check_missing_values,
            "correlate": self.correlation_analysis,
            "visualize": self.visualize_data,
            "analyze": self.analyze_column,
            "trend": self.analyze_trend,
            "outliers": self.detect_outliers,
            "predict": self.predictive_analysis,
            "test": self.hypothesis_testing,
            "report": self.generate_report,
            "help": self.get_help
        }

    def process_query(self, query):
        """Process user query and route to appropriate function"""
        # Add the user query to conversation history
        self.conversation_history.append({"role": "user", "message": query, "timestamp": datetime.now()})

        # Check if data is loaded (except for load command and help)
        if self.data is None and not any(cmd in query.lower() for cmd in ["load", "help"]):
            response = "Please load data first using the 'load' command. Example: load csv path/to/file.csv"
            self._add_to_history(response)
            return response

        # Parse the command
        command = self._extract_command(query)

        if command in self.available_commands:
            response = self.available_commands[command](query)
        else:
            # Natural language understanding would go here
            # For now, use simple keyword matching
            if "mean" in query.lower() or "average" in query.lower():
                response = self.analyze_column(query)
            elif "correlate" in query.lower() or "relationship" in query.lower():
                response = self.correlation_analysis(query)
            elif "visual" in query.lower() or "plot" in query.lower() or "chart" in query.lower() or "graph" in query.lower():
                response = self.visualize_data(query)
            else:
                response = "I'm not sure how to process that query. Type 'help' for available commands."

        self._add_to_history(response)
        return response

    def _extract_command(self, query):
        """Extract the main command from the query"""
        words = query.lower().split()
        for word in words:
            if word in self.available_commands:
                return word
        return None

    def _add_to_history(self, response):
        """Add bot response to conversation history"""
        self.conversation_history.append({"role": "bot", "message": response, "timestamp": datetime.now()})

    def _extract_column_names(self, query):
        """Extract column names mentioned in the query"""
        if self.data is None:
            return []

        columns = []
        for col in self.data.columns:
            if col.lower() in query.lower():
                columns.append(col)

        return columns

    # DATA ACCESS AND RETRIEVAL

    def load_data(self, query):
        """Load data from various sources"""
        query_lower = query.lower()

        # CSV Loading
        if "csv" in query_lower:
            match = re.search(r'load\s+csv\s+(.+?)(?:\s|$)', query)
            if match:
                file_path = match.group(1)
                try:
                    self.data = pd.read_csv(file_path)
                    self.data_source = f"CSV: {file_path}"
                    return f"Successfully loaded data from {file_path}. {len(self.data)} rows and {len(self.data.columns)} columns found."
                except Exception as e:
                    return f"Error loading CSV file: {str(e)}"

        # Excel Loading
        elif "excel" in query_lower or "xlsx" in query_lower:
            match = re.search(r'load\s+(?:excel|xlsx)\s+(.+?)(?:\s|$)', query)
            if match:
                file_path = match.group(1)
                try:
                    self.data = pd.read_excel(file_path)
                    self.data_source = f"Excel: {file_path}"
                    return f"Successfully loaded data from Excel file {file_path}. {len(self.data)} rows and {len(self.data.columns)} columns found."
                except Exception as e:
                    return f"Error loading Excel file: {str(e)}"

        # SQL Database Loading
        elif "sql" in query_lower or "database" in query_lower:
            # Extract database path and query using regex; the SQL text runs to the end of the input
            db_match = re.search(r'load\s+(?:sql|database)\s+(\S+)\s+query\s+(.+)$', query, re.IGNORECASE | re.DOTALL)
            if db_match:
                db_path = db_match.group(1)
                sql_query = db_match.group(2)
                try:
                    conn = sqlite3.connect(db_path)
                    self.data = pd.read_sql_query(sql_query, conn)
                    conn.close()
                    self.data_source = f"SQL: {db_path}, Query: {sql_query}"
                    return f"Successfully loaded data from SQL query. {len(self.data)} rows and {len(self.data.columns)} columns found."
                except Exception as e:
                    return f"Error executing SQL query: {str(e)}"

        # JSON Loading
        elif "json" in query_lower:
            match = re.search(r'load\s+json\s+(.+?)(?:\s|$)', query)
            if match:
                file_path = match.group(1)
                try:
                    with open(file_path, 'r') as f:
                        json_data = json.load(f)
                    self.data = pd.json_normalize(json_data)
                    self.data_source = f"JSON: {file_path}"
                    return f"Successfully loaded data from JSON file {file_path}. {len(self.data)} rows and {len(self.data.columns)} columns found."
                except Exception as e:
                    return f"Error loading JSON file: {str(e)}"

        return "Please specify the data source format and path. Example: 'load csv data.csv' or 'load sql database.db query SELECT * FROM table'"

    def get_data_info(self, query):
        """Get basic information about the loaded data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        info = f"Data Source: {self.data_source}\n"
        info += f"Rows: {len(self.data)}\n"
        info += f"Columns: {len(self.data.columns)}\n"
        info += f"Column Names: {', '.join(self.data.columns)}\n"
        info += f"Data Types:\n{self.data.dtypes.to_string()}\n"

        memory_usage = self.data.memory_usage(deep=True).sum()
        if memory_usage < 1024:
            memory_str = f"{memory_usage} bytes"
        elif memory_usage < 1024 * 1024:
            memory_str = f"{memory_usage / 1024:.2f} KB"
        else:
            memory_str = f"{memory_usage / (1024 * 1024):.2f} MB"

        info += f"Memory Usage: {memory_str}"

        return info

    def describe_data(self, query):
        """Provide descriptive statistics for the data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Check if specific columns are mentioned
        columns = self._extract_column_names(query)

        if columns:
            try:
                desc = self.data[columns].describe().to_string()
                return f"Descriptive statistics for columns {', '.join(columns)}:\n{desc}"
            except Exception as e:
                return f"Error generating descriptive statistics: {str(e)}"
        else:
            # If no specific columns mentioned, describe all numeric columns
            numeric_cols = self.data.select_dtypes(include=['number']).columns.tolist()
            if not numeric_cols:
                return "No numeric columns found in the data for descriptive statistics."

            desc = self.data[numeric_cols].describe().to_string()
            return f"Descriptive statistics for all numeric columns:\n{desc}"

    def check_missing_values(self, query):
        """Check for missing values in the data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        missing_values = self.data.isnull().sum()
        missing_percentage = (missing_values / len(self.data) * 100).round(2)

        result = "Missing Values Analysis:\n"
        for col, count in missing_values.items():
            if count > 0:
                result += f"{col}: {count} missing values ({missing_percentage[col]}%)\n"

        if not any(missing_values > 0):
            result += "No missing values found in the dataset."
        else:
            total_missing = missing_values.sum()
            total_cells = self.data.size
            overall_percentage = (total_missing / total_cells * 100).round(2)
            result += f"\nOverall: {total_missing} missing values out of {total_cells} cells ({overall_percentage}%)"

        return result

    # DATA ANALYSIS AND INTERPRETATION

    def analyze_column(self, query):
        """Analyze a specific column"""
        if self.data is None:
            return "No data loaded. Please load data first."

        columns = self._extract_column_names(query)

        if not columns:
            return "Please specify a column name to analyze. Available columns: " + ", ".join(self.data.columns)

        column = columns[0]  # Take the first column mentioned

        try:
            col_data = self.data[column]

            if pd.api.types.is_numeric_dtype(col_data):
                # Numeric column analysis (dict named col_stats to avoid shadowing scipy.stats)
                col_stats = {
                    "Count": len(col_data),
                    "Missing": col_data.isnull().sum(),
                    "Mean": col_data.mean(),
                    "Median": col_data.median(),
                    "Mode": col_data.mode()[0] if not col_data.mode().empty else None,
                    "Std Dev": col_data.std(),
                    "Min": col_data.min(),
                    "Max": col_data.max(),
                    "25%": col_data.quantile(0.25),
                    "75%": col_data.quantile(0.75),
                    "Skewness": col_data.skew(),
                    "Kurtosis": col_data.kurt()
                }

                result = f"Analysis of column '{column}' (Numeric):\n"
                for stat_name, stat_value in col_stats.items():
                    if isinstance(stat_value, float):
                        result += f"{stat_name}: {stat_value:.4f}\n"
                    else:
                        result += f"{stat_name}: {stat_value}\n"

                # Check for outliers using IQR method
                Q1 = col_stats["25%"]
                Q3 = col_stats["75%"]
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]

                result += f"Outliers (IQR method): {len(outliers)} found\n"

                # Add histogram data as ASCII art or description
                hist_data = np.histogram(col_data.dropna(), bins=10)
                result += "\nDistribution Summary:\n"
                max_count = max(hist_data[0])
                for i, count in enumerate(hist_data[0]):
                    bin_start = f"{hist_data[1][i]:.2f}"
                    bin_end = f"{hist_data[1][i+1]:.2f}"
                    bar_length = int((count / max_count) * 20)
                    result += f"{bin_start} to {bin_end}: {'#' * bar_length} ({count})\n"

            else:
                # Categorical column analysis
                value_counts = col_data.value_counts()
                top_n = min(10, len(value_counts))

                result = f"Analysis of column '{column}' (Categorical):\n"
                result += f"Count: {len(col_data)}\n"
                result += f"Missing: {col_data.isnull().sum()}\n"
                result += f"Unique Values: {col_data.nunique()}\n"

                result += f"\nTop {top_n} values:\n"
                for value, count in value_counts.head(top_n).items():
                    percentage = (count / len(col_data)) * 100
                    result += f"{value}: {count} ({percentage:.2f}%)\n"

            return result

        except Exception as e:
            return f"Error analyzing column '{column}': {str(e)}"

    def correlation_analysis(self, query):
        """Analyze correlations between columns"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Extract specific columns if mentioned
        columns = self._extract_column_names(query)

        # If no specific columns or fewer than 2 columns mentioned, use all numeric columns
        if len(columns) < 2:
            numeric_columns = self.data.select_dtypes(include=['number']).columns.tolist()
            if len(numeric_columns) < 2:
                return "Not enough numeric columns for correlation analysis."

            # If we found numeric columns but none were specified, use all numeric
            if not columns:
                columns = numeric_columns
            # If one was specified, find its highest correlations
            elif len(columns) == 1:
                target_col = columns[0]
                if target_col not in numeric_columns:
                    return f"Column '{target_col}' is not numeric and cannot be used for correlation analysis."

                # Get correlations with target column
                corr_matrix = self.data[numeric_columns].corr()
                target_corr = corr_matrix[target_col].sort_values(ascending=False)

                result = f"Correlation analysis for '{target_col}':\n"
                for col, corr_val in target_corr.items():
                    if col != target_col:
                        strength = ""
                        if abs(corr_val) > 0.7:
                            strength = "Strong"
                        elif abs(corr_val) > 0.3:
                            strength = "Moderate"
                        else:
                            strength = "Weak"

                        direction = "positive" if corr_val > 0 else "negative"
                        result += f"{col}: {corr_val:.4f} ({strength} {direction} correlation)\n"

                return result

        try:
            # Calculate correlations between specified columns
            corr_matrix = self.data[columns].corr()

            result = "Correlation Matrix:\n"
            result += corr_matrix.to_string()

            # Find strongest correlations
            corr_pairs = []
            for i in range(len(columns)):
                for j in range(i+1, len(columns)):
                    col1, col2 = columns[i], columns[j]
                    corr_val = corr_matrix.loc[col1, col2]
                    corr_pairs.append((col1, col2, corr_val))

            # Sort by absolute correlation value
            corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

            result += "\n\nStrongest Correlations:\n"
            for col1, col2, corr_val in corr_pairs:
                strength = ""
                if abs(corr_val) > 0.7:
                    strength = "Strong"
                elif abs(corr_val) > 0.3:
                    strength = "Moderate"
                else:
                    strength = "Weak"

                direction = "positive" if corr_val > 0 else "negative"
                result += f"{col1} vs {col2}: {corr_val:.4f} ({strength} {direction} correlation)\n"

            return result

        except Exception as e:
            return f"Error performing correlation analysis: {str(e)}"

    def visualize_data(self, query):
        """Generate visualizations based on data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Extract columns from query
        columns = self._extract_column_names(query)

        # Determine visualization type from query
        viz_type = None
        if "scatter" in query.lower():
            viz_type = "scatter"
        elif "histogram" in query.lower() or "distribution" in query.lower():
            viz_type = "histogram"
        elif "box" in query.lower():
            viz_type = "box"
        elif "bar" in query.lower():
            viz_type = "bar"
        elif "pie" in query.lower():
            viz_type = "pie"
        elif "heatmap" in query.lower() or "correlation" in query.lower():
            viz_type = "heatmap"
        elif "line" in query.lower() or "trend" in query.lower():
            viz_type = "line"
        else:
            # Default to bar chart for one column, scatter for two
            if len(columns) == 1:
                viz_type = "bar"
            elif len(columns) >= 2:
                viz_type = "scatter"
            else:
                return "Please specify columns and visualization type (scatter, histogram, box, bar, pie, heatmap, line)"

        try:
            plt.figure(figsize=(10, 6))

            if viz_type == "scatter" and len(columns) >= 2:
                plt.scatter(self.data[columns[0]], self.data[columns[1]])
                plt.xlabel(columns[0])
                plt.ylabel(columns[1])
                plt.title(f"Scatter Plot: {columns[0]} vs {columns[1]}")

                # Add regression line
                if len(self.data) > 2:  # Need at least 3 points for meaningful regression
                    x = self.data[columns[0]].values.reshape(-1, 1)
                    y = self.data[columns[1]].values
                    model = LinearRegression()
                    model.fit(x, y)
                    plt.plot(x, model.predict(x), color='red', linewidth=2)

                    # Add correlation coefficient
                    corr = self.data[columns].corr().loc[columns[0], columns[1]]
                    plt.annotate(f"r = {corr:.4f}", xy=(0.05, 0.95), xycoords='axes fraction')

            elif viz_type == "histogram" and columns:
                sns.histplot(self.data[columns[0]], kde=True)
                plt.xlabel(columns[0])
                plt.ylabel("Frequency")
                plt.title(f"Histogram of {columns[0]}")

            elif viz_type == "box" and columns:
                if len(columns) == 1:
                    sns.boxplot(y=self.data[columns[0]])
                    plt.ylabel(columns[0])
                else:
                    plt.boxplot([self.data[col].dropna() for col in columns])
                    plt.xticks(range(1, len(columns) + 1), columns, rotation=45)
                plt.title(f"Box Plot of {', '.join(columns)}")

            elif viz_type == "bar" and columns:
                if len(columns) == 1:
                    # For a single column, show value counts
                    value_counts = self.data[columns[0]].value_counts().nlargest(15)
                    value_counts.plot(kind='bar')
                    plt.xlabel(columns[0])
                    plt.ylabel("Count")
                    plt.title(f"Bar Chart of {columns[0]} (Top 15 Categories)")
                else:
                    # For multiple columns, show means
                    self.data[columns].mean().plot(kind='bar')
                    plt.ylabel("Mean Value")
                    plt.title(f"Mean Values of {', '.join(columns)}")

            elif viz_type == "pie" and columns:
                # Only use first column for pie chart
                value_counts = self.data[columns[0]].value_counts().nlargest(10)
                plt.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%')
                plt.title(f"Pie Chart of {columns[0]} (Top 10 Categories)")

            elif viz_type == "heatmap":
                # Use numeric columns for heatmap
                if not columns:
                    columns = self.data.select_dtypes(include=['number']).columns.tolist()

                if len(columns) < 2:
                    return "Need at least 2 numeric columns for heatmap."

                corr_matrix = self.data[columns].corr()
                sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
                plt.title("Correlation Heatmap")

            elif viz_type == "line" and columns:
                # Check if there's a datetime column to use as index
                datetime_cols = [col for col in self.data.columns if pd.api.types.is_datetime64_dtype(self.data[col])]

                if datetime_cols and len(columns) >= 1:
                    time_col = datetime_cols[0]
                    for col in columns:
                        if col != time_col:
                            plt.plot(self.data[time_col], self.data[col], label=col)
                    plt.xlabel(time_col)
                    plt.legend()
                else:
                    # No datetime column, just plot the values
                    for col in columns:
                        plt.plot(self.data[col], label=col)
                    plt.legend()

                plt.title(f"Line Plot of {', '.join(columns)}")

            # Save figure to a temporary file
            temp_file = f"temp_viz_{datetime.now().strftime('%Y%m%d%H%M%S')}.png"
            plt.tight_layout()
            plt.savefig(temp_file)
            plt.close()

            return f"Visualization created and saved as {temp_file}"

        except Exception as e:
            plt.close()  # Close any open figures in case of error
            return f"Error creating visualization: {str(e)}"

    def analyze_trend(self, query):
        """Analyze trends over time or sequence"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Extract columns from query
        columns = self._extract_column_names(query)

        if len(columns) < 1:
            return "Please specify at least one column to analyze for trends."

        try:
            result = "Trend Analysis:\n"

            # Look for a date/time column
            date_columns = []
            for col in self.data.columns:
                if pd.api.types.is_datetime64_dtype(self.data[col]):
                    date_columns.append(col)
                elif any(date_term in col.lower() for date_term in ["date", "time", "year", "month", "day"]):
                    try:
                        # Try to convert to datetime
                        pd.to_datetime(self.data[col])
                        date_columns.append(col)
                    except:
                        pass

            # If we found date columns, use the first one
            if date_columns:
                time_col = date_columns[0]
                result += f"Using {time_col} as the time variable.\n\n"

                # Convert to datetime if not already
                if not pd.api.types.is_datetime64_dtype(self.data[time_col]):
                    self.data[time_col] = pd.to_datetime(self.data[time_col], errors='coerce')

                # Sort by time
                data_sorted = self.data.sort_values(by=time_col)

                for col in columns:
                    if col == time_col:
                        continue

                    if not pd.api.types.is_numeric_dtype(self.data[col]):
                        result += f"Skipping non-numeric column {col}\n"
                        continue

                    # Calculate trend statistics
                    result += f"Trend for {col}:\n"

                    # Calculate overall change
                    first_val = data_sorted[col].iloc[0]
                    last_val = data_sorted[col].iloc[-1]
                    total_change = last_val - first_val
                    pct_change = (total_change / first_val * 100) if first_val != 0 else float('inf')

                    result += f"  Starting value: {first_val}\n"
                    result += f"  Ending value: {last_val}\n"
                    result += f"  Total change: {total_change} ({pct_change:.2f}%)\n"

                    # Perform trend analysis with linear regression
                    x = np.arange(len(data_sorted)).reshape(-1, 1)
                    y = data_sorted[col].values

                    # Handle missing values
                    mask = ~np.isnan(y)
                    x_clean = x[mask]
                    y_clean = y[mask]

                    if len(y_clean) >= 2:  # Need at least 2 points for regression
                        model = LinearRegression()
                        model.fit(x_clean, y_clean)

                        slope = model.coef_[0]
                        avg_val = np.mean(y_clean)
                        result += f"  Trend slope: {slope:.4f} per time unit\n"
                        result += f"  Relative trend: {slope / avg_val * 100:.2f}% of mean per time unit\n"

                        # Determine if trend is significant
                        if abs(slope / avg_val) > 0.01:
                            direction = "increasing" if slope > 0 else "decreasing"
                            strength = "strongly" if abs(slope / avg_val) > 0.05 else "moderately"
                            result += f"  The {col} is {strength} {direction} over time.\n"
                        else:
                            result += f"  The {col} shows little change over time.\n"

                        # R-squared to show fit quality
                        y_pred = model.predict(x_clean)
                        r2 = r2_score(y_clean, y_pred)
                        result += f"  R-squared: {r2:.4f} (higher means more consistent trend)\n"

                        # Calculate periodicity if enough data points
                        if len(y_clean) >= 4:
                            result += self._check_seasonality(y_clean)

                    result += "\n"
            else:
                # No date column found, use sequence order
                result += "No date/time column found. Analyzing trends based on sequence order.\n\n"

                for col in columns:
                    if not pd.api.types.is_numeric_dtype(self.data[col]):
                        result += f"Skipping non-numeric column {col}\n"
                        continue

                    # Get non-missing values
                    values = self.data[col].dropna().values

                    if len(values) < 2:
                        result += f"Not enough non-missing values in {col} for trend analysis.\n"
                        continue

                    # Calculate basic trend
                    result += f"Trend for {col}:\n"

                    # Linear regression for trend
                    x = np.arange(len(values)).reshape(-1, 1)
                    y = values

                    model = LinearRegression()
                    model.fit(x, y)

                    slope = model.coef_[0]
                    avg_val = np.mean(y)
                    result += f"  Trend slope: {slope:.4f} per unit\n"
                    result += f"  Relative trend: {slope / avg_val * 100:.2f}% of mean per unit\n"

                    # Determine trend direction and strength
                    if abs(slope / avg_val) > 0.01:
                        direction = "increasing" if slope > 0 else "decreasing"
                        strength = "strongly" if abs(slope / avg_val) > 0.05 else "moderately"
                        result += f"  The {col} is {strength} {direction} over the sequence.\n"
                    else:
                        result += f"  The {col} shows little change over the sequence.\n"

                    # R-squared
                    y_pred = model.predict(x)
                    r2 = r2_score(y, y_pred)
                    result += f"  R-squared: {r2:.4f}\n"

                    # Check for simple patterns
                    if len(values) >= 4:
                        result += self._check_seasonality(values)

                    result += "\n"

            return result

        except Exception as e:
            return f"Error analyzing trends: {str(e)}"

    def _check_seasonality(self, values):
        """Helper function to check for seasonality in a time series"""
        result = ""

        # Compute autocorrelation
        acf = []
        mean = np.mean(values)
        variance = np.var(values)

        if variance == 0:  # All values are the same
            return "  No seasonality detected (constant values).\n"

        # Compute autocorrelation up to 1/3 of series length
        max_lag = min(len(values) // 3, 20)  # Max 20 lags

        for lag in range(1, max_lag + 1):
            numerator = 0
            for i in range(len(values) - lag):
                numerator += (values[i] - mean) * (values[i + lag] - mean)
            acf.append(numerator / (len(values) - lag) / variance)

        # Find potential seasonality by looking for peaks in autocorrelation
        peaks = []
        for i in range(1, len(acf) - 1):
            if acf[i] > acf[i-1] and acf[i] > acf[i+1] and acf[i] > 0.2:
                peaks.append((i+1, acf[i]))

        if peaks:
            # Sort by correlation strength
            peaks.sort(key=lambda x: x[1], reverse=True)
            result += "  Potential seasonality detected with periods: "
            result += ", ".join([f"{p[0]} (r={p[1]:.2f})" for p in peaks[:3]])
            result += "\n"
        else:
            result += "  No clear seasonality detected.\n"

        return result

    def detect_outliers(self, query):
        """Detect outliers in the data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Extract columns from query
        columns = self._extract_column_names(query)

        # If no columns specified, use all numeric columns
        if not columns:
            columns = self.data.select_dtypes(include=['number']).columns.tolist()
            if not columns:
                return "No numeric columns found for outlier detection."

        try:
            result = "Outlier Detection Results:\n"

            for col in columns:
                if not pd.api.types.is_numeric_dtype(self.data[col]):
                    result += f"Skipping non-numeric column: {col}\n"
                    continue

                # Drop missing values
                col_data = self.data[col].dropna()

                if len(col_data) < 5:
                    result += f"Not enough data in {col} for outlier detection.\n"
                    continue

                result += f"\nColumn: {col}\n"

                # Method 1: IQR method
                Q1 = col_data.quantile(0.25)
                Q3 = col_data.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers_iqr = col_data[(col_data < lower_bound) | (col_data > upper_bound)]

                result += f"  IQR Method: {len(outliers_iqr)} outliers found\n"
                result += f"  Lower bound: {lower_bound:.4f}, Upper bound: {upper_bound:.4f}\n"

                if len(outliers_iqr) > 0:
                    result += f"  Outlier range: {outliers_iqr.min():.4f} to {outliers_iqr.max():.4f}\n"
                    if len(outliers_iqr) <= 10:
                        result += f"  Outlier values: {', '.join(map(str, outliers_iqr.tolist()))}\n"
                    else:
                        result += f"  First 5 outliers: {', '.join(map(str, outliers_iqr.iloc[:5].tolist()))}\n"

                # Method 2: Z-score method
                z_scores = stats.zscore(col_data)
                outliers_zscore = col_data[abs(z_scores) > 3]

                result += f"  Z-score Method (|z| > 3): {len(outliers_zscore)} outliers found\n"

                if len(outliers_zscore) > 0:
                    result += f"  Outlier range: {outliers_zscore.min():.4f} to {outliers_zscore.max():.4f}\n"
                    if len(outliers_zscore) <= 10:
                        result += f"  Outlier values: {', '.join(map(str, outliers_zscore.tolist()))}\n"
                    else:
                        result += f"  First 5 outliers: {', '.join(map(str, outliers_zscore.iloc[:5].tolist()))}\n"

                # Compare methods
                common_outliers = set(outliers_iqr.index).intersection(set(outliers_zscore.index))
                result += f"  {len(common_outliers)} outliers detected by both methods\n"

                # Impact of outliers
                mean_with_outliers = col_data.mean()
                mean_without_outliers = col_data[~col_data.index.isin(outliers_iqr.index)].mean()

                impact = abs((mean_without_outliers - mean_with_outliers) / mean_with_outliers * 100)
                result += f"  Impact on mean: {impact:.2f}% change if IQR outliers removed\n"

            return result

        except Exception as e:
            return f"Error detecting outliers: {str(e)}"

    def predictive_analysis(self, query):
        """Perform simple predictive analysis"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Extract target and features from query
        columns = self._extract_column_names(query)

        if len(columns) < 2:
            return "Please specify at least two columns: one target and one or more features."

        # Last column is target, rest are features
        target_col = columns[-1]
        feature_cols = columns[:-1]

        try:
            # Check if columns are numeric
            for col in columns:
                if not pd.api.types.is_numeric_dtype(self.data[col]):
                    return f"Column '{col}' is not numeric. Simple predictive analysis requires numeric data."

            # Prepare data
            X = self.data[feature_cols].dropna()
            y = self.data.loc[X.index, target_col]

            if len(X) < 10:
                return "Not enough complete data rows for predictive analysis (need at least 10)."

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

            # Fit model
            model = LinearRegression()
            model.fit(X_train, y_train)

            # Make predictions
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            # Calculate metrics
            train_mse = mean_squared_error(y_train, y_train_pred)
            test_mse = mean_squared_error(y_test, y_test_pred)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2 = r2_score(y_test, y_test_pred)

            # Prepare results
            result = f"Predictive Analysis: Predicting '{target_col}' using {', '.join(feature_cols)}\n\n"

            result += "Model Information:\n"
            result += f"  Linear Regression with {len(feature_cols)} feature(s)\n"
            result += f"  Training data: {len(X_train)} rows\n"
            result += f"  Testing data: {len(X_test)} rows\n\n"

            result += "Feature Importance:\n"
            for i, feature in enumerate(feature_cols):
                result += f"  {feature}: coefficient = {model.coef_[i]:.4f}\n"
            result += f"  Intercept: {model.intercept_:.4f}\n\n"

            result += "Model Equation:\n"
            equation = f"{target_col} = {model.intercept_:.4f}"
            for i, feature in enumerate(feature_cols):
                coef = model.coef_[i]
                sign = "+" if coef >= 0 else ""
                equation += f" {sign} {coef:.4f} × {feature}"
            result += f"  {equation}\n\n"

            result += "Model Performance:\n"
            result += "  Training set:\n"
            result += f"    Mean Squared Error: {train_mse:.4f}\n"
            result += f"    R² Score: {train_r2:.4f}\n\n"
            result += "  Test set:\n"
            result += f"    Mean Squared Error: {test_mse:.4f}\n"
            result += f"    R² Score: {test_r2:.4f}\n\n"

            # Interpret the results
            result += "Interpretation:\n"

            # Interpret R² score
            if test_r2 >= 0.7:
                result += "  The model explains a high proportion of the variance in the target variable.\n"
            elif test_r2 >= 0.4:
                result += "  The model explains a moderate proportion of the variance in the target variable.\n"
            else:
                result += "  The model explains only a small proportion of the variance in the target variable.\n"

            # Check for overfitting
            if train_r2 - test_r2 > 0.2:
                result += "  The model shows signs of overfitting (performs much better on training than test data).\n"

            # Feature importance interpretation
            most_important_feature = feature_cols[abs(model.coef_).argmax()]
            result += f"  The most influential feature is '{most_important_feature}'.\n"

            # Sample prediction (keep feature names by passing a one-row DataFrame)
            row_sample = X_test.iloc[0]
            prediction = model.predict(row_sample.to_frame().T)[0]

            result += "\nSample Prediction:\n"
            result += "  For the values:\n"
            for feature in feature_cols:
                result += f"    {feature} = {row_sample[feature]}\n"
            result += f"  Predicted {target_col} = {prediction:.4f}\n"

            return result

        except Exception as e:
            return f"Error performing predictive analysis: {str(e)}"

    def hypothesis_testing(self, query):
        """Perform hypothesis testing on the data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        # Extract columns from query
        columns = self._extract_column_names(query)

        if len(columns) == 0:
            return "Please specify at least one column for hypothesis testing."

        try:
            result = "Hypothesis Testing Results:\n\n"

            # Single column analysis (distribution tests)
            if len(columns) == 1:
                col = columns[0]

                if not pd.api.types.is_numeric_dtype(self.data[col]):
                    return f"Column '{col}' is not numeric. Basic hypothesis testing requires numeric data."

                data = self.data[col].dropna()

                # Normality test
                stat, p_value = stats.shapiro(data) if len(data) < 5000 else stats.normaltest(data)
                test_name = "Shapiro-Wilk" if len(data) < 5000 else "D'Agostino's K²"

                result += f"Normality Test for '{col}':\n"
                result += f"  Test used: {test_name}\n"
                result += f"  Statistic: {stat:.4f}\n"
                result += f"  p-value: {p_value:.4f}\n"
                result += f"  Interpretation: The data is {'not ' if p_value < 0.05 else ''}normally distributed (95% confidence).\n\n"

                # Basic statistics
                mean = data.mean()
                median = data.median()
                std_dev = data.std()

                # One-sample t-test (against 0 or population mean)
                population_mean = 0  # Default null hypothesis mean
                t_stat, p_value = stats.ttest_1samp(data, population_mean)

                result += f"One-sample t-test for '{col}':\n"
                result += f"  Null Hypothesis: The mean of '{col}' is equal to {population_mean}\n"
                result += f"  Alternative Hypothesis: The mean of '{col}' is not equal to {population_mean}\n"
                result += f"  t-statistic: {t_stat:.4f}\n"
                result += f"  p-value: {p_value:.4f}\n"
                result += f"  Sample Mean: {mean:.4f}\n"
                result += f"  Interpretation: {'Reject' if p_value < 0.05 else 'Fail to reject'} the null hypothesis (95% confidence).\n"
                result += f"  In other words: The mean is {'statistically different from' if p_value < 0.05 else 'not statistically different from'} {population_mean}.\n"

            # Two-column analysis
            elif len(columns) == 2:
                col1, col2 = columns
                col1_numeric = pd.api.types.is_numeric_dtype(self.data[col1])
                col2_numeric = pd.api.types.is_numeric_dtype(self.data[col2])

                # Both numeric: compare means with a t-test
                if col1_numeric and col2_numeric:
                    data1 = self.data[col1].dropna()
                    data2 = self.data[col2].dropna()

                    # Check if the columns are independent or paired
                    are_paired = len(data1) == len(data2) and (self.data[columns].count().min() / self.data[columns].count().max() > 0.9)

                    result += f"Two-sample {'Paired' if are_paired else 'Independent'} t-test:\n"
                    result += f"  Comparing '{col1}' and '{col2}'\n"
                    result += "  Null Hypothesis: The means of the two columns are equal\n"
                    result += "  Alternative Hypothesis: The means of the two columns are not equal\n\n"

                    if are_paired:
                        # Use paired t-test for related samples
                        # Make sure we have pairs of non-NaN values
                        valid_rows = self.data[columns].dropna()
                        t_stat, p_value = stats.ttest_rel(valid_rows[col1], valid_rows[col2])
                    else:
                        # Use independent t-test
                        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=False)  # Use Welch's t-test

                    result += f"  t-statistic: {t_stat:.4f}\n"
                    result += f"  p-value: {p_value:.4f}\n"
                    result += f"  Mean of '{col1}': {data1.mean():.4f}\n"
                    result += f"  Mean of '{col2}': {data2.mean():.4f}\n"
                    result += f"  Difference in means: {data1.mean() - data2.mean():.4f}\n"
                    result += f"  Interpretation: {'Reject' if p_value < 0.05 else 'Fail to reject'} the null hypothesis (95% confidence).\n"
                    result += f"  In other words: The means are {'statistically different' if p_value < 0.05 else 'not statistically different'} from each other.\n"

                # One numeric, one categorical: one-way ANOVA across groups
                elif col1_numeric or col2_numeric:
                    if col1_numeric:
                        numeric_col, cat_col = col1, col2
                    else:
                        numeric_col, cat_col = col2, col1

                    # Perform one-way ANOVA
                    groups = []
                    labels = []

                    for category, group in self.data.groupby(cat_col):
                        if len(group[numeric_col].dropna()) > 0:
                            groups.append(group[numeric_col].dropna())
                            labels.append(str(category))

                    if len(groups) < 2:
                        return "Not enough groups with data for ANOVA."

                    f_stat, p_value = stats.f_oneway(*groups)

                    result += "One-way ANOVA:\n"
                    result += f"  Comparing '{numeric_col}' across groups of '{cat_col}'\n"
                    result += f"  Null Hypothesis: The means of '{numeric_col}' are equal across all groups\n"
                    result += "  Alternative Hypothesis: At least one group has a different mean\n\n"
                    result += f"  F-statistic: {f_stat:.4f}\n"
                    result += f"  p-value: {p_value:.4f}\n"
                    result += "  Group means:\n"

                    for i, (label, group) in enumerate(zip(labels, groups)):
                        result += f"    {label}: {group.mean():.4f} (n={len(group)})\n"

                    result += f"  Interpretation: {'Reject' if p_value < 0.05 else 'Fail to reject'} the null hypothesis (95% confidence).\n"
                    result += f"  In other words: There {'is' if p_value < 0.05 else 'is no'} statistically significant difference between groups.\n"

                else:
                    return "For a two-column test, at least one column must be numeric (t-test for two numeric columns, ANOVA for numeric vs. categorical)."

            # Multiple column comparison
            else:
                result += "Correlation Analysis:\n"
                numeric_cols = [col for col in columns if pd.api.types.is_numeric_dtype(self.data[col])]

                if len(numeric_cols) < 2:
                    return "Need at least two numeric columns for correlation analysis."

                corr_matrix = self.data[numeric_cols].corr()

                result += "  Pearson Correlation Matrix:\n"
                result += f"{corr_matrix.to_string()}\n\n"

                result += "  Significance Tests (p-values):\n"
                p_matrix = pd.DataFrame(index=corr_matrix.index, columns=corr_matrix.columns)

                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        col_i, col_j = numeric_cols[i], numeric_cols[j]
                        valid_data = self.data[[col_i, col_j]].dropna()
                        _, p_value = stats.pearsonr(valid_data[col_i], valid_data[col_j])
                        p_matrix.loc[col_i, col_j] = p_value
                        p_matrix.loc[col_j, col_i] = p_value

                result += f"{p_matrix.to_string()}\n\n"

                result += "  Significant Correlations (p < 0.05):\n"
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        col_i, col_j = numeric_cols[i], numeric_cols[j]
                        if p_matrix.loc[col_i, col_j] < 0.05:
                            corr_val = corr_matrix.loc[col_i, col_j]
                            p_val = p_matrix.loc[col_i, col_j]
                            result += f"    {col_i} vs {col_j}: r={corr_val:.4f}, p={p_val:.4f}\n"

            return result

        except Exception as e:
            return f"Error performing hypothesis testing: {str(e)}"

    def generate_report(self, query):
        """Generate a comprehensive report on the data"""
        if self.data is None:
            return "No data loaded. Please load data first."

        try:
            report = "# Data Analysis Report\n\n"

            # 1. Dataset Overview
            report += "## 1. Dataset Overview\n\n"
            report += f"**Data Source:** {self.data_source}\n"
            report += f"**Number of Rows:** {len(self.data)}\n"
            report += f"**Number of Columns:** {len(self.data.columns)}\n\n"

            # Column types summary
            dtype_counts = {}
            for dtype in self.data.dtypes:
                dtype_name = str(dtype)
                if dtype_name in dtype_counts:
                    dtype_counts[dtype_name] += 1
                else:
                    dtype_counts[dtype_name] = 1

            report += "**Column Data Types:**\n"
            for dtype, count in dtype_counts.items():
                report += f"- {dtype}: {count} columns\n"
            report += "\n"

            # 2. Data Quality Assessment
            report += "## 2. Data Quality Assessment\n\n"

            # Missing values
            missing_values = self.data.isnull().sum()
            missing_percentage = (missing_values / len(self.data) * 100).round(2)

            missing_cols = missing_values[missing_values > 0]
            if len(missing_cols) > 0:
                report += "**Missing Values:**\n"
                for col, count in missing_cols.items():
                    report += f"- {col}: {count} missing values ({missing_percentage[col]}%)\n"
            else:
                report += "**Missing Values:** None\n"

            report += "\n"

            # 3. Descriptive Statistics
            report += "## 3. Descriptive Statistics\n\n"

            # Numeric columns
            numeric_cols = self.data.select_dtypes(include=['number']).columns.tolist()
            if numeric_cols:
                report += "**Numeric Columns:**\n"
                report += "```\n"
                report += self.data[numeric_cols].describe().to_string()
                report += "\n```\n\n"

            # Categorical columns
            cat_cols = self.data.select_dtypes(exclude=['number']).columns.tolist()
            if cat_cols:
                report += "**Categorical Columns:**\n"
                for col in cat_cols[:5]:  # Limit to first 5 for brevity
                    value_counts = self.data[col].value_counts().head(5)
                    report += f"Top values for '{col}':\n"
                    report += "```\n"
                    report += value_counts.to_string()
                    report += "\n```\n"
                    report += f"Unique values: {self.data[col].nunique()}\n\n"

                if len(cat_cols) > 5:
                    report += f"(Analysis limited to first 5 out of {len(cat_cols)} categorical columns)\n\n"

            # 4. Correlation Analysis
            report += "## 4. Correlation Analysis\n\n"

            # Initialize here so the insights section below can reference it safely
            corr_pairs = []

            if len(numeric_cols) >= 2:
                corr_matrix = self.data[numeric_cols].corr()

                report += "**Correlation Matrix:**\n"
                report += "```\n"
                report += corr_matrix.round(2).to_string()
                report += "\n```\n\n"

                # Strongest correlations
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        col1, col2 = numeric_cols[i], numeric_cols[j]
                        corr_val = corr_matrix.loc[col1, col2]
                        if abs(corr_val) > 0.5:  # Only report moderate to strong correlations
                            corr_pairs.append((col1, col2, corr_val))

                if corr_pairs:
                    # Sort by absolute correlation value
                    corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

                    report += "**Strongest Correlations:**\n"
                    for col1, col2, corr_val in corr_pairs[:10]:  # Top 10
                        direction = "positive" if corr_val > 0 else "negative"
                        report += f"- {col1} vs {col2}: {corr_val:.4f} ({direction})\n"
                    report += "\n"
                else:
                    report += "No moderate or strong correlations (|r| > 0.5) found between variables.\n\n"
            else:
                report += "Insufficient numeric columns for correlation analysis.\n\n"

            # 5. Key Insights
            report += "## 5. Key Insights\n\n"

            insights = []

            # Data quality insights
            total_missing = missing_values.sum()
            if total_missing > 0:
                total_cells = self.data.size
                overall_percentage = round(total_missing / total_cells * 100, 2)
                if overall_percentage > 10:
                    insights.append(f"The dataset has a high proportion of missing values ({overall_percentage}% overall), which may require imputation or handling.")

            # Distribution insights for numeric columns
            for col in numeric_cols[:5]:  # Limit to first 5 for brevity
                col_data = self.data[col].dropna()

                if len(col_data) == 0:
                    continue

                mean = col_data.mean()
                median = col_data.median()
                skew = col_data.skew()

                # Check for skewed distributions
                if abs(skew) > 1:
                    skew_direction = "positively" if skew > 0 else "negatively"
                    insights.append(f"'{col}' is {skew_direction} skewed (skew={skew:.2f}), with mean={mean:.2f} and median={median:.2f}.")

                # Check for outliers
                Q1 = col_data.quantile(0.25)
                Q3 = col_data.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
                outlier_percentage = round(len(outliers) / len(col_data) * 100, 2)

                if outlier_percentage > 5:
                    insights.append(f"'{col}' has a high proportion of outliers ({outlier_percentage}% of values).")

            # Correlation insights
            if len(corr_pairs) > 0:
                top_corr = corr_pairs[0]
                direction = "positively" if top_corr[2] > 0 else "negatively"
                insights.append(f"The strongest relationship is between '{top_corr[0]}' and '{top_corr[1]}' (r={top_corr[2]:.2f}), which are {direction} correlated.")

            # Report insights
            if insights:
                for i, insight in enumerate(insights, 1):
                    report += f"{i}. {insight}\n"
            else:
                report += "No significant insights detected based on initial analysis.\n"

            report += "\n"

            # 6. Next Steps
            report += "## 6. Recommendations for Further Analysis\n\n"
            recommendations = [
                "Conduct more detailed analysis on columns with high missing value rates.",
                "For skewed numeric distributions, consider transformations (e.g., log, sqrt) before analysis.",
                "Investigate outliers to determine if they represent valid data points or errors.",
                "For strongly correlated variables, explore causality or consider dimensionality reduction.",
                "Consider predictive modeling using the identified relationships."
            ]

            for i, rec in enumerate(recommendations, 1):
                report += f"{i}. {rec}\n"

            # Save the report to a file
            report_filename = f"data_analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
            with open(report_filename, "w") as f:
                f.write(report)

            return f"Report generated and saved as {report_filename}"

        except Exception as e:
            return f"Error generating report: {str(e)}"

    def get_help(self, query):
        """Display help information about available commands"""
        help_text = "Available Commands:\n\n"

        help_text += "DATA LOADING AND INSPECTION\n"
        help_text += "  load csv <path> - Load data from a CSV file\n"
        help_text += "  load excel <path> - Load data from an Excel file\n"
        help_text += "  load json <path> - Load data from a JSON file\n"
        help_text += "  load sql <db_path> query <sql> - Load data from a SQL database\n"
        help_text += "  info - Get basic information about the loaded data\n"
        help_text += "  describe [column1 column2...] - Get descriptive statistics\n"
        help_text += "  missing - Check for missing values in the data\n"
        help_text += "\n"

        help_text += "DATA ANALYSIS\n"
        help_text += "  analyze <column> - Analyze a specific column\n"
        help_text += "  correlate [column1 column2...] - Analyze correlations between columns\n"
        help_text += "  trend <column1 column2...> - Analyze trends over time or sequence\n"
        help_text += "  outliers [column1 column2...] - Detect outliers in the data\n"
        help_text += "  test <column1> [column2] - Perform hypothesis testing\n"
        help_text += "\n"

        help_text += "VISUALIZATION AND REPORTING\n"
        help_text += "  visualize <type> <column1 column2...> - Generate visualizations\n"
        help_text += "    Visualization types: scatter, histogram, box, bar, pie, heatmap, line\n"
        help_text += "  report - Generate a comprehensive report on the data\n"
        help_text += "\n"

        help_text += "EXAMPLES:\n"
        help_text += "  load csv data.csv\n"
        help_text += "  analyze temperature\n"
        help_text += "  correlate temperature humidity pressure\n"
        help_text += "  visualize scatter temperature humidity\n"
        help_text += "  trend sales date\n"

        return help_text


# Example usage
if __name__ == "__main__":
    print("Data Analysis Chatbot initialized. Type 'help' for available commands.")
    chatbot = DataAnalysisChatbot()

    while True:
        user_input = input("\nEnter your query (or 'exit' to quit): ")

        if user_input.lower() in ['exit', 'quit']:
            print("Exiting chatbot. Goodbye!")
            break

        response = chatbot.process_query(user_input)
        print("\nResponse:", response)