baconnier committed on
Commit
b2f41cc
·
verified ·
1 Parent(s): 8e3aecb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -30
app.py CHANGED
@@ -22,6 +22,7 @@ class DataAnalyzer:
22
  if df is None:
23
  return "Please upload a dataset first"
24
 
 
25
  report = sv.analyze(df)
26
  report_path = os.path.join(self.temp_dir, "report.html")
27
  report.show_html(report_path, open_browser=False)
@@ -43,7 +44,7 @@ class DataAnalyzer:
43
 
44
  os.remove(report_path)
45
  return html_with_table
46
-
47
  def generate_autoviz_report(self, df):
48
  if df is None:
49
  return "Please upload a dataset first"
@@ -52,34 +53,69 @@ class DataAnalyzer:
52
  if os.path.exists(viz_temp_dir):
53
  shutil.rmtree(viz_temp_dir)
54
  os.makedirs(viz_temp_dir)
55
-
56
  try:
57
- # Sample data if it's too large
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  if len(df) > 5000:
59
  df = df.sample(n=5000, random_state=42)
60
 
61
- plt.close('all') # Close any existing plots
 
 
 
 
62
 
63
- # Configure AutoViz with correct parameters
 
 
64
  dfte = self.AV.AutoViz(
65
  filename='',
66
  sep=',',
67
- depVar='', # No target variable
68
- dfte=df, # Pass the dataframe directly
69
  header=0,
70
- verbose=1, # Set to 1 to see progress
71
  lowess=False,
72
- chart_format='html',
73
  max_rows_analyzed=5000,
74
  max_cols_analyzed=30,
75
  save_plot_dir=viz_temp_dir
76
  )
77
-
78
- # Collect and combine HTML files
79
  html_parts = []
80
  if os.path.exists(viz_temp_dir):
81
  for file in sorted(os.listdir(viz_temp_dir)):
82
- if file.endswith('.html'):
83
  file_path = os.path.join(viz_temp_dir, file)
84
  try:
85
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -88,45 +124,57 @@ class DataAnalyzer:
88
  html_parts.append(content)
89
  except Exception as e:
90
  print(f"Error reading file {file}: {str(e)}")
91
-
92
  if not html_parts:
93
- return """
94
  <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
 
 
 
 
 
 
95
  <h3>No visualizations were generated</h3>
96
  <p>This might be due to:</p>
97
  <ul>
98
- <li>Data format issues</li>
99
- <li>Too few unique values in columns</li>
100
- <li>All categorical data with high cardinality</li>
101
  </ul>
102
- <p>Try with a different dataset or check your data formatting.</p>
103
  </div>
104
  """
105
-
106
- # Combine all HTML content with proper styling
107
  combined_html = f"""
108
  <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
109
  <h2 style="text-align: center;">AutoViz Analysis Report</h2>
110
- <p style="text-align: center;">Analysis of {len(df)} rows and {len(df.columns)} columns</p>
 
 
 
 
 
 
111
  <hr>
112
  {'<hr>'.join(html_parts)}
113
  </div>
114
  """
115
 
116
  return combined_html
117
-
118
  except Exception as e:
 
119
  error_message = f"""
120
  <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
121
  <h3>Error in AutoViz Analysis</h3>
122
  <p>Error details: {str(e)}</p>
123
- <p>Troubleshooting steps:</p>
124
- <ul>
125
- <li>Check if your data contains valid numerical or categorical values</li>
126
- <li>Ensure there are no completely empty columns</li>
127
- <li>Try with a smaller dataset</li>
128
- <li>Check for any special characters in column names</li>
129
- </ul>
 
130
  </div>
131
  """
132
  return error_message
@@ -134,7 +182,6 @@ class DataAnalyzer:
134
  if os.path.exists(viz_temp_dir):
135
  shutil.rmtree(viz_temp_dir)
136
 
137
-
138
  def create_interface():
139
  analyzer = DataAnalyzer()
140
 
 
22
  if df is None:
23
  return "Please upload a dataset first"
24
 
25
+ self.df = df
26
  report = sv.analyze(df)
27
  report_path = os.path.join(self.temp_dir, "report.html")
28
  report.show_html(report_path, open_browser=False)
 
44
 
45
  os.remove(report_path)
46
  return html_with_table
47
+
48
  def generate_autoviz_report(self, df):
49
  if df is None:
50
  return "Please upload a dataset first"
 
53
  if os.path.exists(viz_temp_dir):
54
  shutil.rmtree(viz_temp_dir)
55
  os.makedirs(viz_temp_dir)
56
+
57
  try:
58
+ # Data preprocessing
59
+ df = df.copy()
60
+
61
+ # Handle datetime columns
62
+ for col in df.columns:
63
+ try:
64
+ df[col] = pd.to_datetime(df[col], errors='ignore')
65
+ except:
66
+ pass
67
+
68
+ datetime_columns = df.select_dtypes(include=['datetime64']).columns
69
+ for col in datetime_columns:
70
+ df[f'{col}_year'] = df[col].dt.year
71
+ df[f'{col}_month'] = df[col].dt.month
72
+ df = df.drop(columns=[col])
73
+
74
+ # Try to convert string columns to numeric where possible
75
+ for col in df.select_dtypes(include=['object']).columns:
76
+ try:
77
+ df[col] = pd.to_numeric(df[col], errors='ignore')
78
+ except:
79
+ pass
80
+
81
+ # Convert remaining string columns to categorical if cardinality is low
82
+ object_columns = df.select_dtypes(include=['object']).columns
83
+ for col in object_columns:
84
+ if df[col].nunique() < 50:
85
+ df[col] = df[col].astype('category')
86
+
87
+ # Sample data if needed
88
  if len(df) > 5000:
89
  df = df.sample(n=5000, random_state=42)
90
 
91
+ # Print data info for debugging
92
+ print("\nDataset Info:")
93
+ print(df.info())
94
+ print("\nColumn Types:")
95
+ print(df.dtypes)
96
 
97
+ plt.close('all')
98
+
99
+ # Run AutoViz
100
  dfte = self.AV.AutoViz(
101
  filename='',
102
  sep=',',
103
+ depVar='',
104
+ dfte=df,
105
  header=0,
106
+ verbose=1,
107
  lowess=False,
108
+ chart_format='svg',
109
  max_rows_analyzed=5000,
110
  max_cols_analyzed=30,
111
  save_plot_dir=viz_temp_dir
112
  )
113
+
114
+ # Collect visualizations
115
  html_parts = []
116
  if os.path.exists(viz_temp_dir):
117
  for file in sorted(os.listdir(viz_temp_dir)):
118
+ if file.endswith('.html') or file.endswith('.svg'):
119
  file_path = os.path.join(viz_temp_dir, file)
120
  try:
121
  with open(file_path, 'r', encoding='utf-8') as f:
 
124
  html_parts.append(content)
125
  except Exception as e:
126
  print(f"Error reading file {file}: {str(e)}")
127
+
128
  if not html_parts:
129
+ return f"""
130
  <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
131
+ <h3>Data Summary</h3>
132
+ <p>Total Rows: {len(df)}</p>
133
+ <p>Total Columns: {len(df.columns)}</p>
134
+ <p>Column Types:</p>
135
+ <pre>{df.dtypes.to_string()}</pre>
136
+ <hr>
137
  <h3>No visualizations were generated</h3>
138
  <p>This might be due to:</p>
139
  <ul>
140
+ <li>All columns being categorical with high cardinality</li>
141
+ <li>No numeric columns for analysis</li>
142
+ <li>Data format not suitable for visualization</li>
143
  </ul>
 
144
  </div>
145
  """
146
+
 
147
  combined_html = f"""
148
  <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
149
  <h2 style="text-align: center;">AutoViz Analysis Report</h2>
150
+ <div style="margin: 20px;">
151
+ <h3>Dataset Summary</h3>
152
+ <p>Rows analyzed: {len(df)}</p>
153
+ <p>Columns: {len(df.columns)}</p>
154
+ <p>Column Types:</p>
155
+ <pre>{df.dtypes.to_string()}</pre>
156
+ </div>
157
  <hr>
158
  {'<hr>'.join(html_parts)}
159
  </div>
160
  """
161
 
162
  return combined_html
163
+
164
  except Exception as e:
165
+ import traceback
166
  error_message = f"""
167
  <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
168
  <h3>Error in AutoViz Analysis</h3>
169
  <p>Error details: {str(e)}</p>
170
+ <p>Stack trace:</p>
171
+ <pre>{traceback.format_exc()}</pre>
172
+ <p>Dataset Info:</p>
173
+ <pre>
174
+ Rows: {len(df)}
175
+ Columns: {len(df.columns)}
176
+ Types:\n{df.dtypes.to_string()}
177
+ </pre>
178
  </div>
179
  """
180
  return error_message
 
182
  if os.path.exists(viz_temp_dir):
183
  shutil.rmtree(viz_temp_dir)
184
 
 
185
  def create_interface():
186
  analyzer = DataAnalyzer()
187