Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -22,6 +22,7 @@ class DataAnalyzer:
|
|
22 |
if df is None:
|
23 |
return "Please upload a dataset first"
|
24 |
|
|
|
25 |
report = sv.analyze(df)
|
26 |
report_path = os.path.join(self.temp_dir, "report.html")
|
27 |
report.show_html(report_path, open_browser=False)
|
@@ -43,7 +44,7 @@ class DataAnalyzer:
|
|
43 |
|
44 |
os.remove(report_path)
|
45 |
return html_with_table
|
46 |
-
|
47 |
def generate_autoviz_report(self, df):
|
48 |
if df is None:
|
49 |
return "Please upload a dataset first"
|
@@ -52,34 +53,69 @@ class DataAnalyzer:
|
|
52 |
if os.path.exists(viz_temp_dir):
|
53 |
shutil.rmtree(viz_temp_dir)
|
54 |
os.makedirs(viz_temp_dir)
|
55 |
-
|
56 |
try:
|
57 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
if len(df) > 5000:
|
59 |
df = df.sample(n=5000, random_state=42)
|
60 |
|
61 |
-
|
|
|
|
|
|
|
|
|
62 |
|
63 |
-
|
|
|
|
|
64 |
dfte = self.AV.AutoViz(
|
65 |
filename='',
|
66 |
sep=',',
|
67 |
-
depVar='',
|
68 |
-
dfte=df,
|
69 |
header=0,
|
70 |
-
verbose=1,
|
71 |
lowess=False,
|
72 |
-
chart_format='
|
73 |
max_rows_analyzed=5000,
|
74 |
max_cols_analyzed=30,
|
75 |
save_plot_dir=viz_temp_dir
|
76 |
)
|
77 |
-
|
78 |
-
# Collect
|
79 |
html_parts = []
|
80 |
if os.path.exists(viz_temp_dir):
|
81 |
for file in sorted(os.listdir(viz_temp_dir)):
|
82 |
-
if file.endswith('.html'):
|
83 |
file_path = os.path.join(viz_temp_dir, file)
|
84 |
try:
|
85 |
with open(file_path, 'r', encoding='utf-8') as f:
|
@@ -88,45 +124,57 @@ class DataAnalyzer:
|
|
88 |
html_parts.append(content)
|
89 |
except Exception as e:
|
90 |
print(f"Error reading file {file}: {str(e)}")
|
91 |
-
|
92 |
if not html_parts:
|
93 |
-
return """
|
94 |
<div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
<h3>No visualizations were generated</h3>
|
96 |
<p>This might be due to:</p>
|
97 |
<ul>
|
98 |
-
<li>
|
99 |
-
<li>
|
100 |
-
<li>
|
101 |
</ul>
|
102 |
-
<p>Try with a different dataset or check your data formatting.</p>
|
103 |
</div>
|
104 |
"""
|
105 |
-
|
106 |
-
# Combine all HTML content with proper styling
|
107 |
combined_html = f"""
|
108 |
<div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
|
109 |
<h2 style="text-align: center;">AutoViz Analysis Report</h2>
|
110 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
<hr>
|
112 |
{'<hr>'.join(html_parts)}
|
113 |
</div>
|
114 |
"""
|
115 |
|
116 |
return combined_html
|
117 |
-
|
118 |
except Exception as e:
|
|
|
119 |
error_message = f"""
|
120 |
<div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
|
121 |
<h3>Error in AutoViz Analysis</h3>
|
122 |
<p>Error details: {str(e)}</p>
|
123 |
-
<p>
|
124 |
-
<
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
|
|
130 |
</div>
|
131 |
"""
|
132 |
return error_message
|
@@ -134,7 +182,6 @@ class DataAnalyzer:
|
|
134 |
if os.path.exists(viz_temp_dir):
|
135 |
shutil.rmtree(viz_temp_dir)
|
136 |
|
137 |
-
|
138 |
def create_interface():
|
139 |
analyzer = DataAnalyzer()
|
140 |
|
|
|
22 |
if df is None:
|
23 |
return "Please upload a dataset first"
|
24 |
|
25 |
+
self.df = df
|
26 |
report = sv.analyze(df)
|
27 |
report_path = os.path.join(self.temp_dir, "report.html")
|
28 |
report.show_html(report_path, open_browser=False)
|
|
|
44 |
|
45 |
os.remove(report_path)
|
46 |
return html_with_table
|
47 |
+
|
48 |
def generate_autoviz_report(self, df):
|
49 |
if df is None:
|
50 |
return "Please upload a dataset first"
|
|
|
53 |
if os.path.exists(viz_temp_dir):
|
54 |
shutil.rmtree(viz_temp_dir)
|
55 |
os.makedirs(viz_temp_dir)
|
56 |
+
|
57 |
try:
|
58 |
+
# Data preprocessing
|
59 |
+
df = df.copy()
|
60 |
+
|
61 |
+
# Handle datetime columns
|
62 |
+
for col in df.columns:
|
63 |
+
try:
|
64 |
+
df[col] = pd.to_datetime(df[col], errors='ignore')
|
65 |
+
except:
|
66 |
+
pass
|
67 |
+
|
68 |
+
datetime_columns = df.select_dtypes(include=['datetime64']).columns
|
69 |
+
for col in datetime_columns:
|
70 |
+
df[f'{col}_year'] = df[col].dt.year
|
71 |
+
df[f'{col}_month'] = df[col].dt.month
|
72 |
+
df = df.drop(columns=[col])
|
73 |
+
|
74 |
+
# Try to convert string columns to numeric where possible
|
75 |
+
for col in df.select_dtypes(include=['object']).columns:
|
76 |
+
try:
|
77 |
+
df[col] = pd.to_numeric(df[col], errors='ignore')
|
78 |
+
except:
|
79 |
+
pass
|
80 |
+
|
81 |
+
# Convert remaining string columns to categorical if cardinality is low
|
82 |
+
object_columns = df.select_dtypes(include=['object']).columns
|
83 |
+
for col in object_columns:
|
84 |
+
if df[col].nunique() < 50:
|
85 |
+
df[col] = df[col].astype('category')
|
86 |
+
|
87 |
+
# Sample data if needed
|
88 |
if len(df) > 5000:
|
89 |
df = df.sample(n=5000, random_state=42)
|
90 |
|
91 |
+
# Print data info for debugging
|
92 |
+
print("\nDataset Info:")
|
93 |
+
print(df.info())
|
94 |
+
print("\nColumn Types:")
|
95 |
+
print(df.dtypes)
|
96 |
|
97 |
+
plt.close('all')
|
98 |
+
|
99 |
+
# Run AutoViz
|
100 |
dfte = self.AV.AutoViz(
|
101 |
filename='',
|
102 |
sep=',',
|
103 |
+
depVar='',
|
104 |
+
dfte=df,
|
105 |
header=0,
|
106 |
+
verbose=1,
|
107 |
lowess=False,
|
108 |
+
chart_format='svg',
|
109 |
max_rows_analyzed=5000,
|
110 |
max_cols_analyzed=30,
|
111 |
save_plot_dir=viz_temp_dir
|
112 |
)
|
113 |
+
|
114 |
+
# Collect visualizations
|
115 |
html_parts = []
|
116 |
if os.path.exists(viz_temp_dir):
|
117 |
for file in sorted(os.listdir(viz_temp_dir)):
|
118 |
+
if file.endswith('.html') or file.endswith('.svg'):
|
119 |
file_path = os.path.join(viz_temp_dir, file)
|
120 |
try:
|
121 |
with open(file_path, 'r', encoding='utf-8') as f:
|
|
|
124 |
html_parts.append(content)
|
125 |
except Exception as e:
|
126 |
print(f"Error reading file {file}: {str(e)}")
|
127 |
+
|
128 |
if not html_parts:
|
129 |
+
return f"""
|
130 |
<div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
|
131 |
+
<h3>Data Summary</h3>
|
132 |
+
<p>Total Rows: {len(df)}</p>
|
133 |
+
<p>Total Columns: {len(df.columns)}</p>
|
134 |
+
<p>Column Types:</p>
|
135 |
+
<pre>{df.dtypes.to_string()}</pre>
|
136 |
+
<hr>
|
137 |
<h3>No visualizations were generated</h3>
|
138 |
<p>This might be due to:</p>
|
139 |
<ul>
|
140 |
+
<li>All columns being categorical with high cardinality</li>
|
141 |
+
<li>No numeric columns for analysis</li>
|
142 |
+
<li>Data format not suitable for visualization</li>
|
143 |
</ul>
|
|
|
144 |
</div>
|
145 |
"""
|
146 |
+
|
|
|
147 |
combined_html = f"""
|
148 |
<div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
|
149 |
<h2 style="text-align: center;">AutoViz Analysis Report</h2>
|
150 |
+
<div style="margin: 20px;">
|
151 |
+
<h3>Dataset Summary</h3>
|
152 |
+
<p>Rows analyzed: {len(df)}</p>
|
153 |
+
<p>Columns: {len(df.columns)}</p>
|
154 |
+
<p>Column Types:</p>
|
155 |
+
<pre>{df.dtypes.to_string()}</pre>
|
156 |
+
</div>
|
157 |
<hr>
|
158 |
{'<hr>'.join(html_parts)}
|
159 |
</div>
|
160 |
"""
|
161 |
|
162 |
return combined_html
|
163 |
+
|
164 |
except Exception as e:
|
165 |
+
import traceback
|
166 |
error_message = f"""
|
167 |
<div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
|
168 |
<h3>Error in AutoViz Analysis</h3>
|
169 |
<p>Error details: {str(e)}</p>
|
170 |
+
<p>Stack trace:</p>
|
171 |
+
<pre>{traceback.format_exc()}</pre>
|
172 |
+
<p>Dataset Info:</p>
|
173 |
+
<pre>
|
174 |
+
Rows: {len(df)}
|
175 |
+
Columns: {len(df.columns)}
|
176 |
+
Types:\n{df.dtypes.to_string()}
|
177 |
+
</pre>
|
178 |
</div>
|
179 |
"""
|
180 |
return error_message
|
|
|
182 |
if os.path.exists(viz_temp_dir):
|
183 |
shutil.rmtree(viz_temp_dir)
|
184 |
|
|
|
185 |
def create_interface():
|
186 |
analyzer = DataAnalyzer()
|
187 |
|