Spaces:
Running
Running
File size: 13,399 Bytes
0cb60c7 67f471c 771365f 947739b 7617875 f421e23 96823ba 7617875 0cb60c7 c9d2489 67f471c 771365f 947739b 96823ba 771365f 96823ba 8d8e69e cb5dc7e b2f41cc 8d8e69e b2f41cc 947739b cb5dc7e 947739b 8c15039 96823ba b2f41cc 8c15039 179691f 96823ba f421e23 8c15039 f421e23 96823ba f421e23 96823ba f421e23 96823ba f421e23 96823ba f421e23 b2f41cc f421e23 179691f 96823ba f421e23 b2f41cc 947739b b2f41cc 179691f 96823ba 179691f b2f41cc 179691f 0cb60c7 f421e23 0cb60c7 cb5dc7e 8d8e69e 276ed24 cb5dc7e 7617875 cb5dc7e f421e23 8c15039 f421e23 cb5dc7e f421e23 cb5dc7e f421e23 cb5dc7e f421e23 cb5dc7e f421e23 cb5dc7e 7617875 cb5dc7e f421e23 8c15039 f421e23 cb5dc7e f421e23 cb5dc7e 8d8e69e 0cb60c7 96823ba cb5dc7e f421e23 96823ba f421e23 96823ba f421e23 8c15039 96823ba f421e23 cb5dc7e 96823ba cb5dc7e f421e23 96823ba 7617875 96823ba cb5dc7e 96823ba 947739b 7617875 0cb60c7 f421e23 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 |
import gradio as gr
import pandas as pd
import sweetviz as sv
import tempfile
import os
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from autoviz.AutoViz_Class import AutoViz_Class
import shutil
import warnings
import io
import base64
from pathlib import Path
import matplotlib
# Select matplotlib's 'Agg' backend: every figure produced in this file is
# written to an in-memory PNG buffer with savefig() and never shown in a window.
# NOTE(review): this runs after `import matplotlib.pyplot as plt` above; the
# backend is conventionally selected before pyplot is imported — confirm the
# switch takes effect with the installed matplotlib version.
matplotlib.use('Agg')
# Install a blanket 'ignore' filter, silencing all Python warnings process-wide.
warnings.filterwarnings('ignore')
class DataAnalyzer:
    """Build HTML analysis reports for a pandas DataFrame.

    Two report flavours are offered: a Sweetviz profile
    (``generate_sweetviz_report``) and a summary-statistics page with
    matplotlib charts embedded as base64 PNGs (``generate_autoviz_report``).

    Attributes:
        temp_dir: Scratch directory created with ``tempfile.mkdtemp`` that
            holds the transient Sweetviz HTML file.
            NOTE(review): nothing in this class removes the directory.
        df: The DataFrame most recently passed to ``generate_sweetviz_report``.
        AV: An ``AutoViz_Class`` instance; kept for compatibility, it is not
            used by any method of this class.
        plots_memory: Mapping of plot name -> ``data:image/png;base64,...``
            URI for the charts of the latest visual report.
    """

    def __init__(self):
        """Create the scratch directory and the empty plot cache."""
        self.temp_dir = tempfile.mkdtemp()
        self.df = None
        self.AV = AutoViz_Class()
        self.plots_memory = {}  # plot name -> base64 data URI

    def save_plot_to_memory(self, fig, plot_name):
        """Render ``fig`` to PNG and store it in ``plots_memory`` as a data URI.

        The figure is always closed, even if rendering fails, so a failed
        plot does not leave an open figure behind.

        Args:
            fig: The matplotlib figure to render.
            plot_name: Key under which the data URI is stored.
        """
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)
        img_str = base64.b64encode(buf.getvalue()).decode()
        self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'

    def generate_basic_plots(self, df):
        """Render the standard charts for ``df`` into ``plots_memory``.

        Produces a histogram and a box plot per numeric column, a bar chart
        per categorical/object column with fewer than 20 distinct values, and
        a correlation heat map when there are at least two numeric columns.
        Existing entries in ``plots_memory`` are kept; same-named entries are
        overwritten.

        Args:
            df: The DataFrame to plot.
        """
        numeric_cols = df.select_dtypes(include=['number']).columns
        for col in numeric_cols:
            # Distribution
            fig, ax = plt.subplots(figsize=(10, 6))
            df[col].hist(bins=30, ax=ax)
            ax.set_title(f'Distribution of {col}')
            self.save_plot_to_memory(fig, f'dist_{col}')

            # Box plot
            fig, ax = plt.subplots(figsize=(10, 6))
            df.boxplot(column=col, ax=ax)
            ax.set_title(f'Box Plot of {col}')
            self.save_plot_to_memory(fig, f'box_{col}')

        categorical_cols = df.select_dtypes(include=['category', 'object']).columns
        for col in categorical_cols:
            # Bar charts are only readable for a modest number of categories.
            if df[col].nunique() < 20:
                fig, ax = plt.subplots(figsize=(12, 6))
                df[col].value_counts().plot(kind='bar', ax=ax)
                ax.set_title(f'Distribution of {col}')
                # Rotate via the axes object rather than pyplot's implicit
                # "current figure" state.
                ax.tick_params(axis='x', rotation=45)
                self.save_plot_to_memory(fig, f'cat_{col}')

        # A correlation matrix needs at least two numeric columns.
        if len(numeric_cols) > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            correlation_matrix = df[numeric_cols].corr()
            im = ax.imshow(correlation_matrix)
            ticks = range(len(numeric_cols))
            ax.set_xticks(ticks)
            ax.set_yticks(ticks)
            ax.set_xticklabels(numeric_cols, rotation=45)
            ax.set_yticklabels(numeric_cols)
            fig.colorbar(im, ax=ax)
            ax.set_title('Correlation Matrix')
            self.save_plot_to_memory(fig, 'correlation_matrix')

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on ``df`` and return the report wrapped in an HTML table.

        Args:
            df: The DataFrame to profile, or ``None``.

        Returns:
            The report HTML inside a scrollable table cell, or the string
            ``"Please upload a dataset first"`` when ``df`` is ``None``.

        Raises:
            Exception: Whatever Sweetviz or the file I/O raises; this method
                does not catch errors.
        """
        if df is None:
            return "Please upload a dataset first"

        self.df = df
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the scratch file even if writing or reading it failed.
            try:
                os.remove(report_path)
            except FileNotFoundError:
                pass

        html_with_table = f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """
        return html_with_table

    def generate_autoviz_report(self, df):
        """Return an HTML page with summary statistics and charts for ``df``.

        Works on a copy of ``df``. A ``value`` column, if present, has ``$``
        and ``,`` stripped and is coerced to numeric. Frames longer than 5000
        rows are down-sampled to 5000 with a fixed seed. The plot cache is
        emptied first so the page only shows charts for this dataset.

        Args:
            df: The DataFrame to analyse, or ``None``.

        Returns:
            The report HTML; ``"Please upload a dataset first"`` when ``df``
            is ``None``; or an HTML error panel (message and stack trace,
            HTML-escaped) if anything fails. No exception is propagated.
        """
        from html import escape
        import traceback

        if df is None:
            return "Please upload a dataset first"

        try:
            df = df.copy()

            # Convert currency-like strings ("$1,234") to numbers if possible.
            if 'value' in df.columns:
                df['value'] = pd.to_numeric(
                    df['value'].replace(r'[\$,]', '', regex=True),
                    errors='coerce',
                )

            # Cap the work done on large datasets.
            if len(df) > 5000:
                df = df.sample(n=5000, random_state=42)

            # Drop charts left over from a previous dataset before plotting.
            self.plots_memory.clear()
            self.generate_basic_plots(df)

            numeric_cols = df.select_dtypes(include=['number']).columns
            categorical_cols = df.select_dtypes(include=['category', 'object']).columns
            numeric_stats = (
                df[numeric_cols].describe().round(2)
                if len(numeric_cols) > 0 else pd.DataFrame()
            )
            categorical_stats = (
                df[categorical_cols].describe()
                if len(categorical_cols) > 0 else pd.DataFrame()
            )

            parts = ["""
            <style>
            .table {
            width: 100%;
            margin-bottom: 1rem;
            color: #212529;
            border-collapse: collapse;
            }
            .table-striped tbody tr:nth-of-type(odd) {
            background-color: rgba(0,0,0,.05);
            }
            .table td, .table th {
            padding: .75rem;
            border: 1px solid #dee2e6;
            }
            .table th {
            background-color: #f8f9fa;
            }
            .plot-container {
            margin: 20px 0;
            padding: 10px;
            border: 1px solid #ddd;
            border-radius: 5px;
            }
            .plot-container img {
            max-width: 100%;
            height: auto;
            }
            </style>
            """]

            parts.append(f"""
            <div class="viz-container">
            <h2 style="text-align: center;">Data Analysis Report</h2>
            <div style="margin: 20px;">
            <h3>Dataset Overview</h3>
            <p>Total Rows: {len(df)}</p>
            <p>Total Columns: {len(df.columns)}</p>
            <h3>Numeric Variables Summary</h3>
            <div style="overflow-x: auto;">
            {numeric_stats.to_html(classes='table table-striped')}
            </div>
            <h3>Categorical Variables Summary</h3>
            <div style="overflow-x: auto;">
            {categorical_stats.to_html(classes='table table-striped')}
            </div>
            </div>
            """)

            for plot_name, plot_data in self.plots_memory.items():
                # Plot names embed column names from the uploaded file, so
                # escape them before placing them in markup.
                heading = escape(plot_name.replace('_', ' ').title())
                parts.append(f"""
                <div class="plot-container">
                <h3>{heading}</h3>
                <img src="{plot_data}" alt="{escape(plot_name)}">
                </div>
                """)

            parts.append("</div>")
            return "".join(parts)
        except Exception as e:
            # Top-level boundary for the UI: report the failure as HTML.
            # Escaping keeps text such as "<module>" in tracebacks visible.
            error_message = f"""
            <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
            <h3>Error in Analysis</h3>
            <p>Error details: {escape(str(e))}</p>
            <p>Stack trace:</p>
            <pre>{escape(traceback.format_exc())}</pre>
            </div>
            """
            return error_message
def create_interface():
    """Build the three-tab Gradio dashboard and return it without launching.

    Tabs:
        1. Data Upload & Preview - loads a CSV into a per-session state value.
        2. Sweetviz Analysis - renders ``DataAnalyzer.generate_sweetviz_report``.
        3. Visual Analysis - renders ``DataAnalyzer.generate_autoviz_report``.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    from html import escape

    # One analyzer instance is created here and captured by the callbacks below.
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Data Analysis Dashboard
        This dashboard provides comprehensive data analysis and visualization capabilities.
        """)

        # Holds the uploaded DataFrame so the other tabs can read it.
        current_df = gr.State(None)

        with gr.Tabs():
            # First Tab: Data Upload & Preview
            with gr.TabItem("Data Upload & Preview"):
                with gr.Row():
                    with gr.Column(scale=2):
                        file_input = gr.File(
                            label="Upload CSV File",
                            file_types=[".csv"],
                            file_count="single"
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Upload Instructions
                        1. Select a CSV file
                        2. File will be automatically loaded
                        3. Preview will appear below
                        """)
                with gr.Row():
                    data_info = gr.Markdown("No data uploaded yet")
                with gr.Row():
                    data_preview = gr.Dataframe(
                        label="Data Preview",
                        interactive=False,
                        wrap=True
                    )

                def load_data(file):
                    """Read the uploaded CSV; return (info markdown, preview, full frame)."""
                    if file is None:
                        return "No data uploaded yet", None, None
                    try:
                        # Accept both a file-like upload object (has .name)
                        # and a plain path string.
                        csv_path = getattr(file, "name", file)
                        df = pd.read_csv(csv_path)
                        # Plain str/int values keep the summary readable
                        # instead of showing dtype/numpy scalar reprs.
                        column_types = {
                            str(dtype): int(count)
                            for dtype, count in df.dtypes.value_counts().items()
                        }
                        info_text = f"""
                        ### Dataset Information
                        - Rows: {len(df)}
                        - Columns: {len(df.columns)}
                        - Memory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB
                        - Column Types: {column_types}
                        """
                        return info_text, df.head(10), df
                    except Exception as e:
                        # UI boundary: show the problem instead of crashing.
                        return f"Error loading file: {str(e)}", None, None

                file_input.change(
                    fn=load_data,
                    inputs=[file_input],
                    outputs=[data_info, data_preview, current_df]
                )

            # Second Tab: Sweetviz Analysis
            with gr.TabItem("Sweetviz Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        sweetviz_button = gr.Button(
                            "Generate Sweetviz Report",
                            variant="primary"
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Sweetviz Analysis Features
                        - Comprehensive data profiling
                        - Statistical analysis
                        - Feature correlations
                        - Missing value analysis
                        """)
                with gr.Row():
                    sweetviz_output = gr.HTML(
                        label="Sweetviz Report",
                        value="Click the button above to generate the report"
                    )

                def generate_sweetviz(df):
                    """Return the Sweetviz report HTML, or a message on failure."""
                    if df is None:
                        return "Please upload a dataset first"
                    try:
                        return analyzer.generate_sweetviz_report(df)
                    except Exception as e:
                        # The output component renders HTML, so escape the text.
                        return f"Error generating Sweetviz report: {escape(str(e))}"

                sweetviz_button.click(
                    fn=generate_sweetviz,
                    inputs=[current_df],
                    outputs=[sweetviz_output]
                )

            # Third Tab: Visual Analysis
            with gr.TabItem("Visual Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        viz_button = gr.Button(
                            "Generate Visualizations",
                            variant="primary"
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Visualization Features
                        - Distribution plots
                        - Correlation analysis
                        - Categorical variable analysis
                        - Statistical summaries
                        """)
                with gr.Row():
                    viz_output = gr.HTML(
                        label="Visualization Report",
                        value="Click the button above to generate visualizations"
                    )

                def generate_viz(df):
                    """Return the visual report HTML, or a message on failure."""
                    if df is None:
                        return "Please upload a dataset first"
                    try:
                        return analyzer.generate_autoviz_report(df)
                    except Exception as e:
                        # The output component renders HTML, so escape the text.
                        return f"Error generating visualizations: {escape(str(e))}"

                viz_button.click(
                    fn=generate_viz,
                    inputs=[current_df],
                    outputs=[viz_output]
                )

    return demo
# Script entry point: build the dashboard and serve it.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=7860,
        show_error=True,
        share=False  # Set to True if you want to create a public link
    )