baconnier committed on
Commit
771365f
·
verified ·
1 Parent(s): 08dc322

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -8
app.py CHANGED
@@ -3,12 +3,22 @@ import pandas as pd
3
  import sweetviz as sv
4
  import tempfile
5
  import os
 
 
 
 
 
 
 
 
6
 
7
  class DataAnalyzer:
8
  def __init__(self):
9
  self.temp_dir = tempfile.mkdtemp()
10
-
 
11
  def generate_sweetviz_report(self, df):
 
12
  report = sv.analyze(df)
13
  report_path = os.path.join(self.temp_dir, "report.html")
14
  report.show_html(report_path, open_browser=False)
@@ -16,7 +26,6 @@ class DataAnalyzer:
16
  with open(report_path, 'r', encoding='utf-8') as f:
17
  html_content = f.read()
18
 
19
- # Wrap the report in a table cell with styling
20
  html_with_table = f"""
21
  <table width="100%" style="border-collapse: collapse;">
22
  <tr>
@@ -32,6 +41,63 @@ class DataAnalyzer:
32
  os.remove(report_path)
33
  return html_with_table
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def create_interface():
36
  analyzer = DataAnalyzer()
37
 
@@ -43,23 +109,58 @@ def create_interface():
43
  file_input = gr.File(label="Upload CSV")
44
  report_html = gr.HTML()
45
 
46
- with gr.TabItem("Custom Analysis"):
47
- gr.Markdown("Custom analysis will be added here")
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  def process_file(file):
50
  if file is None:
51
- return None
52
 
53
  try:
54
  df = pd.read_csv(file.name)
55
- return analyzer.generate_sweetviz_report(df)
 
 
56
  except Exception as e:
57
- return f"Error generating report: {str(e)}"
 
 
 
 
 
 
 
 
 
58
 
59
  file_input.change(
60
  fn=process_file,
61
  inputs=[file_input],
62
- outputs=[report_html]
 
 
 
 
 
 
 
 
 
 
 
 
63
  )
64
 
65
  return demo
 
3
  import sweetviz as sv
4
  import tempfile
5
  import os
6
+ import category_encoders as ce
7
+ import umap
8
+ import matplotlib.pyplot as plt
9
+ from sklearn.preprocessing import StandardScaler
10
+ import seaborn as sns
11
+ import numpy as np
12
+ import io
13
+ import base64
14
 
15
  class DataAnalyzer:
16
  def __init__(self):
17
  self.temp_dir = tempfile.mkdtemp()
18
+ self.df = None
19
+
20
  def generate_sweetviz_report(self, df):
21
+ self.df = df # Store DataFrame for other analyses
22
  report = sv.analyze(df)
23
  report_path = os.path.join(self.temp_dir, "report.html")
24
  report.show_html(report_path, open_browser=False)
 
26
  with open(report_path, 'r', encoding='utf-8') as f:
27
  html_content = f.read()
28
 
 
29
  html_with_table = f"""
30
  <table width="100%" style="border-collapse: collapse;">
31
  <tr>
 
41
  os.remove(report_path)
42
  return html_with_table
43
 
44
def encode_and_visualize(self, column_name, encoder_type='binary'):
    """Encode one stored column, embed it in 2-D with UMAP and plot it.

    The selected column is encoded with the chosen category encoder,
    standardised, reduced to two components with UMAP, and drawn as a
    scatter plot coloured by the factorised original values.

    Args:
        column_name: Name of the column of ``self.df`` to analyse.
        encoder_type: One of ``'binary'``, ``'onehot'``, ``'catboost'``
            or ``'count'``.

    Returns:
        An ``io.BytesIO`` rewound to offset 0 that contains the plot as
        PNG data, or ``None`` when no DataFrame has been stored yet or
        ``column_name`` is not one of its columns.

    Raises:
        ValueError: If ``encoder_type`` is not a supported encoder name.
    """
    if self.df is None or column_name not in self.df.columns:
        return None

    # Factories instead of instances: only the requested encoder is built.
    encoder_factories = {
        'binary': lambda: ce.BinaryEncoder(),
        'onehot': lambda: ce.OneHotEncoder(),
        'catboost': lambda: ce.CatBoostEncoder(),
        'count': lambda: ce.CountEncoder(),
    }
    try:
        make_encoder = encoder_factories[encoder_type]
    except KeyError:
        supported = ', '.join(sorted(encoder_factories))
        raise ValueError(
            f"Unknown encoder_type {encoder_type!r}; expected one of: {supported}"
        ) from None

    # Work on a single-column copy so the stored DataFrame is not modified.
    df_subset = self.df[[column_name]].copy()

    # NOTE(review): CatBoostEncoder is a supervised encoder and no target is
    # passed here, so the 'catboost' option presumably fails -- confirm
    # against the category_encoders documentation.
    encoded_df = make_encoder().fit_transform(df_subset)

    # Scale the encoded features before the neighbour-based embedding.
    scaled_data = StandardScaler().fit_transform(encoded_df)

    reducer = umap.UMAP(
        n_neighbors=15,
        min_dist=0.1,
        n_components=2,
        random_state=42,
    )
    embedding = reducer.fit_transform(scaled_data)

    # Use an explicit figure and always close it, so a failure while
    # plotting or saving does not leave an open figure behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        scatter = ax.scatter(
            embedding[:, 0],
            embedding[:, 1],
            c=pd.factorize(df_subset[column_name])[0],
            cmap='viridis',
            alpha=0.6,
        )
        fig.colorbar(scatter, ax=ax)
        ax.set_title(f'UMAP visualization of {column_name}\nusing {encoder_type} encoding')
        ax.set_xlabel('UMAP1')
        ax.set_ylabel('UMAP2')

        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
    finally:
        plt.close(fig)

    buf.seek(0)
    return buf
100
+
101
  def create_interface():
102
  analyzer = DataAnalyzer()
103
 
 
109
  file_input = gr.File(label="Upload CSV")
110
  report_html = gr.HTML()
111
 
112
+ with gr.TabItem("Categorical Analysis"):
113
+ with gr.Row():
114
+ column_dropdown = gr.Dropdown(
115
+ label="Select Categorical Column",
116
+ choices=[],
117
+ interactive=True
118
+ )
119
+ encoder_dropdown = gr.Dropdown(
120
+ label="Select Encoder",
121
+ choices=['binary', 'onehot', 'catboost', 'count'],
122
+ value='binary',
123
+ interactive=True
124
+ )
125
+ plot_output = gr.Image(label="UMAP Visualization")
126
 
127
  def process_file(file):
128
  if file is None:
129
+ return None, gr.Dropdown(choices=[])
130
 
131
  try:
132
  df = pd.read_csv(file.name)
133
+ # Get categorical columns
134
+ cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
135
+ return analyzer.generate_sweetviz_report(df), gr.Dropdown(choices=cat_columns)
136
  except Exception as e:
137
+ return f"Error generating report: {str(e)}", gr.Dropdown(choices=[])
138
+
139
def update_plot(column, encoder_type):
    """Render the UMAP plot for the selected column and encoder.

    Args:
        column: Column name chosen in the dropdown; empty/None means no
            selection.
        encoder_type: Encoder name chosen in the encoder dropdown.

    Returns:
        The plot decoded into an image array for the ``gr.Image`` output,
        or ``None`` when nothing is selected, no plot could be produced,
        or the analysis raised an error.
    """
    import logging

    if not column:
        return None
    try:
        plot_bytes = analyzer.encode_and_visualize(column, encoder_type)
        if plot_bytes is None:
            return None
        # gr.Image outputs take a numpy array, PIL image or file path, not a
        # raw BytesIO, so decode the PNG buffer into an array first.
        return plt.imread(plot_bytes, format='png')
    except Exception:
        # Best-effort UI callback: keep the image empty, but record the
        # failure instead of discarding it silently.
        logging.getLogger(__name__).exception(
            "Failed to build UMAP plot for column %r with encoder %r",
            column,
            encoder_type,
        )
        return None
147
 
148
  file_input.change(
149
  fn=process_file,
150
  inputs=[file_input],
151
+ outputs=[report_html, column_dropdown]
152
+ )
153
+
154
+ column_dropdown.change(
155
+ fn=update_plot,
156
+ inputs=[column_dropdown, encoder_dropdown],
157
+ outputs=[plot_output]
158
+ )
159
+
160
+ encoder_dropdown.change(
161
+ fn=update_plot,
162
+ inputs=[column_dropdown, encoder_dropdown],
163
+ outputs=[plot_output]
164
  )
165
 
166
  return demo