MaroueneA committed on
Commit
c144351
1 Parent(s): 0a5ddc0

Update to match GradioCompararion 2 and create the tmp dir if it does not exist

Browse files
Files changed (1) hide show
  1. app.py +88 -11
app.py CHANGED
@@ -3,28 +3,28 @@ import pandas as pd
3
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
  from datasets import load_dataset
5
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
6
- from sklearn.cluster import KMeans
7
  import torch
8
  from sentence_transformers import SentenceTransformer
9
  import umap
10
  from sklearn.manifold import TSNE
11
  import matplotlib.pyplot as plt
12
- import seaborn as sns
13
  import numpy as np
 
14
  import tempfile
15
  from collections import Counter
16
  import os
17
- import tempfile
18
 
19
- temp_dir = tempfile.gettempdir()
 
20
  os.environ['GRADIO_TEMP_DIR'] = temp_dir
21
 
 
22
  # Load the models and their tokenizers
23
  model_paths = {
24
  "roberta-base-offensive": "./models/roberta-base-offensive",
25
  "distilbert-base-uncased-offensive": "./models/distilbert-base-uncased-offensive",
26
- "bert-offensive": "./models/bert-offensive",
27
- "deberta-offensive": "./models/deberta-offensive"
28
  }
29
 
30
  models = {name: AutoModelForSequenceClassification.from_pretrained(path) for name, path in model_paths.items()}
@@ -67,10 +67,15 @@ def generate_confusion_matrix(conf_matrix, model_name):
67
  def generate_embeddings_and_plot(categories):
68
  all_texts = sum(categories.values(), [])
69
  embeddings = model_embedding.encode(all_texts)
 
 
70
  umap_reducer = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
71
  umap_embeddings = umap_reducer.fit_transform(embeddings)
 
 
72
  tsne_embeddings = TSNE(n_components=2, perplexity=30).fit_transform(embeddings)
73
 
 
74
  def plot_embeddings(embeddings, title, file_suffix):
75
  plt.figure(figsize=(10, 8))
76
  colors = {"correct_both": "green", "incorrect_both": "red", "correct_model1_only": "blue", "correct_model2_only": "orange"}
@@ -81,13 +86,16 @@ def generate_embeddings_and_plot(categories):
81
  plt.title(title)
82
  plt.xlabel('Component 1')
83
  plt.ylabel('Component 2')
 
84
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f'_{file_suffix}.png')
85
  plt.savefig(temp_file.name)
86
  plt.close()
87
  return temp_file.name
88
 
 
89
  umap_plot_path = plot_embeddings(umap_embeddings, "UMAP Projection of Text Categories", "umap")
90
  tsne_plot_path = plot_embeddings(tsne_embeddings, "t-SNE Projection of Text Categories", "tsne")
 
91
  return umap_plot_path, tsne_plot_path
92
 
93
  def compare_models(model1, model2):
@@ -138,6 +146,55 @@ def compare_models(model1, model2):
138
 
139
  return metrics_df, conf_matrix_path1, conf_matrix_path2, umap_plot_path, tsne_plot_path, categories
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  def setup_gradio_interface():
142
  with gr.Blocks() as demo:
143
  gr.Markdown("## Model Comparison and Text Analysis")
@@ -155,19 +212,39 @@ def setup_gradio_interface():
155
  with gr.Row():
156
  umap_visualization_output = gr.Image(label="UMAP Text Categorization Visualization")
157
  tsne_visualization_output = gr.Image(label="t-SNE Text Categorization Visualization")
 
 
 
 
 
158
 
159
  def update_interface(model1, model2):
160
- metrics_df, conf_matrix1, conf_matrix2 = compare_models(model1, model2)
161
- umap_plot_path, tsne_plot_path = generate_embeddings_and_plot(metrics_df)
162
- return metrics_df, conf_matrix1, conf_matrix2, umap_plot_path, tsne_plot_path
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
 
 
164
  submit_button.click(
165
  update_interface,
166
  inputs=[model1_input, model2_input],
167
- outputs=[metrics_output, model1_cm_output, model2_cm_output, umap_visualization_output, tsne_visualization_output]
168
  )
169
 
170
  return demo
171
 
172
  demo = setup_gradio_interface()
173
- demo.launch(share=True)
 
3
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
  from datasets import load_dataset
5
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
 
6
  import torch
7
  from sentence_transformers import SentenceTransformer
8
  import umap
9
  from sklearn.manifold import TSNE
10
  import matplotlib.pyplot as plt
 
11
  import numpy as np
12
+ import seaborn as sns
13
  import tempfile
14
  from collections import Counter
15
  import os
 
16
 
17
+ temp_dir = '/tmp/gradio_tmp'
18
+ os.makedirs(temp_dir, exist_ok=True) # Creates the directory if it does not exist
19
  os.environ['GRADIO_TEMP_DIR'] = temp_dir
20
 
21
+
22
  # Load the models and their tokenizers
23
  model_paths = {
24
  "roberta-base-offensive": "./models/roberta-base-offensive",
25
  "distilbert-base-uncased-offensive": "./models/distilbert-base-uncased-offensive",
26
+ "bert-offensive":"./models/bert-offensive",
27
+ "deberta-offensive":"./models/deberta-offensive"
28
  }
29
 
30
  models = {name: AutoModelForSequenceClassification.from_pretrained(path) for name, path in model_paths.items()}
 
67
  def generate_embeddings_and_plot(categories):
68
  all_texts = sum(categories.values(), [])
69
  embeddings = model_embedding.encode(all_texts)
70
+
71
+ # UMAP reduction
72
  umap_reducer = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
73
  umap_embeddings = umap_reducer.fit_transform(embeddings)
74
+
75
+ # t-SNE reduction
76
  tsne_embeddings = TSNE(n_components=2, perplexity=30).fit_transform(embeddings)
77
 
78
+ # Plotting helper function to avoid repetition
79
  def plot_embeddings(embeddings, title, file_suffix):
80
  plt.figure(figsize=(10, 8))
81
  colors = {"correct_both": "green", "incorrect_both": "red", "correct_model1_only": "blue", "correct_model2_only": "orange"}
 
86
  plt.title(title)
87
  plt.xlabel('Component 1')
88
  plt.ylabel('Component 2')
89
+
90
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f'_{file_suffix}.png')
91
  plt.savefig(temp_file.name)
92
  plt.close()
93
  return temp_file.name
94
 
95
+ # Generate and save plots
96
  umap_plot_path = plot_embeddings(umap_embeddings, "UMAP Projection of Text Categories", "umap")
97
  tsne_plot_path = plot_embeddings(tsne_embeddings, "t-SNE Projection of Text Categories", "tsne")
98
+
99
  return umap_plot_path, tsne_plot_path
100
 
101
  def compare_models(model1, model2):
 
146
 
147
  return metrics_df, conf_matrix_path1, conf_matrix_path2, umap_plot_path, tsne_plot_path, categories
148
 
149
+
150
+ from sklearn.cluster import KMeans
151
+
152
def generate_embeddings_and_cluster(categories, n_clusters=3):
    """Cluster the categorized texts with K-means and plot the clusters.

    All texts in ``categories`` are encoded with the module-level
    ``model_embedding``, clustered with K-means, reduced to two dimensions
    with UMAP, and drawn as a scatter plot coloured by cluster label.

    Args:
        categories: Mapping of category name to an iterable of texts.
        n_clusters: Requested number of K-means clusters (default 3). It is
            lowered to the number of texts when fewer texts are available,
            because K-means cannot fit more clusters than samples.

    Returns:
        A tuple ``(cluster_plot_path, overall_distribution_percent,
        cluster_distributions)`` where

        * ``cluster_plot_path`` is the path of the saved PNG,
        * ``overall_distribution_percent`` maps each category to its share
          (in percent) of all texts,
        * ``cluster_distributions`` is a list with one dict per cluster,
          mapping each category present in that cluster to its share
          (in percent) of the cluster.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1 or ``categories``
            contains no texts.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    # Flatten texts and record the category of each one in a single pass,
    # so the two lists are guaranteed to stay aligned.
    all_texts = []
    category_labels = []
    for category, texts in categories.items():
        for text in texts:
            all_texts.append(text)
            category_labels.append(category)

    if not all_texts:
        raise ValueError("categories contains no texts to cluster")

    embeddings = model_embedding.encode(all_texts)

    # Overall category distribution, in percent of all texts.
    total = len(category_labels)
    overall_distribution_percent = {
        category: count / total * 100
        for category, count in Counter(category_labels).items()
    }

    # K-means clustering; never ask for more clusters than there are samples.
    effective_clusters = min(n_clusters, total)
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42).fit(embeddings)
    labels = kmeans.labels_

    # Group the category of every text by the cluster it was assigned to.
    cluster_categories = [[] for _ in range(effective_clusters)]
    for label, category in zip(labels, category_labels):
        cluster_categories[label].append(category)

    # Category distribution inside each cluster, in percent of that cluster.
    # An empty cluster yields an empty dict (the inner loop never runs).
    cluster_distributions = [
        {
            category: count / len(members) * 100
            for category, count in Counter(members).items()
        }
        for members in cluster_categories
    ]

    # UMAP dimensionality reduction for visualization.
    # NOTE(review): n_neighbors=15 may be unsuitable for very small inputs --
    # confirm against the dataset sizes this is called with.
    umap_reducer = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
    reduced_embeddings = umap_reducer.fit_transform(embeddings)

    # Reserve the output path and close the handle before matplotlib writes
    # to it; delete=False keeps the file on disk for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix='_cluster.png') as tmp:
        cluster_plot_path = tmp.name

    plt.figure(figsize=(10, 8))
    try:
        scatter = plt.scatter(
            reduced_embeddings[:, 0],
            reduced_embeddings[:, 1],
            c=labels,
            cmap='viridis',
            alpha=0.6,
        )
        plt.legend(*scatter.legend_elements(), title="Clusters")
        plt.title("K-means Clustering of Text Embeddings")
        plt.xlabel('UMAP 1')
        plt.ylabel('UMAP 2')
        plt.savefig(cluster_plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close()

    return cluster_plot_path, overall_distribution_percent, cluster_distributions
197
+
198
  def setup_gradio_interface():
199
  with gr.Blocks() as demo:
200
  gr.Markdown("## Model Comparison and Text Analysis")
 
212
  with gr.Row():
213
  umap_visualization_output = gr.Image(label="UMAP Text Categorization Visualization")
214
  tsne_visualization_output = gr.Image(label="t-SNE Text Categorization Visualization")
215
+
216
+ clustering_visualization_output = gr.Image(label="K-means Clustering Visualization")
217
+
218
+ category_distribution_output = gr.Dataframe(label="Category Distribution Comparison")
219
+
220
 
221
  def update_interface(model1, model2):
222
+ metrics_df, cm_path1, cm_path2, umap_viz_path, tsne_viz_path, categories = compare_models(model1, model2)
223
+ cluster_viz_path, overall_distribution_percent, cluster_distributions = generate_embeddings_and_cluster(categories)
224
+
225
+ # Prepare DataFrame for category distribution comparison
226
+ distribution_data = []
227
+ for cluster_index, cluster_distribution in enumerate(cluster_distributions, start=1):
228
+ for category, percent in cluster_distribution.items():
229
+ distribution_data.append({
230
+ "Cluster": f"Cluster {cluster_index}",
231
+ "Category": category,
232
+ "Percentage": f"{percent:.2f}%",
233
+ "Difference from Overall": f"{percent - overall_distribution_percent.get(category, 0):.2f}%"
234
+ })
235
+ distribution_df = pd.DataFrame(distribution_data)
236
+
237
+ return metrics_df, cm_path1, cm_path2, umap_viz_path, tsne_viz_path, cluster_viz_path, distribution_df
238
 
239
+
240
+
241
  submit_button.click(
242
  update_interface,
243
  inputs=[model1_input, model2_input],
244
+ outputs=[metrics_output, model1_cm_output, model2_cm_output, umap_visualization_output, tsne_visualization_output, clustering_visualization_output, category_distribution_output]
245
  )
246
 
247
  return demo
248
 
249
  demo = setup_gradio_interface()
250
+ demo.launch(share=True)