DanielSc4 committed on
Commit
65fa4f5
1 Parent(s): 465ab59

Update on data used

Files changed (4)
  1. app.py +34 -4
  2. data/results extended.csv +0 -0
  3. data/results.csv +0 -0
  4. test.ipynb +5 -4
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 import os
 import pandas as pd
 import matplotlib.pyplot as plt
+import gensim.downloader as api
 import numpy as np
 import nltk, spacy, gensim
 from sklearn.decomposition import LatentDirichletAllocation
@@ -11,6 +12,26 @@ from pprint import pprint
 import matplotlib
 matplotlib.use('agg')
 
+print("[x] Downloading word 2 vec")
+model_w2v = api.load("word2vec-google-news-300")
+
+def average_word2vec(word_list: list[str]):
+    # model_w2v = api.load("word2vec-google-news-300")
+    word_vectors = []
+    for word in word_list:
+        if word in model_w2v:
+            word_vectors.append(model_w2v[word])
+    if word_vectors:
+        average_vector = np.mean(word_vectors, axis=0)
+    else:
+        return None
+
+    most_similar_word = model_w2v.similar_by_vector(average_vector, topn=1)
+    word, similarity = most_similar_word[0]
+
+    return word, similarity
+
+
 def concat_comments(*kwargs):
     return ['\n'.join(ele) for ele in zip(*kwargs)]
 
@@ -125,9 +146,16 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
     df_topic_keywords
 
-    topics = [
-        f'Topic {i}' for i in range(len(df_topic_keywords))
-    ]
+    # topics = [
+    #     f'Topic {i}' for i in range(len(df_topic_keywords))
+    # ]
+
+    topics = []
+    for i, row in df_topic_keywords.iterrows():
+        topics.append(
+            average_word2vec(row.to_list()[:5])[0]
+        )
+
     df_topic_keywords["Topics"] = topics
     df_topic_keywords
 
@@ -185,6 +213,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
 
     ax.set_title("% of sarcastic comments for each topic")
     plt.xticks(rotation=70)
+    ax.set_ylim(bottom = 0, top = 1.02)
     plt.legend()
     plt.axhline(0.5, color = 'red', ls=":")
 
@@ -232,6 +261,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     ax.set_title("% of topics for each subreddit")
     ax.legend(loc="upper right")
     plt.xticks(rotation=50)
+    ax.set_ylim(bottom = 0, top = 1.02)
 
     print('[v] All looking good!')
 
@@ -245,7 +275,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
 
 with gr.Blocks() as demo:
     gr.Markdown("# Dashboard per l'analisi con LDA")
-    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
+    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali (dominant) topic sono più propensi a commenti di tipo sarcastico")
     # gradio.Dataframe(···)
 
     inputs = []
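The heart of this commit is the new average_word2vec helper: instead of generic "Topic i" labels, each LDA topic is now named after the vocabulary word closest to the centroid of its top five keywords. A minimal stand-alone sketch of that idea, using the same gensim pretrained vectors (label_topic is a hypothetical condensed version of the committed function, the keyword list is made up, and the model download is large on first use):

import numpy as np
import gensim.downloader as api

# Same pretrained embeddings the app now loads at startup.
model_w2v = api.load("word2vec-google-news-300")

def label_topic(keywords: list[str]):
    # Keep only the keywords the embedding vocabulary knows.
    vectors = [model_w2v[w] for w in keywords if w in model_w2v]
    if not vectors:
        return None
    # The vocabulary word nearest the keyword centroid becomes the label.
    centroid = np.mean(vectors, axis=0)
    word, similarity = model_w2v.similar_by_vector(centroid, topn=1)[0]
    return word, similarity

# Hypothetical top-5 keywords for one topic:
print(label_topic(["game", "team", "player", "season", "coach"]))
# prints a (label, cosine similarity) tuple; exact output depends on the model

In practice the nearest neighbour is often one of the keywords themselves, which is usually an acceptable label. One caveat worth knowing: the committed average_word2vec returns None when none of a topic's keywords are in the vocabulary, so the average_word2vec(row.to_list()[:5])[0] call inside get_lda would raise a TypeError for such a topic.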
data/results extended.csv ADDED
The diff for this file is too large to render.
 
data/results.csv CHANGED
The diff for this file is too large to render.
 
test.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -241,13 +241,14 @@
      "[5000 rows x 9 columns]"
     ]
    },
-   "execution_count": 2,
+   "execution_count": 24,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-  "pd.read_csv('./data/results.csv', index_col=0)"
+  "d = pd.read_csv('./data/results extended.csv', index_col=0)\n",
+  "d"
  ]
 },
 {
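Since the notebook now reads the new file, a quick sanity-check sketch (assumes pandas is available and that the 5000 rows x 9 columns shape shown in the cell output still holds):

import pandas as pd

# Path taken from the diff above; note the space in the filename.
d = pd.read_csv('./data/results extended.csv', index_col=0)

# The cell output reports a 5000 x 9 frame; fail fast if that drifts.
assert d.shape == (5000, 9), f'unexpected shape: {d.shape}'
print(d.head())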