Update on data used
- app.py +34 -4
- data/results extended.csv +0 -0
- data/results.csv +0 -0
- test.ipynb +5 -4
app.py
CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 import os
 import pandas as pd
 import matplotlib.pyplot as plt
+import gensim.downloader as api
 import numpy as np
 import nltk, spacy, gensim
 from sklearn.decomposition import LatentDirichletAllocation
@@ -11,6 +12,26 @@ from pprint import pprint
 import matplotlib
 matplotlib.use('agg')
 
+print("[x] Downloading word 2 vec")
+model_w2v = api.load("word2vec-google-news-300")
+
+def average_word2vec(word_list: list[str]):
+    # model_w2v = api.load("word2vec-google-news-300")
+    word_vectors = []
+    for word in word_list:
+        if word in model_w2v:
+            word_vectors.append(model_w2v[word])
+    if word_vectors:
+        average_vector = np.mean(word_vectors, axis=0)
+    else:
+        return None
+
+    most_similar_word = model_w2v.similar_by_vector(average_vector, topn=1)
+    word, similarity = most_similar_word[0]
+
+    return word, similarity
+
+
 def concat_comments(*kwargs):
     return ['\n'.join(ele) for ele in zip(*kwargs)]
 
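A note on the new helper above: it averages the word2vec vectors of the in-vocabulary words and returns the single vocabulary word nearest to that centroid. A minimal self-contained sketch of the same idea (the example words are illustrative; gensim's downloader fetches roughly 1.6 GB on first use):

    import numpy as np
    import gensim.downloader as api

    # Downloads and caches the pretrained vectors on first call.
    model_w2v = api.load("word2vec-google-news-300")

    def average_word2vec(word_list):
        # Skip out-of-vocabulary words; abort if nothing is left.
        vectors = [model_w2v[w] for w in word_list if w in model_w2v]
        if not vectors:
            return None
        centroid = np.mean(vectors, axis=0)
        # Nearest vocabulary word to the centroid, with cosine similarity.
        word, similarity = model_w2v.similar_by_vector(centroid, topn=1)[0]
        return word, similarity

    print(average_word2vec(["soccer", "basketball", "tennis"]))
    # The nearest word may well be one of the inputs themselves, since
    # each input sits close to the centroid it helped form.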
@@ -125,9 +146,16 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
     df_topic_keywords
 
-    topics = [
-        f'Topic {i}' for i in range(len(df_topic_keywords))
-    ]
+    # topics = [
+    #     f'Topic {i}' for i in range(len(df_topic_keywords))
+    # ]
+
+    topics = []
+    for i, row in df_topic_keywords.iterrows():
+        topics.append(
+            average_word2vec(row.to_list()[:5])[0]
+        )
+
     df_topic_keywords["Topics"] = topics
     df_topic_keywords
 
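One caveat with this hunk: average_word2vec returns None when none of a topic's five keywords are in the word2vec vocabulary, so the [0] index would raise a TypeError. A hypothetical defensive variant of the loop (the fallback label is an assumption, not part of the commit):

    topics = []
    for i, row in df_topic_keywords.iterrows():
        result = average_word2vec(row.to_list()[:5])
        # Keep the row's generic index label ('Topic k') when every
        # keyword is out of vocabulary.
        topics.append(result[0] if result is not None else str(i))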
@@ -185,6 +213,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
 
     ax.set_title("% of sarcastic comments for each topic")
     plt.xticks(rotation=70)
+    ax.set_ylim(bottom = 0, top = 1.02)
     plt.legend()
     plt.axhline(0.5, color = 'red', ls=":")
 
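For context, the added set_ylim pins the y-axis to the same 0-1 scale on every run, so the percentage bars stay comparable; the 1.02 ceiling leaves headroom so a full-height bar is not clipped by the axes frame. A toy reproduction (the bar values are made up):

    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.bar(["Topic 0", "Topic 1"], [0.35, 0.72], label="sarcastic share")
    ax.set_title("% of sarcastic comments for each topic")
    plt.xticks(rotation=70)
    ax.set_ylim(bottom=0, top=1.02)  # fixed scale, comparable across runs
    plt.legend()
    plt.axhline(0.5, color='red', ls=":")  # 50% reference line
    fig.savefig("sarcasm_by_topic.png")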
@@ -232,6 +261,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     ax.set_title("% of topics for each subreddit")
     ax.legend(loc="upper right")
     plt.xticks(rotation=50)
+    ax.set_ylim(bottom = 0, top = 1.02)
 
     print('[v] All looking good!')
 
@@ -245,7 +275,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
 
 with gr.Blocks() as demo:
     gr.Markdown("# Dashboard per l'analisi con LDA")
-    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
+    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali (dominant) topic sono più propensi a commenti di tipo sarcastico")
     # gradio.Dataframe(···)
 
     inputs = []
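The only change in this last hunk is the added "(dominant)" in the app's Italian subtitle, which translates as: "The dashboard lets you train an LDA model to check whether and which (dominant) topics are more prone to sarcastic comments."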
data/results extended.csv
ADDED
The diff for this file is too large to render.
data/results.csv
CHANGED
The diff for this file is too large to render.
test.ipynb
CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -241,13 +241,14 @@
       "[5000 rows x 9 columns]"
      ]
     },
-    "execution_count":
+    "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "pd.read_csv('./data/results.csv', index_col=0)"
+   "d = pd.read_csv('./data/results extended.csv', index_col=0)\n",
+   "d"
   ]
  },
  {