DanielSc4 committed on
Commit
65fa4f5
1 Parent(s): 465ab59

Update on data used

Files changed (4)
  1. app.py +34 -4
  2. data/results extended.csv +0 -0
  3. data/results.csv +0 -0
  4. test.ipynb +5 -4
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 import os
 import pandas as pd
 import matplotlib.pyplot as plt
+import gensim.downloader as api
 import numpy as np
 import nltk, spacy, gensim
 from sklearn.decomposition import LatentDirichletAllocation
@@ -11,6 +12,26 @@ from pprint import pprint
 import matplotlib
 matplotlib.use('agg')
 
+print("[x] Downloading word 2 vec")
+model_w2v = api.load("word2vec-google-news-300")
+
+def average_word2vec(word_list: list[str]):
+    # model_w2v = api.load("word2vec-google-news-300")
+    word_vectors = []
+    for word in word_list:
+        if word in model_w2v:
+            word_vectors.append(model_w2v[word])
+    if word_vectors:
+        average_vector = np.mean(word_vectors, axis=0)
+    else:
+        return None
+
+    most_similar_word = model_w2v.similar_by_vector(average_vector, topn=1)
+    word, similarity = most_similar_word[0]
+
+    return word, similarity
+
+
 def concat_comments(*kwargs):
     return ['\n'.join(ele) for ele in zip(*kwargs)]
 
@@ -125,9 +146,16 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
     df_topic_keywords
 
-    topics = [
-        f'Topic {i}' for i in range(len(df_topic_keywords))
-    ]
+    # topics = [
+    #     f'Topic {i}' for i in range(len(df_topic_keywords))
+    # ]
+
+    topics = []
+    for i, row in df_topic_keywords.iterrows():
+        topics.append(
+            average_word2vec(row.to_list()[:5])[0]
+        )
+
     df_topic_keywords["Topics"] = topics
     df_topic_keywords
 
@@ -185,6 +213,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
 
     ax.set_title("% of sarcastic comments for each topic")
     plt.xticks(rotation=70)
+    ax.set_ylim(bottom = 0, top = 1.02)
     plt.legend()
     plt.axhline(0.5, color = 'red', ls=":")
 
@@ -232,6 +261,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     ax.set_title("% of topics for each subreddit")
     ax.legend(loc="upper right")
     plt.xticks(rotation=50)
+    ax.set_ylim(bottom = 0, top = 1.02)
 
     print('[v] All looking good!')
 
@@ -245,7 +275,7 @@ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
 
 with gr.Blocks() as demo:
     gr.Markdown("# Dashboard per l'analisi con LDA")
-    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
+    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali (dominant) topic sono più propensi a commenti di tipo sarcastico")
     # gradio.Dataframe(···)
 
     inputs = []
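The heart of this commit is the new average_word2vec helper: instead of generic "Topic i" labels, each LDA topic is now named after the vocabulary word closest to the centroid of its top five keywords. A minimal stand-alone sketch of that idea, using the same gensim pretrained vectors (label_topic is a hypothetical condensed version of the committed function, the keyword list is made up, and the model download is large on first use):

import numpy as np
import gensim.downloader as api

# Same pretrained embeddings the app now loads at startup.
model_w2v = api.load("word2vec-google-news-300")

def label_topic(keywords: list[str]):
    # Keep only the keywords the embedding vocabulary knows.
    vectors = [model_w2v[w] for w in keywords if w in model_w2v]
    if not vectors:
        return None
    # The vocabulary word nearest the keyword centroid becomes the label.
    centroid = np.mean(vectors, axis=0)
    word, similarity = model_w2v.similar_by_vector(centroid, topn=1)[0]
    return word, similarity

# Hypothetical top-5 keywords for one topic:
print(label_topic(["game", "team", "player", "season", "coach"]))
# prints a (label, cosine similarity) tuple; exact output depends on the model

In practice the nearest neighbour is often one of the keywords themselves, which is usually an acceptable label. One caveat worth knowing: the committed average_word2vec returns None when none of a topic's keywords are in the vocabulary, so the average_word2vec(row.to_list()[:5])[0] call inside get_lda would raise a TypeError for such a topic.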
data/results extended.csv ADDED
The diff for this file is too large to render.
 
data/results.csv CHANGED
The diff for this file is too large to render.
 
test.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -241,13 +241,14 @@
      "[5000 rows x 9 columns]"
     ]
    },
-   "execution_count": 2,
+   "execution_count": 24,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-  "pd.read_csv('./data/results.csv', index_col=0)"
+  "d = pd.read_csv('./data/results extended.csv', index_col=0)\n",
+  "d"
  ]
 },
 {
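Since the notebook now reads the new file, a quick sanity-check sketch (assumes pandas is available and that the 5000 rows x 9 columns shape shown in the cell output still holds):

import pandas as pd

# Path taken from the diff above; note the space in the filename.
d = pd.read_csv('./data/results extended.csv', index_col=0)

# The cell output reports a 5000 x 9 frame; fail fast if that drifts.
assert d.shape == (5000, 9), f'unexpected shape: {d.shape}'
print(d.head())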