Spaces:

karuniaperjuangan
/

sentiment-analysis-distilbert-indo

Runtime error

App Files Files Community

karuniaperjuangan committed on Oct 9, 2022

Commit

83aaeb2

•

1 Parent(s): e72b51f

Upload 4 files

Browse files

Files changed (4) hide show

LICENSE +21 -0
README.md +17 -12
app.py +166 -0
requirements.txt +8 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 karuniaperjuangan
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,12 +1,17 @@
----
-title: Sentiment Analysis
-emoji: 💩
-colorFrom: gray
-colorTo: indigo
-sdk: gradio
-sdk_version: 3.4.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Aplikasi Sentiment Analysis Twitter <br>
+Anggota Kelompok : <br>
+Karunia Perjuangan Mustadl'afin - 20/456368/TK/50498 <br>
+Pramudya Kusuma Hardika - 20/460558/TK/51147 <br>
+Aplikasi ini adalah aplikasi Sentiment Analysis yang bisa digunakan untuk melihat suatu tren yang ada di Twitter. Tahapan yang digunakan untuk melakukan Sentiment Analysis adalah:
+1. Melakukan Scraping tweet-tweet yang sesuai dengan keyword yang diinputkan menggunakan SNScrape
+2. Melakukan Sentiment Analysis setiap tweet dengan bantuan model DistilBERT yang sudah di-fine-tune menggunakan dataset SMSA IndoNLU
+3. Membuat Plot Pie Chart Tren Sentiment Analysis
+Alasan kami memilih Twitter sebagai tempat analisis sentimen adalah cepatnya pembaruan suatu isu di Twitter, yang mendahului jejaring sosial lain seperti Facebook dan LinkedIn.
+Link Slides Presentasi : <br>
+https://www.canva.com/design/DAFOdCxwqwQ/83ebYKLdRWSn2DiBH_9HTQ/edit?utm_content=DAFOdCxwqwQ&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton

app.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import gradio as gr # Untuk UI
+from transformers import pipeline
+import pandas as pd
+from torch.utils.data import Dataset, DataLoader
+import torch
+import gc
+import re
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import snscrape.modules.twitter as sntwitter
+import datetime as dt
+import sys
+import os
def scrape_tweets(query, max_tweets=-1, output_path="./scraper/output/"):
    """Scrape tweets matching ``query`` with snscrape and save them as CSV.

    On Python 3.8+ ten fields are collected per tweet and only rows whose
    ``Language`` equals ``"in"`` are kept. On older interpreters only the
    date, id and text are collected and no language filter is applied.
    (Translated from the original comment: Google Colab runs Python 3.7,
    where only an older, less complete scraping library is available, so
    filtering for Indonesian is not possible there.)

    Args:
        query: Search query handed to ``sntwitter.TwitterSearchScraper``.
        max_tweets: Maximum number of tweets to collect. ``-1`` or ``None``
            means no limit.
        output_path: Directory for the CSV file; created when missing.

    Returns:
        A ``pandas.DataFrame`` with the scraped (and, on Python 3.8+,
        language-filtered) tweets. The same data is written to
        ``<output_path>/<timestamp>-<query>.csv``.
    """
    import itertools

    os.makedirs(output_path, exist_ok=True)

    # Search operators such as "since:2022-01-01" or a "/" inside the query
    # are not valid in a file name, so anything outside a conservative
    # whitelist is replaced before the query is embedded in the path.
    safe_query = re.sub(r"[^\w\-. ]+", "_", str(query))
    timestamp = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = os.path.join(output_path, f"{timestamp}-{safe_query}.csv")

    full_metadata = sys.version_info >= (3, 8)
    if full_metadata:
        columns = ['Datetime', 'Tweet Id', 'Text', 'Username', 'Likes',
                   'Retweets', 'Replies', 'Quotes', 'URL', 'Language']

        def to_row(tweet):
            return [tweet.date, tweet.id, tweet.content, tweet.user.username,
                    tweet.likeCount, tweet.retweetCount, tweet.replyCount,
                    tweet.quoteCount, tweet.url, tweet.lang]
    else:
        print("Using older version of Python")
        columns = ['Datetime', 'Tweet Id', 'Text']

        def to_row(tweet):
            return [tweet.date, tweet.id, tweet.content]

    # A UI number field may deliver None when left empty; treat it like -1.
    if max_tweets is None or int(max_tweets) == -1:
        limit = None
    else:
        limit = max(int(max_tweets), 0)

    tweets_list = []
    try:
        items = sntwitter.TwitterSearchScraper(query).get_items()
        for tweet in tqdm(itertools.islice(items, limit)):
            tweets_list.append(to_row(tweet))
    except KeyboardInterrupt:
        # Keep whatever was collected so far.
        print("Scraping berhenti atas permintaan pengguna")

    df = pd.DataFrame(tweets_list, columns=columns)
    if full_metadata:
        is_indonesian = df["Language"] == "in"
        print("Tweet berbahasa Indonesia :", len(df[is_indonesian]), "/", len(tweets_list))
        # .copy() so callers can add or overwrite columns without pandas
        # warning about writing into a slice of another frame.
        df = df[is_indonesian].copy()

    df.to_csv(output_path, index=False)
    print("Data tweet tersimpan di", output_path)
    return df
def remove_unnecessary_char(text):
    """Remove placeholder markers from ``text``.

    Each literal ``[USERNAME]``, ``[URL]`` and ``[SENSITIVE-NO]`` marker is
    replaced by a single space, then every run of two or more spaces is
    collapsed into one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings: "\[" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about; the patterns are unchanged.
    for marker in (r"\[USERNAME\]", r"\[URL\]", r"\[SENSITIVE-NO\]"):
        text = re.sub(marker, " ", text)
    return re.sub(r"  +", " ", text)
def preprocess_tweet(text):
    """Strip tweet-specific noise from ``text``.

    In order: newlines become spaces, a leading run of ``@mentions`` is
    replaced by one space, every remaining ``@mention`` is replaced by a
    space, URLs are replaced by a space, ``/`` becomes a space, and finally
    runs of two or more spaces are collapsed into one.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. Case and punctuation other than ``/`` are kept.
    """
    # All patterns are raw strings: sequences such as "\@", "\w" or "\s" in a
    # normal literal are invalid escapes that newer Python versions warn on.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'^(\@\w+ ?)+', ' ', text)   # mentions at the very start
    text = re.sub(r'\@\w+', ' ', text)         # mentions anywhere else
    # NOTE(review): the third alternative ("http?://") also matches "htt://";
    # kept exactly as in the original pattern.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    text = re.sub(r'/', ' ', text)
    return re.sub(r'  +', ' ', text)
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z and A-Z with one space."""
    return re.sub('[^0-9a-zA-Z]+', ' ', text)
def preprocess_text(text):
    """Run the full cleaning chain on ``text`` and lowercase the result.

    The steps are applied in this order: ``preprocess_tweet``,
    ``remove_unnecessary_char``, ``remove_nonaplhanumeric``, ``str.lower``.
    """
    cleaned = text
    for step in (preprocess_tweet, remove_unnecessary_char, remove_nonaplhanumeric):
        cleaned = step(cleaned)
    return cleaned.lower()
# Text-classification pipeline used by the rest of this file.
# Device index 0 is selected when CUDA is available, otherwise -1.
predict = pipeline(
    task='text-classification',
    model='karuniaperjuangan/smsa-distilbert-indo',
    device=-1 if not torch.cuda.is_available() else 0,
)
def analyze_df_sentiment(df, batch_size):
    """Classify every entry of ``df["Text"]`` and attach the results to ``df``.

    The texts are converted to ``str``, cut into consecutive slices of
    ``batch_size`` items, and each slice is passed to the module-level
    ``predict`` pipeline.

    Args:
        df: DataFrame with a ``"Text"`` column. It is modified in place:
            ``"Label"`` and ``"Score"`` columns are added or overwritten.
        batch_size: Number of texts sent to ``predict`` per call. Converted
            with ``int()``; must be at least 1.

    Returns:
        The same ``df`` object, now carrying ``"Label"`` and ``"Score"``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    batch_size = int(batch_size)
    if batch_size < 1:
        # A negative step would silently yield no batches and then fail with
        # an unrelated length-mismatch error on column assignment.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    texts = df["Text"].astype(str).tolist()
    predictions = []
    for start in tqdm(range(0, len(texts), batch_size)):
        predictions.extend(predict(texts[start:start + batch_size]))

    df["Label"] = [pred["label"] for pred in predictions]
    df["Score"] = [pred["score"] for pred in predictions]
    return df
def keyword_analyzer(keyword, max_tweets, batch_size=16):
    """Scrape tweets for ``keyword``, classify them and build a pie chart.

    Args:
        keyword: Search query forwarded to ``scrape_tweets``.
        max_tweets: Tweet limit forwarded to ``scrape_tweets``.
        batch_size: Batch size forwarded to ``analyze_df_sentiment``.

    Returns:
        A tuple ``(fig, table)``: ``fig`` is a 6x6 inch Matplotlib figure
        with a pie chart of the number of tweets per label, and ``table`` is
        a DataFrame with the columns ``"Text"``, ``"Label"`` and ``"Score"``.
        When no tweets are available the figure shows a short message
        instead of a pie chart.
    """
    from matplotlib.figure import Figure

    print("Scraping tweets...")
    # Work on a private copy so writing the cleaned text never targets a
    # filtered slice of another frame.
    df = scrape_tweets(keyword, max_tweets=max_tweets).copy()
    df["Text"] = df["Text"].apply(preprocess_text)
    print("Analyzing sentiment...")
    df = analyze_df_sentiment(df, batch_size=batch_size)

    # Build the figure directly instead of through pyplot: pyplot's
    # "current figure" is global state and keeps a reference to every figure
    # it creates until it is explicitly closed.
    fig = Figure(figsize=(6, 6))
    ax = fig.subplots()
    label_counts = df.groupby(["Label"])["Text"].count()
    if label_counts.empty:
        ax.text(0.5, 0.5, "Tidak ada tweet yang ditemukan", ha="center", va="center")
        ax.axis("off")
    else:
        label_counts.plot.pie(ax=ax, autopct="%.1f%%")
    return fig, df[["Text", "Label", "Score"]]
def _classify_single_tweet(Tweet):
    """Return ``(label, score)`` of the top prediction for one text.

    Runs the ``predict`` pipeline once and reads both values from the same
    result instead of running inference separately for each output.
    """
    # NOTE(review): the parameter keeps the capitalised name "Tweet" from the
    # original lambda because Gradio appears to use it as the input label -
    # confirm before renaming.
    top_prediction = predict(Tweet)[0]
    return top_prediction['label'], top_prediction['score']


# UI layout: a "Trend/Keyword" tab wired to keyword_analyzer and a
# "Single Tweet" tab that classifies one piece of text.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 style="text-align:center">Aplikasi Sentiment Analysis Keyword Twitter </h1>""")
    gr.Markdown(
        """
        Aplikasi ini digunakan untuk melakukan sentimen analisis terhadap data di Twitter menggunakan model DistilBERT. Terdapat 2 mode yang dapat digunakan:
        1. Trend/Keyword: Untuk melakukan analisis terhadap semua tweet yang mengandung keyword yang diinputkan
        2. Tweet: Untuk melakukan analisis terhadap sebuah tweet yang diinputkan
        """
        )
    with gr.Tab("Trend/Keyword"):
        gr.Markdown("""Masukkan keyword dan jumlah maksimum tweet yang ingin diambil""")
        with gr.Blocks():
            with gr.Row():
                with gr.Column():
                    keyword_textbox = gr.Textbox(lines=1, label="Keyword")
                    max_tweets_component = gr.Number(value=-1, label="Tweet Maksimal yang akan discrape (-1 jika ingin mengscrape semua tweet)", precision=0)
                    batch_size_component = gr.Number(value=16, label="Batch Size (Semakin banyak semakin cepat, tetapi semakin boros memori)", precision=0)
                    button = gr.Button("Submit")
                plot_component = gr.Plot(label="Pie Chart")
            dataframe_component = gr.DataFrame(type="pandas",
                                               label="Dataframe",
                                               max_rows=(20,'fixed'),
                                               overflow_row_behaviour='paginate',
                                               wrap=True)
    with gr.Tab("Single Tweet"):
        gr.Interface(_classify_single_tweet,
                     "textbox",
                     ["label", "label"],
                     allow_flagging='never',
                     )
    gr.Markdown(
            """
            Space ini merupakan tugas NLP dari mata kuliah Pemrosesan Bahasa Alami yang diampu oleh Bapak Syukron Abu Ishaq Alfarozi.
            ## Anggota Kelompok
            - Karunia Perjuangan Mustadl'afin - 20/456368/TK/50498
            - Pramudya Kusuma Hardika - 20/460558/TK/51147
            """
        )
    # The button feeds the three inputs to keyword_analyzer and routes its
    # (figure, dataframe) result to the plot and table components.
    button.click(keyword_analyzer,
                inputs=[keyword_textbox, max_tweets_component, batch_size_component],
                outputs=[plot_component, dataframe_component])
demo.launch(inbrowser=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+snscrape
+tqdm
+transformers
+pandas
+torch
+matplotlib
+gradio
+numpy