karuniaperjuangan committed
Commit 83aaeb2 • 1 Parent(s): e72b51f

Upload 4 files

Files changed (4)
  1. LICENSE +21 -0
  2. README.md +17 -12
  3. app.py +166 -0
  4. requirements.txt +8 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 karuniaperjuangan
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,17 @@
- ---
- title: Sentiment Analysis
- emoji: 💩
- colorFrom: gray
- colorTo: indigo
- sdk: gradio
- sdk_version: 3.4.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Twitter Sentiment Analysis App
+
+ Group members: <br>
+ Karunia Perjuangan Mustadl'afin - 20/456368/TK/50498 <br>
+ Pramudya Kusuma Hardika - 20/460558/TK/51147 <br>
+
+ This is a sentiment analysis app for exploring trends on Twitter. Sentiment analysis proceeds in three steps (a minimal sketch of these steps follows this file):
+
+ 1. Scrape the tweets matching the given keyword using SNScrape
+ 2. Run sentiment analysis on each tweet with a DistilBERT model fine-tuned on the SMSA IndoNLU dataset
+ 3. Plot the sentiment trend as a pie chart
+
+ We chose Twitter as the place to analyze sentiment because issues surface there faster than on other social networks such as Facebook and LinkedIn.
+
+ Presentation slides: <br>
+ https://www.canva.com/design/DAFOdCxwqwQ/83ebYKLdRWSn2DiBH_9HTQ/edit?utm_content=DAFOdCxwqwQ&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton
+
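As a compact illustration of the three README steps, here is a minimal sketch using the same libraries that app.py (below) relies on. The keyword "pemilu" and the 50-tweet cap are arbitrary placeholders, and snscrape's Twitter module is assumed to still be able to reach Twitter:

```python
import snscrape.modules.twitter as sntwitter  # step 1: scraping
from transformers import pipeline             # step 2: classification
import pandas as pd
import matplotlib.pyplot as plt               # step 3: plotting

# Step 1: collect up to 50 tweets matching a keyword ("pemilu" is a placeholder).
tweets = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper("pemilu").get_items()):
    if i >= 50:
        break
    tweets.append(tweet.content)

# Step 2: classify each tweet with the fine-tuned DistilBERT checkpoint.
classify = pipeline("text-classification", model="karuniaperjuangan/smsa-distilbert-indo")
labels = [pred["label"] for pred in classify(tweets)]

# Step 3: plot the label distribution as a pie chart.
pd.Series(labels).value_counts().plot.pie(autopct="%.1f%%")
plt.show()
```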
app.py ADDED
@@ -0,0 +1,166 @@
+ import gradio as gr  # UI framework
+ from transformers import pipeline
+ import pandas as pd
+ import torch
+ import re
+ from tqdm import tqdm
+ import matplotlib.pyplot as plt
+ import snscrape.modules.twitter as sntwitter
+ import datetime as dt
+ import sys
+ import os
+
+ def scrape_tweets(query, max_tweets=-1, output_path="./scraper/output/"):
+     if not os.path.exists(output_path):
+         os.makedirs(output_path)
+     output_path = os.path.join(output_path, dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "-" + str(query) + ".csv")
+
+     tweets_list = []
+     if sys.version_info.minor >= 8:
+         try:
+             for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper(query).get_items())):
+                 if max_tweets != -1 and i >= int(max_tweets):
+                     break
+                 tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.likeCount, tweet.retweetCount, tweet.replyCount, tweet.quoteCount, tweet.url, tweet.lang])
+         except KeyboardInterrupt:
+             print("Scraping stopped at the user's request")
+
+         df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Likes', 'Retweets', 'Replies', 'Quotes', 'URL', 'Language'])
+         print("Indonesian-language tweets:", len(df[df["Language"] == "in"]), "/", len(tweets_list))
+         df = df[df["Language"] == "in"]
+     # Google Colab runs Python 3.7, whose older snscrape release lacks the fields
+     # above, so on older interpreters we cannot filter for Indonesian tweets
+     else:
+         print("Using older version of Python")
+         try:
+             for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper(query).get_items())):
+                 if max_tweets != -1 and i >= int(max_tweets):
+                     break
+                 tweets_list.append([tweet.date, tweet.id, tweet.content])
+         except KeyboardInterrupt:
+             print("Scraping stopped at the user's request")
+         df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text'])
+
+     df.to_csv(output_path, index=False)
+     print("Tweet data saved to", output_path)
+     return df
+
+ def remove_unnecessary_char(text):
+     text = re.sub(r"\[USERNAME\]", " ", text)
+     text = re.sub(r"\[URL\]", " ", text)
+     text = re.sub(r"\[SENSITIVE-NO\]", " ", text)
+     text = re.sub(' +', ' ', text)
+     return text
+
+ def preprocess_tweet(text):
+     text = re.sub('\n', ' ', text)  # Remove every '\n'
+     # text = re.sub('rt', ' ', text)  # Remove every retweet symbol
+     text = re.sub(r'^(\@\w+ ?)+', ' ', text)  # Remove leading mentions
+     text = re.sub(r'\@\w+', ' ', text)  # Remove every username
+     text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)  # Remove every URL
+     text = re.sub('/', ' ', text)
+     # text = re.sub(r'[^\w\s]', '', text)
+     text = re.sub(' +', ' ', text)  # Remove extra spaces
+     return text
+
+ def remove_nonalphanumeric(text):
+     text = re.sub('[^0-9a-zA-Z]+', ' ', text)
+     return text
+
+ def preprocess_text(text):
+     text = preprocess_tweet(text)
+     text = remove_unnecessary_char(text)
+     text = remove_nonalphanumeric(text)
+     text = text.lower()
+     return text
+
+ predict = pipeline('text-classification',
+                    model='karuniaperjuangan/smsa-distilbert-indo',
+                    device=0 if torch.cuda.is_available() else -1)
+
+ def analyze_df_sentiment(df, batch_size):
+     text_list = list(df["Text"].astype(str).values)
+     text_list_batches = [text_list[i:i+batch_size] for i in range(0, len(text_list), batch_size)]  # Split the texts into chunks of batch_size via list slicing
+
+     predictions = []
+     for batch in tqdm(text_list_batches):
+         batch_predictions = predict(batch)
+         predictions += batch_predictions
+     df["Label"] = [pred["label"] for pred in predictions]
+     df["Score"] = [pred["score"] for pred in predictions]
+
+     return df
+
+ def keyword_analyzer(keyword, max_tweets, batch_size=16):
+     print("Scraping tweets...")
+     df = scrape_tweets(keyword, max_tweets=max_tweets)
+     df["Text"] = df["Text"].apply(preprocess_text)
+     print("Analyzing sentiment...")
+     df = analyze_df_sentiment(df, batch_size=batch_size)
+     fig = plt.figure()
+     df.groupby(["Label"])["Text"].count().plot.pie(autopct="%.1f%%", figsize=(6, 6))
+     return fig, df[["Text", "Label", "Score"]]
+
+
+ with gr.Blocks() as demo:
+
+     gr.Markdown("""<h1 style="text-align:center">Twitter Keyword Sentiment Analysis App</h1>""")
+
+     gr.Markdown(
+         """
+         This app performs sentiment analysis on Twitter data using a DistilBERT model. Two modes are available:
+         1. Trend/Keyword: analyze every tweet that contains the given keyword
+         2. Tweet: analyze a single tweet entered by the user
+         """
+     )
+     with gr.Tab("Trend/Keyword"):
+         gr.Markdown("""Enter a keyword and the maximum number of tweets to retrieve""")
+         with gr.Blocks():
+             with gr.Row():
+                 with gr.Column():
+                     keyword_textbox = gr.Textbox(lines=1, label="Keyword")
+                     max_tweets_component = gr.Number(value=-1, label="Maximum number of tweets to scrape (-1 to scrape all tweets)", precision=0)
+                     batch_size_component = gr.Number(value=16, label="Batch size (larger is faster but uses more memory)", precision=0)
+                     button = gr.Button("Submit")
+
+                 plot_component = gr.Plot(label="Pie Chart")
+             dataframe_component = gr.DataFrame(type="pandas",
+                                                label="Dataframe",
+                                                max_rows=20,
+                                                overflow_row_behaviour='paginate',
+                                                wrap=True)
+
+     with gr.Tab("Single Tweet"):
+         gr.Interface(lambda Tweet: (predict(Tweet)[0]['label'], predict(Tweet)[0]['score']),
+                      "textbox",
+                      ["label", "label"],
+                      allow_flagging='never',
+                      )
+
+     gr.Markdown(
+         """
+         This Space is the NLP assignment for the Natural Language Processing (Pemrosesan Bahasa Alami) course taught by Mr. Syukron Abu Ishaq Alfarozi.
+
+         ## Group Members
+
+         - Karunia Perjuangan Mustadl'afin - 20/456368/TK/50498
+
+         - Pramudya Kusuma Hardika - 20/460558/TK/51147
+
+         """
+     )
+
+     button.click(keyword_analyzer,
+                  inputs=[keyword_textbox, max_tweets_component, batch_size_component],
+                  outputs=[plot_component, dataframe_component])
+
+ demo.launch(inbrowser=True)
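The keyword pipeline can also be driven without the Gradio UI. Note that app.py calls `demo.launch()` at module level, so for a quick test you would paste the function definitions into a notebook (or guard the launch behind `if __name__ == "__main__":`) before calling them; a hedged sketch, with an illustrative keyword and tweet cap:

```python
# Assumes scrape_tweets, preprocess_text, analyze_df_sentiment, and
# keyword_analyzer from app.py are already defined in the session.
fig, results = keyword_analyzer("banjir", max_tweets=100, batch_size=16)  # placeholder keyword/cap
print(results.head())             # per-tweet Text, Label, Score
fig.savefig("sentiment_pie.png")  # persist the pie chart
```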
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ snscrape
+ tqdm
+ transformers
+ pandas
+ torch
+ matplotlib
+ gradio
+ numpy
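To run the Space locally, install these dependencies with `pip install -r requirements.txt` and start the app with `python app.py`; since app.py ends with `demo.launch(inbrowser=True)`, the Gradio UI opens in a browser tab.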