File size: 14,787 Bytes
d35f894
 
 
 
 
 
 
 
 
 
4a3ec7a
 
00f20fe
d35f894
4a3ec7a
 
 
e6b9234
 
d35f894
222b841
d35f894
 
 
 
 
 
 
 
 
 
 
 
 
 
24f9f13
 
 
 
d35f894
 
24f9f13
d35f894
 
24f9f13
d35f894
 
24f9f13
d35f894
 
24f9f13
d35f894
 
24f9f13
41457e8
15b606e
 
a8fe0b7
d35f894
 
 
f45cfdc
d35f894
 
 
 
 
895d96f
c270e28
 
 
d35f894
 
 
 
 
222b841
d35f894
 
 
 
 
1d246f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d35f894
e596d85
d35f894
 
 
 
a4e3f31
d35f894
 
1d246f3
 
eaab897
 
0c9cbe9
eaab897
0c9cbe9
 
eaab897
 
 
 
 
 
 
 
 
 
 
 
d35f894
e596d85
 
d35f894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d246f3
 
 
 
 
1c314bc
 
 
1d246f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d35f894
 
 
 
 
 
 
 
 
222b841
 
 
1d246f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b9500e
5d34b59
1d246f3
 
5d34b59
1b9500e
5d34b59
1b9500e
1d246f3
 
 
d35f894
 
222b841
944ad8c
d35f894
 
 
 
 
 
 
1d246f3
 
7307017
1d246f3
 
 
 
 
 
 
222b841
d35f894
d0ba2db
d35f894
 
 
 
 
 
90c6af4
7395a16
1d246f3
7307017
1d246f3
 
 
 
 
 
 
 
 
7307017
1d246f3
 
 
 
 
 
 
222b841
 
d35f894
 
944ad8c
 
4b2e55f
944ad8c
1d246f3
222b841
 
 
 
 
926f837
1d246f3
 
222b841
1d246f3
 
 
 
 
 
 
 
 
 
 
5d34b59
1b9500e
5d34b59
 
31d2328
 
 
8c6dc8e
00f20fe
 
8c6dc8e
00f20fe
222b841
 
 
1d246f3
222b841
d35f894
002bcee
1d246f3
002bcee
222b841
002bcee
 
222b841
 
d35f894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387

import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Impor library tambahan
#import matplotlib.pyplot as plt
#import seaborn as sns
#import plotly.express as px
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
#from transformers import pipeline


# Clean raw tweet text with a sequence of regular-expression passes
@st.cache_data
def clean_text(text):
    """Normalise one raw tweet into lowercase ASCII words.

    The passes run in a fixed order: non-ASCII characters, URLs, mentions,
    hashtags, HTML entities, punctuation and digits are removed; runs of
    spaces are collapsed; the text is stripped and lower-cased; finally any
    word character repeated three or more times in a row is reduced to one.

    Args:
        text: The raw text. ``None`` and NaN (an empty spreadsheet cell)
            are treated as empty text; any other non-string is converted
            with ``str``.

    Returns:
        The cleaned string, possibly empty.
    """
    # Empty spreadsheet cells arrive as None/NaN; re.sub would raise on them
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Step 1: remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Step 2: remove URLs (up to the next whitespace, so that characters such
    # as '-' or '~' inside a URL do not leave fragments behind)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'pic\.twitter\.com\S*', '', text)

    # Step 3: remove mentions
    text = re.sub(r'@[\w]+', '', text)

    # Step 4: remove hashtags
    text = re.sub(r'#([\w]+)', '', text)

    # Step 5: remove the HTML entities '&amp;' and '&gt;' as a whole, so that
    # no stray "amp"/"gt" words are left once the symbols are stripped
    text = re.sub(r'&(?:amp|gt);?', '', text)

    # Step 6: remove special characters (symbols)
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)

    # Step 7: remove digits
    text = re.sub(r'[0-9]+', '', text)

    # Step 8: collapse repeated spaces into a single space
    text = re.sub(' +', ' ', text)

    # Step 9: strip leading and trailing whitespace
    text = text.strip()

    # Step 10: convert to lowercase
    text = text.lower()

    # Step 11: reduce a character repeated three or more times (e.g. "yukkk")
    # to a single occurrence
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    return text

# Load Salsabila's colloquial-Indonesian (slang) lexicon
kamus_path = '_json_colloquial-indonesian-lexicon (1).txt'  # Replace with the correct path
# The encoding is given explicitly: the files hold JSON text, and the default
# encoding of open() depends on the platform locale
with open(kamus_path, encoding='utf-8') as f:
    data = f.read()
lookp_dict = json.loads(data)

# Custom slang dictionary with entries that are not in Salsabila's lexicon
kamus_sendiri_path = 'kamus_gaul_custom.txt'
with open(kamus_sendiri_path, encoding='utf-8') as f:
    kamus_sendiri = f.read()
kamus_gaul_baru = json.loads(kamus_sendiri)

# Merge the custom entries into the main lexicon (custom entries win on conflict)
lookp_dict.update(kamus_gaul_baru)

# Replace slang words through a lookup dictionary
@st.cache_data
def normalize_slang(text, slang_dict):
    """Return *text* with each whitespace-separated word looked up in *slang_dict*.

    A word that is a key of *slang_dict* is replaced by the mapped value;
    every other word is kept unchanged. Words are re-joined with one space.
    """
    tokens = text.split()
    return ' '.join(slang_dict.get(token, token) for token in tokens)

#---------------------------------------------------NLTK Remove Stopwords----------------------------------------------------------------------

# Initialise the Indonesian stop-word set. The corpus is downloaded only when
# it is not installed yet, so that a Streamlit rerun of this script does not
# call the downloader every time.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
stop_words = set(stopwords.words("indonesian"))

def remove_stopwords(text, stop_words):
    """Drop every whitespace-separated word of *text* that is in *stop_words*.

    The remaining words keep their order and are joined with one space.
    """
    kept_words = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_words)
#---------------------------------------------------TFIDF----------------------------------------------------------------------  
# TF-IDF feature extraction
def extract_tfidf_features(texts, tfidf_vectorizer):
    """Return whatever ``tfidf_vectorizer.transform(texts)`` produces."""
    return tfidf_vectorizer.transform(texts)

# Load the TF-IDF model with joblib (make sure the path is correct)
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
tfidf_model_path = 'X_tfidf_model.joblib'
tfidf_vectorizer = joblib.load(tfidf_model_path)

#---------------------------------------------------Milih Model----------------------------------------------------------------------

# Load the model that corresponds to the user's choice
def select_sentiment_model(selected_model):
    """Load and return the joblib model file registered for *selected_model*.

    Any name that is not registered falls back to the ensemble model.
    """
    default_path = 'ensemble_clf_soft_smote.joblib'
    path_by_name = {
        "Ensemble": default_path,
        "Random Forest": 'best_rf_model_smote.joblib',
        "Naive Bayes": 'naive_bayes_model_smote.joblib',
        "Logistic Regression": 'logreg_model_smote.joblib',
    }
    # Unknown choices use the default model
    chosen_path = path_by_name.get(selected_model, default_path)
    return joblib.load(chosen_path)


# Predict the sentiment of a single text
def predict_sentiment(text, model, tfidf_vectorizer, slang_dict):
    """Return the sentiment label ("Negatif", "Netral" or "Positif") of *text*.

    Args:
        text: Raw input text.
        model: Object whose ``predict`` accepts the vectorizer output and
            returns one class index (0, 1 or 2) per input row.
        tfidf_vectorizer: Object whose ``transform`` accepts a list of strings.
        slang_dict: Mapping handed to ``normalize_slang``.

    Raises:
        KeyError: If the predicted class index is not 0, 1 or 2.
    """
    # Step 1: clean the text and normalise slang
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, slang_dict)

    # Step 2: TF-IDF features for this single document
    tfidf_matrix = tfidf_vectorizer.transform([norm_slang_text])

    # Step 3: predict; the result holds one entry because one row was passed
    sentiment = model.predict(tfidf_matrix)

    # Step 4: map the class index to its label. The single element is indexed
    # explicitly: calling int() on a whole 1-element array is deprecated in NumPy.
    labels = {0: "Negatif", 1: "Netral", 2: "Positif"}
    return labels[int(sentiment[0])]

def get_emoticon(sentiment):
    """Return the emoji that represents a sentiment label.

    "Positif" maps to a grinning face, "Negatif" to a disappointed face and
    every other value (including "Netral") to a neutral face.

    The emoji are written as Unicode escapes so that the source file stays
    correct even if it is re-saved with a different encoding.
    """
    if sentiment == "Positif":
        return "\U0001F604"  # grinning face with smiling eyes
    if sentiment == "Negatif":
        return "\U0001F61E"  # disappointed face
    return "\U0001F610"  # neutral face

def buat_chart(df, target_year):
    """Draw a bar chart of sentiment-label counts per month for one year.

    Args:
        df: DataFrame with a 'Date' column (anything ``pd.to_datetime``
            accepts) and a 'label' column holding "Negatif", "Netral" or
            "Positif". The frame is not modified.
        target_year: The year to plot, as an int or a numeric string such
            as "2023".

    Returns:
        None. The chart, or a warning when the year has no rows, is written
        to the Streamlit page.

    Raises:
        KeyError: If a label has no colour assigned in this function.
    """
    st.write(f"Bar Chart Tahun {target_year}:")

    # The year may arrive as text (e.g. from a select box); it has to be an
    # integer to be comparable with the integer year of the dates.
    try:
        year = int(target_year)
    except (TypeError, ValueError):
        st.warning(f"Tidak ada data untuk tahun {target_year}.")
        return

    # Work on a filtered copy so the caller's frame is left untouched
    dates = pd.to_datetime(df['Date'])
    in_year = dates.dt.year == year
    df_filtered = df.loc[in_year].copy()

    # Nothing to plot for this year
    if df_filtered.empty:
        st.warning(f"Tidak ada data untuk tahun {target_year}.")
        return

    # Month names in calendar order, each suffixed with the year
    nama_bulan = [
        'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni',
        'Juli', 'Agustus', 'September', 'Oktober', 'November', 'Desember',
    ]
    months_order = [f'{nama} {target_year}' for nama in nama_bulan]

    # Ordered categorical, so months sort by calendar position and not by name
    month_numbers = dates.loc[in_year].dt.month
    df_filtered['month'] = pd.Categorical(
        [months_order[number - 1] for number in month_numbers],
        categories=months_order,
        ordered=True,
    )

    # Colour assigned to each sentiment label
    warna_label = {
        'Negatif': '#FF9AA2',
        'Netral': '#FFDAC1',
        'Positif': '#B5EAD7'
    }

    # Sorted like the columns produced by unstack(), so colours line up
    unique_label = sorted(df_filtered['label'].unique())

    # observed=False keeps months without any row in the table
    counts = (
        df_filtered.groupby(['month', 'label'], observed=False)
        .size()
        .unstack()
        .fillna(0)
    )
    st.bar_chart(
        counts,
        color=[warna_label[label] for label in unique_label]
    )
    
# Build an HTML download link for a DataFrame
def get_table_download_link(df, download_format):
    """Return an ``<a>`` tag that downloads *df* as XLSX or CSV.

    The file content is embedded in the link as a base64 ``data:`` URI, so
    nothing is written to disk and the link works without a file server.

    Args:
        df: The DataFrame to export (without its index).
        download_format: "XLSX" for an Excel workbook; any other value
            produces CSV.

    Returns:
        The HTML anchor element as a string.
    """
    # Imported here because the module does not import them at the top
    from base64 import b64encode
    from io import BytesIO

    if download_format == "XLSX":
        buffer = BytesIO()
        df.to_excel(buffer, index=False)
        payload = b64encode(buffer.getvalue()).decode()
        mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return f'<a href="data:{mime};base64,{payload}" download="hasil_sentimen.xlsx">Unduh File XLSX</a>'

    csv = df.to_csv(index=False)
    return f'<a href="data:file/csv;base64,{b64encode(csv.encode()).decode()}" download="hasil_sentimen.csv">Unduh File CSV</a>'


# Title
st.title("Analisis Sentimen Based on Tweets Biskita Transpakuan")
#-----------------------------------------------------General Settings---------------------------------------------------------------
with st.expander("General Settings :"):
    # Widget for choosing the model
    # NOTE(review): "Transformer" has no model file registered in
    # select_sentiment_model, so it loads the ensemble model; "Random Forest"
    # is registered there but not offered here.
    selected_model = st.selectbox("Pilih Model Sentimen:", ("Ensemble", "Naive Bayes", "Logistic Regression", "Transformer"))

    # Load the sentiment model for the user's choice
    sentiment_model = select_sentiment_model(selected_model)

    # Manual text input or an uploaded XLSX file
    input_option = st.radio("Pilih metode input:", ("Teks Manual", "Unggah Berkas XLSX"))

    if input_option == "Teks Manual":
        # Free-text input from the user
        user_input = st.text_area("Masukkan teks:", "")
    else:
        # XLSX file input
        uploaded_file = st.file_uploader("Unggah berkas XLSX", type=["xlsx"])
        st.write("**Pastikan berkas XLSX Anda memiliki kolom yang bernama 'Text'.**")

        if uploaded_file is not None:
            df = pd.read_excel(uploaded_file)

            if 'Text' not in df.columns:
                st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")
            else:
                # Rows without text cannot be analysed; dropping them here keeps
                # the 'Text' and 'Date' columns aligned row by row
                df = df.dropna(subset=['Text'])
                texts = df['Text'].astype(str)
                if texts.empty:
                    st.warning("Kolom 'Text' harus mempunyai value.")

            if "Date" in df.columns and not df['Date'].empty:
                dates = df['Date']
                # Parse the dates instead of slicing strings, so both text and
                # datetime columns work and the years offered are integers
                parsed_years = pd.to_datetime(df['Date'], errors='coerce').dt.year.dropna()
                available_years = sorted({int(year) for year in parsed_years})
                target_year = st.selectbox("Pilih Tahun Bar Chart :", available_years)
#-----------------------------------------------------Preference Settings--------------------------------------------------
with st.expander ("Preference Settings :"):
    colormap = st.selectbox("Pilih Warna Wordclouds :", ["Greys", "Purples", "Blues", "Greens", "Oranges", "Reds", "YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu", "GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn"])
# Sentiment analysis
def _analyze_text(text):
    """Run the full pipeline on one text and return its result row as a dict."""
    sentiment_label = predict_sentiment(text, sentiment_model, tfidf_vectorizer, lookp_dict)
    cleaned_text = clean_text(text)
    norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
    return {
        'Text': text,
        'cleaned-text': cleaned_text,
        'normalisasi-text': norm_slang_text,
        'stopwords-remove': remove_stopwords(norm_slang_text, stop_words),
        'label': sentiment_label,
        'emotikon': get_emoticon(sentiment_label),
    }


results = []
analisis = False
if st.button("Analysis") and input_option == "Teks Manual" and user_input:
    # Each non-blank line of the input is analysed as a separate text
    for text in user_input.split('\n'):
        if text.strip():
            results.append(_analyze_text(text))
    analisis = bool(results)

elif input_option == "Unggah Berkas XLSX" and uploaded_file is not None:
    if 'Text' in df.columns:
        # Pair every text with the date of the SAME row (one result per row)
        row_dates = df['Date'].tolist() if 'Date' in df.columns else None
        for position, text in enumerate(df['Text'].tolist()):
            # Skip rows whose text cell is empty
            if pd.isna(text):
                continue
            row = _analyze_text(str(text))
            if row_dates is not None:
                row = {'Date': row_dates[position], **row}
            results.append(row)
        analisis = bool(results)

    else:
        st.warning("Berkas XLSX harus memiliki kolom bernama 'Text' untuk analisis sentimen.")

        
# The icon is written as an escape (information-source emoji) so it survives
# re-encoding of the source file
st.info('Tekan "Analysis" kembali jika tampilan menghilang', icon="\u2139\ufe0f")
if results and analisis:
    df_results = pd.DataFrame(results)
    # Split the view into two columns
    columns = st.columns(2)

    # First column: word cloud
    with columns[0]:
        st.write("Wordclouds :")
        all_texts = " ".join(
            result['stopwords-remove']
            for result in results
            if result['stopwords-remove'] is not None and not pd.isna(result['stopwords-remove'])
        )

        wordcloud = None
        if all_texts.strip():
            try:
                wordcloud = WordCloud(width=800, height=660, background_color='white',
                                      colormap=colormap,      # Font colours
                                      contour_color='black',  # Contour colour
                                      contour_width=2,        # Contour width
                                      mask=None,              # No custom shape mask
                                      ).generate(all_texts)
            except ValueError:
                # WordCloud raises ValueError when the text yields no words
                wordcloud = None

        if wordcloud is not None:
            st.image(wordcloud.to_array())
        else:
            st.write("Tidak ada data untuk ditampilkan dalam Word Cloud.")

    # Second column: bar chart (per month when dates are available)
    with columns[1]:
        if 'Date' in df_results.columns:
            # A copy is passed so the table and CSV below keep the original columns
            buat_chart(df_results.copy(), target_year)
        else:
            st.write("Bar Chart :")
            st.bar_chart(
                df_results["label"].value_counts()
            )

    # Show the analysis results in an expandable box
    with st.expander("Hasil Analisis Sentimen"):
        st.write(df_results)

    # CSV download button for the same results
    st.download_button(
        label="Unduh CSV",
        data=df_results.to_csv(index=False),
        key="csv_download",
        file_name="hasil_sentimen.csv",
    )


# Divider line
st.divider()

# Profile links, each rendered as "<site>: [url](url)" markdown
github_link = "https://github.com/naufalnashif/"
instagram_link = "https://www.instagram.com/naufal.nashif/"
for site_name, site_url in (("GitHub", github_link), ("Instagram", instagram_link)):
    st.markdown(f"{site_name}: [{site_url}]({site_url})")

# Closing message
for closing_line in ('Thank you for trying the demo!', 'Best regards, Naufal Nashif'):
    st.write(closing_line)