Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- .gitattributes +1 -0
- app.py +29 -0
- eda.py +67 -0
- eda_preprocessing.csv +3 -0
- prediction.py +115 -0
- requirements.txt +10 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
best_model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
best_model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
36 |
+
eda_preprocessing.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import eda
|
3 |
+
import prediction
|
4 |
+
|
5 |
+
# Page configuration: browser-tab title and wide layout.
# The title previously read 'Churn Prediction', which does not match this
# app (its pages are tweet prediction and EDA on cyberbullying tweets).
st.set_page_config(
    page_title='Cyberbullying Tweet Prediction',
    layout='wide',
)

# Hide Streamlit's default main menu and footer via injected CSS.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Sidebar: author heading, page selector and logo image.
st.sidebar.markdown("# Evan Derin Ihsanudin - RMT-FTDS-17")
navigation = st.sidebar.selectbox(
    'Pilih Halaman (Tweet Prediction/EDA): ',
    ('Tweet Prediction', 'Exploratory Data Analysis'),
)
st.sidebar.image("https://imgur.com/MmPULSL.png", use_column_width=True)

# Render the page module that matches the sidebar selection.
if navigation == 'Tweet Prediction':
    prediction.run()
else:
    eda.run()
|
eda.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import plotly.express as px
|
6 |
+
from PIL import Image
|
7 |
+
|
8 |
+
def run():
    """Render the Exploratory Data Analysis page.

    Reads ``eda_preprocessing.csv`` from the working directory, draws the
    distribution of the ``cyberbullying_type`` column as a bar chart and a
    pie chart on one figure, then shows one remote image per tweet category.
    """
    # Page title and introduction.
    st.markdown("<h1 style='text-align: center;'>Exploratory Data Analysis</h1>", unsafe_allow_html=True)
    st.write('Berikut adalah EDA dan Wordcloud dari Setiap Kategori Tweet')

    # Load the dataframe used for the distribution plots.
    df_eda = pd.read_csv('eda_preprocessing.csv')

    st.subheader('**Persebaran Kategori Tweet**')

    # One figure, two panels: counts on the left, shares on the right.
    fig, ax = plt.subplots(1, 2, figsize=(15, 6))
    bar_ax, pie_ax = ax
    fig.suptitle('Tweet Type Distribution', fontsize=18, fontweight='bold')

    # Left panel: number of tweets per category.
    sns.countplot(x='cyberbullying_type', data=df_eda, palette="winter", ax=bar_ax)
    bar_ax.set_xlabel("cyberbullying_type", fontsize=12)
    bar_ax.set_ylabel("# of Tweet", fontsize=12)
    bar_ax.set_ylim(0, 10000)
    bar_ax.tick_params(axis='x', rotation=90)

    # Write each bar's count just above the bar.
    for bar in bar_ax.patches:
        bar_ax.annotate(
            f"{bar.get_height():.0f}",
            (bar.get_x() + bar.get_width() / 2, bar.get_height() + 205),
            ha='center', va='center', fontsize=11,
        )

    # Right panel: share of tweets per category. The axes is passed
    # explicitly instead of relying on pyplot's implicit "current axes".
    df_eda['cyberbullying_type'].value_counts().plot(
        kind='pie', autopct='%1.1f%%', textprops={"fontsize": 12}, ax=pie_ax,
    )
    pie_ax.set_xlabel("cyberbullying_type", fontsize=12)
    pie_ax.set_ylabel("% of Tweet", fontsize=12)

    st.pyplot(fig)
    # Close the figure so figures do not pile up in pyplot's registry each
    # time this function is called again.
    plt.close(fig)

    # One pre-rendered image per tweet category.
    category_images = (
        ('**All Tweet**', 'https://imgur.com/quc6ru7.png'),
        ('**Age Tweet**', 'https://imgur.com/WB2tdlJ.png'),
        ('**Gender Tweet**', 'https://imgur.com/Pd9G2k9.png'),
        ('**Religion Tweet**', 'https://imgur.com/GE8Sj39.png'),
        ('**Other Cyberbullying Tweet**', 'https://imgur.com/sr6MYGO.png'),
        ('**Not Cyberbullying Tweet**', 'https://imgur.com/iWyNSVH.png'),
    )
    for heading, image_url in category_images:
        st.subheader(heading)
        st.image(image_url)


if __name__ == '__main__':
    run()
|
eda_preprocessing.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:881116368469ea2fb6e6f294dcc2aeafd835e3d8feeb7d56bbe1f3e540b523a3
|
3 |
+
size 16791030
|
prediction.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Library Streamlit
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
# Library Load Model
|
5 |
+
import pandas as pd
|
6 |
+
import numpy as np
|
7 |
+
from tensorflow.keras.models import load_model
|
8 |
+
|
9 |
+
# Library Pre-Processing
|
10 |
+
from nltk.stem import WordNetLemmatizer
|
11 |
+
import nltk
|
12 |
+
import re
|
13 |
+
from nltk.corpus import stopwords
|
14 |
+
from nltk.tokenize import word_tokenize
|
15 |
+
|
16 |
+
# Extra tokens filtered out in addition to NLTK's English stopword list.
_ADDITIONAL_STOPWORDS = (
    'rt', 'mkr', 'didn', 'bc', 'n', 'm',
    'im', 'll', 'y', 've', 'u', 'ur', 'don',
    'p', 't', 's', 'aren', 'kp', 'o', 'kat',
    'de', 're', 'amp', 'will', 'wa', 'e', 'like', 'andre', 'na', 're',
    'lil', 'd', 'na', 'pete', 'annie', 'nikki', 'lmao', 'miley', 'wan', 'gon',
)

# Mentions, links and every run of non-alphanumeric characters.
# Raw string: same pattern as before, without invalid-escape warnings.
_CLEANING_PATTERN = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Class index -> label shown to the user; any other index falls back to
# _FALLBACK_LABEL, exactly as the former if/elif chain's final `else` did.
_LABELS = {
    0: 'age',
    1: 'ethnicity',
    2: 'gender',
    3: 'not_cyberbullying',
    4: 'other_cyberbullying',
}
_FALLBACK_LABEL = 'religion'

# (lookup path, download id) of the NLTK data the preprocessing needs.
# NOTE(review): omw-1.4 is included defensively because some NLTK releases
# require it when the WordNet corpus is loaded -- confirm for the pinned one.
_NLTK_RESOURCES = (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
)


def _ensure_nltk_resources():
    """Download any NLTK data package that is not available locally yet."""
    for resource_path, package_id in _NLTK_RESOURCES:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_id, quiet=True)


def _text_proses(teks, stopword_set, lemmatizer):
    """Return the cleaned form of one tweet.

    Args:
        teks: Raw tweet text.
        stopword_set: Tokens to drop after tokenisation.
        lemmatizer: Object with a ``lemmatize(str)`` method.

    NOTE(review): the steps and their order are kept exactly as they were,
    on the assumption that the saved model was trained on text prepared the
    same way -- confirm against the training code before changing any step.
    """
    # Lowercase.
    teks = teks.lower()

    # Remove links, mentions and all non-alphanumeric runs.
    teks = re.sub(_CLEANING_PATTERN, ' ', teks)

    # Remove mentions / hashtags / literal "\n" sequences. After the step
    # above no '@', '#' or backslash is left, so these never match; kept
    # only so the pipeline stays step-for-step identical.
    teks = re.sub(r"@[A-Za-z0-9_]+", " ", teks)
    teks = re.sub(r"#[A-Za-z0-9_]+", " ", teks)
    teks = re.sub(r"\\n", " ", teks)

    # Remove words of one to three characters.
    teks = re.sub(r'\b\w{1,3}\b', " ", teks)

    # Trim surrounding whitespace.
    teks = teks.strip()

    # Remove everything that is not a letter, whitespace or apostrophe.
    teks = re.sub(r"[^A-Za-z\s']", " ", teks)

    # Collapse repeated whitespace.
    teks = re.sub(r"\s\s+", " ", teks)

    # Tokenise and drop stopwords.
    tokens = word_tokenize(teks)
    teks = ' '.join(word for word in tokens if word not in stopword_set)

    # NOTE(review): lemmatize() receives the whole joined string, not one
    # token at a time, so multi-word text is most likely returned unchanged.
    # Left as is to avoid diverging from the presumed training pipeline.
    teks = lemmatizer.lemmatize(teks)

    return teks


def run():
    """Render the tweet prediction page.

    Shows a one-field form; when it is submitted with non-empty text, the
    tweet is preprocessed, passed to the model loaded from ``best_model``
    and the predicted cyberbullying category is written to the page.
    """
    # Without these corpora stopwords/word_tokenize/lemmatize raise
    # LookupError on a machine where NLTK data was never downloaded.
    _ensure_nltk_resources()

    # Load the model.
    # NOTE(review): this runs on every call of run(); consider Streamlit's
    # resource caching once the deployed Streamlit version is confirmed.
    model_lstm = load_model('best_model')

    # Title and page description.
    st.markdown("<h1 style='text-align: center;'>Cyberbullying Tweet Prediction</h1>", unsafe_allow_html=True)
    st.write('Page ini berisi model untuk memprediksi jenis Cyberbullying pada tweet')

    with st.form(key='form_tweet'):
        st.markdown('### **Tweet**')
        tweet_text = st.text_input('', value='')
        submitted = st.form_submit_button('Predict')

    if not submitted:
        return

    # An empty tweet carries nothing to classify; ask for input instead of
    # showing a prediction for an empty string.
    if not tweet_text.strip():
        st.warning('Silakan masukkan teks tweet terlebih dahulu.')
        return

    # A set gives O(1) membership tests; the contents match the former list.
    stopword_set = set(stopwords.words('english')).union(_ADDITIONAL_STOPWORDS)
    lemmatizer = WordNetLemmatizer()

    # Single-row dataframe holding the raw and the preprocessed tweet.
    data_inf = pd.DataFrame([{'tweet_text': tweet_text}])
    data_inf['tweet_processed'] = data_inf['tweet_text'].apply(
        lambda x: _text_proses(x, stopword_set, lemmatizer)
    )

    # Predict the class index and map it to its label.
    y_inf_pred = np.argmax(model_lstm.predict(data_inf['tweet_processed']), axis=-1)
    result = _LABELS.get(int(y_inf_pred[0]), _FALLBACK_LABEL)
    st.write('# Cyberbullying Prediction : ', result)


if __name__ == '__main__':
    run()
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
pandas
seaborn
matplotlib
tensorflow == 2.11.0
scikit-learn == 1.0.2
numpy
plotly
# "re" is part of the Python standard library and cannot be installed with
# pip; the former "re == 2.2.1" entry made dependency installation fail.
nltk == 3.7
|