evanderin committed on
Commit
662db6f
1 Parent(s): 86c8774

Upload 5 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. app.py +29 -0
  3. eda.py +67 -0
  4. eda_preprocessing.csv +3 -0
  5. prediction.py +115 -0
  6. requirements.txt +10 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  best_model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  best_model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
36
+ eda_preprocessing.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import eda
3
+ import prediction
4
+
5
# Page configuration. Streamlit expects set_page_config to be the first
# Streamlit command executed by the script.
st.set_page_config(
    # Fixed: the previous title ('Churn Prediction') was a leftover from an
    # unrelated project; this app classifies cyberbullying tweets.
    page_title='Cyberbullying Tweet Prediction',
    layout='wide',
)

# Hide Streamlit's default main menu and footer via injected CSS.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Sidebar navigation: author banner, page selector, and a sidebar image.
st.sidebar.markdown("# Evan Derin Ihsanudin - RMT-FTDS-17")
navigation = st.sidebar.selectbox(
    'Pilih Halaman (Tweet Prediction/EDA): ',
    ('Tweet Prediction', 'Exploratory Data Analysis'),
)
st.sidebar.image("https://imgur.com/MmPULSL.png", use_column_width=True)

# Dispatch to the module that renders the selected page.
if navigation == 'Tweet Prediction':
    prediction.run()
else:
    eda.run()
eda.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ import plotly.express as px
6
+ from PIL import Image
7
+
8
def run():
    """Render the Exploratory Data Analysis page.

    Reads ``eda_preprocessing.csv`` from the current working directory,
    draws the distribution of the ``cyberbullying_type`` column as a bar
    chart and a pie chart, and then displays one hosted image per tweet
    category. Returns nothing; all output goes to the Streamlit page.
    """
    # Page title and short description.
    st.markdown("<h1 style='text-align: center;'>Exploratory Data Analysis</h1>", unsafe_allow_html=True)
    # Fixed user-facing typo: "Workcloud" -> "Wordcloud".
    st.write('Berikut adalah EDA dan Wordcloud dari Setiap Kategori Tweet')

    # Load the dataset used for the distribution plots.
    df_eda = pd.read_csv('eda_preprocessing.csv')

    st.subheader('**Persebaran Kategori Tweet**')

    # One figure with two panels: counts on the left, shares on the right.
    fig, ax = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle('Tweet Type Distribution', fontsize=18, fontweight='bold')

    bar_ax, pie_ax = ax[0], ax[1]

    # Left panel: number of tweets per category.
    sns.countplot(x='cyberbullying_type', data=df_eda, palette="winter", ax=bar_ax)
    bar_ax.set_xlabel("cyberbullying_type", fontsize=12)
    bar_ax.set_ylabel("# of Tweet", fontsize=12)
    bar_ax.set_ylim(0, 10000)
    bar_ax.tick_params(axis='x', rotation=90)

    # Write each bar's count just above the bar.
    for bar in bar_ax.patches:
        bar_height = bar.get_height()
        bar_ax.annotate(
            f"{bar_height:.0f}",
            (bar.get_x() + bar.get_width() / 2, bar_height + 205),
            ha='center',
            va='center',
            fontsize=11,
        )

    # Right panel: share of tweets per category.
    # The target axes is passed explicitly instead of relying on pyplot's
    # implicit "current axes"; the former plt.xlabel/plt.ylabel calls, which
    # duplicated the bar-chart labels onto the current axes, are dropped.
    df_eda['cyberbullying_type'].value_counts().plot(
        kind='pie',
        autopct='%1.1f%%',
        textprops={"fontsize": 12},
        ax=pie_ax,
    )
    pie_ax.set_ylabel("% of Tweet", fontsize=12)

    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; close it
    # so repeated Streamlit reruns do not accumulate open figures.
    plt.close(fig)

    # One hosted image per tweet category, shown in a fixed order.
    # NOTE(review): presumably these are the word clouds mentioned in the
    # page description -- confirm against the hosted images.
    category_images = (
        ('**All Tweet**', 'https://imgur.com/quc6ru7.png'),
        ('**Age Tweet**', 'https://imgur.com/WB2tdlJ.png'),
        ('**Gender Tweet**', 'https://imgur.com/Pd9G2k9.png'),
        ('**Religion Tweet**', 'https://imgur.com/GE8Sj39.png'),
        ('**Other Cyberbullying Tweet**', 'https://imgur.com/sr6MYGO.png'),
        ('**Not Cyberbullying Tweet**', 'https://imgur.com/iWyNSVH.png'),
    )
    for heading, image_url in category_images:
        st.subheader(heading)
        st.image(image_url)


if __name__ == '__main__':
    run()
eda_preprocessing.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:881116368469ea2fb6e6f294dcc2aeafd835e3d8feeb7d56bbe1f3e540b523a3
3
+ size 16791030
prediction.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Library Streamlit
2
+ import streamlit as st
3
+
4
+ # Library Load Model
5
+ import pandas as pd
6
+ import numpy as np
7
+ from tensorflow.keras.models import load_model
8
+
9
+ # Library Pre-Processing
10
+ from nltk.stem import WordNetLemmatizer
11
+ import nltk
12
+ import re
13
+ from nltk.corpus import stopwords
14
+ from nltk.tokenize import word_tokenize
15
+
16
# Class id (argmax of the model output) -> label shown to the user.
# Any id not listed here falls back to _FALLBACK_LABEL, matching the
# original if/elif chain whose final ``else`` branch returned 'religion'.
_LABELS = {
    0: 'age',
    1: 'ethnicity',
    2: 'gender',
    3: 'not_cyberbullying',
    4: 'other_cyberbullying',
}
_FALLBACK_LABEL = 'religion'

# Extra stopwords removed in addition to NLTK's English list.
_ADDITIONAL_STOPWORDS = (
    'rt', 'mkr', 'didn', 'bc', 'n', 'm',
    'im', 'll', 'y', 've', 'u', 'ur', 'don',
    'p', 't', 's', 'aren', 'kp', 'o', 'kat',
    'de', 're', 'amp', 'will', 'wa', 'e', 'like', 'andre', 'na', 're',
    'lil', 'd', 'na', 'pete', 'annie', 'nikki', 'lmao', 'miley', 'wan', 'gon',
)

# NLTK data needed by stopwords.words, word_tokenize and WordNetLemmatizer,
# as (lookup path, download package name). Names follow the nltk 3.7 layout.
_NLTK_RESOURCES = (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
)

# Regular expressions used by _text_proses. Raw strings avoid the
# invalid-escape warnings the former plain "\S" / "\s" literals triggered;
# the patterns themselves are unchanged.
_CLEANING_PATTERN = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
_MENTION_PATTERN = r"@[A-Za-z0-9_]+"
_HASHTAG_PATTERN = r"#[A-Za-z0-9_]+"
_ESCAPED_NEWLINE_PATTERN = r"\\n"
_SHORT_WORD_PATTERN = r'\b\w{1,3}\b'
_NON_LETTER_PATTERN = r"[^A-Za-z\s']"
_MULTI_SPACE_PATTERN = r"\s\s+"

# Process-wide cache so the model is read from disk once, not on every rerun.
_MODEL_CACHE = {}


def _ensure_nltk_resources():
    """Download any NLTK resource from _NLTK_RESOURCES that is not installed."""
    for resource_path, package in _NLTK_RESOURCES:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package, quiet=True)


def _get_model():
    """Return the model stored in 'best_model', loading it on first use only."""
    if 'lstm' not in _MODEL_CACHE:
        _MODEL_CACHE['lstm'] = load_model('best_model')
    return _MODEL_CACHE['lstm']


def _build_stopwords():
    """Return NLTK's English stopwords plus the extra ones, as a set."""
    # A set gives O(1) membership tests; the words removed are the same as
    # with the former list.
    return set(stopwords.words('english')) | set(_ADDITIONAL_STOPWORDS)


def _text_proses(teks, stopword_set, lemmatizer):
    """Clean one tweet and return the processed string fed to the model.

    The steps and their order are kept exactly as before so the text passed
    to the model does not change.
    NOTE(review): presumably this mirrors the preprocessing used when the
    model was trained -- confirm before altering any step.
    """
    teks = teks.lower()

    # Replace mentions, links and every run of non-alphanumeric characters
    # with a space.
    teks = re.sub(_CLEANING_PATTERN, ' ', teks)

    # Mentions, hashtags and literal "\n" sequences. After the substitution
    # above no '@', '#' or backslash can remain, so these three are
    # effectively no-ops; kept so the pipeline stays step-for-step identical.
    teks = re.sub(_MENTION_PATTERN, " ", teks)
    teks = re.sub(_HASHTAG_PATTERN, " ", teks)
    teks = re.sub(_ESCAPED_NEWLINE_PATTERN, " ", teks)

    # Drop words of one to three characters.
    teks = re.sub(_SHORT_WORD_PATTERN, " ", teks)

    teks = teks.strip()

    # Drop everything that is not a letter, whitespace or apostrophe
    # (digits, in practice), then collapse repeated whitespace.
    teks = re.sub(_NON_LETTER_PATTERN, " ", teks)
    teks = re.sub(_MULTI_SPACE_PATTERN, " ", teks)

    tokens = word_tokenize(teks)
    teks = ' '.join([word for word in tokens if word not in stopword_set])

    # NOTE(review): lemmatize() receives the whole joined string rather than
    # individual tokens. Left as-is to keep the model input unchanged.
    teks = lemmatizer.lemmatize(teks)

    return teks


def run():
    """Render the prediction page and classify the submitted tweet.

    Shows a form with one text field. When the form is submitted with
    non-blank text, the tweet is preprocessed, passed to the cached model,
    and the label for the highest-scoring class is written to the page.
    Returns nothing.
    """
    # Load (or reuse) the model up front, as the page did before.
    model_lstm = _get_model()

    st.markdown("<h1 style='text-align: center;'>Cyberbullying Tweet Prediction</h1>", unsafe_allow_html=True)
    st.write('Page ini berisi model untuk memprediksi jenis Cyberbullying pada tweet')

    with st.form(key='form_tweet'):
        st.markdown('### **Tweet**')
        tweet_text = st.text_input('', value='')
        submitted = st.form_submit_button('Predict')

    if not submitted:
        return

    # Do not classify an empty submission: the model would still return a
    # label, which would be meaningless.
    if not tweet_text.strip():
        st.warning('Silakan masukkan teks tweet terlebih dahulu.')
        return

    # NLTK data is only needed once a prediction is actually requested.
    _ensure_nltk_resources()
    stopword_set = _build_stopwords()
    lemmatizer = WordNetLemmatizer()

    # Single-row frame holding the raw and the processed tweet.
    data_inf = pd.DataFrame([{'tweet_text': tweet_text}])
    data_inf['tweet_processed'] = data_inf['tweet_text'].apply(
        lambda x: _text_proses(x, stopword_set, lemmatizer)
    )

    # Highest-scoring class per row.
    y_inf_pred = np.argmax(model_lstm.predict(data_inf['tweet_processed']), axis=-1)

    result = _LABELS.get(int(y_inf_pred[0]), _FALLBACK_LABEL)
    st.write('# Cyberbullying Prediction : ', result)


if __name__ == '__main__':
    run()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ seaborn
4
+ matplotlib
5
+ tensorflow == 2.11.0
6
+ scikit-learn == 1.0.2
7
+ numpy
8
+ plotly
9
+ re == 2.2.1
10
+ nltk == 3.7