Arifsyamil commited on
Commit
4f09e46
1 Parent(s): a5937ee

Add files via upload

Browse files
Files changed (4) hide show
  1. app.py +95 -0
  2. knn_file.py +204 -0
  3. malaya_file.py +24 -0
  4. requirements.txt +20 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Import all necessary libraries
import streamlit as st
import re
import wikipediaapi
import malaya
import torch
import tensorflow
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import os
import psutil
from malaya_file import *
from knn_file import *

# Page header, title and usage instructions (Malay UI text kept verbatim).
st.set_page_config(page_title= "Malay Named Entity Recognition (NER) Model", page_icon= ":book:", layout= "wide")
st.title(":book: Pengecaman Entiti Nama Malay (NER) model")
st.markdown("CARA MENGGUNAKAN PROGRAM")
st.markdown("1. Sila taip sebuah ayat atau teks tidak melebihi 500 karakter di ruangan bawah")
st.markdown("2. Pilih model untuk melakukan proses pengecaman entiti nama (NER) berdasarkan teks")
st.markdown("3. Klik butang 'BUAT RAMALAN' bagi memulakan program")
st.markdown("4. Paparan bagi setiap kata serta jenis entiti akan dipaparkan pada bahagian 'HASIL RAMALAN'")

# Input form: free text (max 500 chars) plus a radio button for the model choice.
# BUG FIX: the original `global kata, btn_model, df1, df2` statement was at
# module level, where `global` is a no-op — removed.
with st.form(key= 'my_form'):
    kata = st.text_area(label="Sila taip teks atau ayat:", max_chars= 500)

    btn_model = st.radio("Pilih model untuk pengecaman entiti nama",
        ("KNN", "BERT", "Tiny-BERT", "ALBERT", "Tiny-ALBERT", "XLNET", "ALXLNET", "FASTFORMER", "Tiny-FASTFORMER"))

    submit_button = st.form_submit_button(label= ":arrow_right: Buat Ramalan")

if submit_button:
    # Reject empty / whitespace-only input.
    if re.sub(r'\s+','',kata)=='':
        st.error('Ruangan teks tidak boleh kosong.')

    # Reject single-word input: the KNN features need a previous/next word.
    elif re.match(r'\A\s*\w+\s*\Z', kata):
        st.error("Teks atau ayat mestilah sekurang-kurangnya dua patah perkataan.")

    else:
        if btn_model == "KNN":
            st.write("Anda pilih model : KNN")
        else:
            st.write("Anda pilih model transformer: ", btn_model)

        st.success("Butang hantar berfungsi!")

        # NOTE(review): the nesting below is reconstructed — the original
        # indentation was lost in the diff; the results section is assumed to
        # run only after a valid submission. Confirm against the deployed app.
        with st.container():
            st.write("---")
            st.header("Hasil Ramalan")
            st.subheader("Ayat asal")
            st.write("##")
            st.write(kata)
            patah = str(len(kata.split()))
            st.write("Bilangan perkataan : {}".format(patah))
            st.write("##")
            # Dispatch to the chosen model; both paths produce a dataframe
            # with 'kata' (word) and 'entiti' (entity) columns.
            if btn_model == 'KNN':
                df = ramal_kata(kata)
                df_test = df.copy()
            else:
                df_test = malaya_model(btn_model, kata)

            # Let the user filter the result table by entity type.
            entiti = sorted(df_test['entiti'].unique())
            pilih = st.multiselect('Jenis entiti', entiti, entiti)
            df_pilihan = df_test [ (df_test['entiti'].isin(pilih)) ]
            st.table(df_pilihan.style.set_properties(**{'background-color': 'white', 'color': 'black'}))

# About model
with st.expander("About this app", expanded=True):
    st.write(
        """
        - **Pengecaman Nama Entiti Malay** adalah sebuah aplikasi pembelajaran mesin yang dibangunkan bagi mengecam entiti pada setiap token menggunakan modul MALAYA (Husein, 2018)
        - Projek ini adalah tugasan Final Year Project bagi Ijazah Sarjana Muda di UKM
        - Aplikasi ini ingin menentukan model terbaik yang boleh digunakan bagi dokumen teks subjek sejarah Bahasa Melayu
        - Model ini mempunyai 3 fitur utama iaitu kata asal, kata sebelum dan kata selepas. Kelas yang disasarkan ialah LOKASI, MANUSIA dan ORGANISASI
        - Maklumat lanjut boleh hubungi Muhd Arif Syamil bin Mohd Rahimi melalui e-mel a177313@siswa.ukm.edu.my atau 012-7049021
        """)

# Report the current process's memory footprint at the bottom of the page.
process = psutil.Process(os.getpid())
mem_size = str((process.memory_info().rss)) # in bytes, divide by 1 billion to GB
mem_size_mb = str((process.memory_info().rss) / 1000000)
mem_size_gb = str((process.memory_info().rss) / 1000000000)
st.write("Penggunaan memori: {} bytes or {} MB or {} GB".format(mem_size, mem_size_mb, mem_size_gb))
# Sample sentences for manual testing:
# Dokumen Pemasyhuran Kemerdekaan 1957 telah ditulis dalam dua bahasa iaitu bahasa Melayu yang ditulis dalam Jawi dan bahasa Inggeris - No PERSON, LOCATION, ORGANISATION
# Ketika mendarat di Lapangan Terbang Sungai Besi, tetamu kehormat telah disambut oleh Pesuruhjaya Tinggi British di Tanah Melayu, Sir Donald Charles MacGillivray dan Lady MacGillivray, Yang di-Pertuan Agong Tanah Melayu yang pertama, Tuanku Abdul Rahman diiringi Raja Permaisuri Agong dan Perdana Menteri Tanah Melayu yang pertama, Tunku Abdul Rahman.
# Kedudukan sebuah kereta yang terjunam ke dalam Sungai Maaw di Jeti Feri Tanjung Kunyit, Sibu, semalam, sudah dikenal pasti. Jurucakap Pusat Gerakan Operasi (PGO), Jabatan Bomba dan Penyelamat Malaysia (JBPM) Sarawak, berkata kedudukan Toyota Camry di dasar sungai itu dikesan anggota Pasukan Penyelamat Di Air (PPDA) yang melakukan selaman kelima, hari ini, pada jam 3.49 petang.
knn_file.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Import all necessary libraries
2
+ import streamlit as st
3
+ import re
4
+ import wikipediaapi
5
+ import malaya
6
+ import torch
7
+ import tensorflow
8
+ import pandas as pd
9
+ from sklearn.preprocessing import OneHotEncoder, LabelEncoder
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.neighbors import KNeighborsClassifier
12
+ from sklearn.multioutput import MultiOutputClassifier
13
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
14
+ import numpy as np
15
+ import matplotlib.pyplot as plt
16
+ import os
17
+ import psutil
18
+
19
#LOAD PAGE AND GET TEXT
@st.cache(suppress_st_warning=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def find_text():
    """Download the Malay Wikipedia article used as the training corpus.

    Sets the module globals `article` (plain text), `page` (wikipediaapi page
    object) and `link` (full URL), and returns all three.
    """
    global article, link, page
    mwiki = wikipediaapi.Wikipedia(language = 'ms', extract_format = wikipediaapi.ExtractFormat.WIKI)
    page = mwiki.page("Pemahsyuran Kemerdekaan Tanah Melayu")
    link = page.fullurl
    article = page.text
    # BUG FIX: removed unused local `namefile = "malaytext.txt"` — no file was ever written
    return article, page, link
29
+
30
#CLEAN DATA
@st.cache(suppress_st_warning=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def clean_data():
    """Strip punctuation and newlines from the downloaded article.

    Reads the global `article` set by find_text() and stores the cleaned text
    in the global `clean_file`, which is also returned.
    """
    global clean_file
    text = article  # renamed from `file` to avoid shadowing-style confusion
    text = text.strip("\n")
    text = re.sub("[=(),:;.]", "", text)
    text = text.strip()
    # Hyphens and apostrophes become spaces so compound words split into tokens.
    text = re.sub("[-']", " ", text)
    text = text.strip()
    text = text.replace("\n", " ")
    clean_file = text
    return clean_file
43
+
44
#USE MALAYA MODULE
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def use_malaya():
    """Tag the cleaned article with Malaya's quantized BERT entity model.

    Reads the global `clean_file` and sets the global `malay_pred` to the
    model's per-token predictions, which are also returned.
    """
    global malay_pred
    # BUG FIX: the keyword argument is `model`, not `model1` (compare
    # malaya_model() in malaya_file.py); `model1` would raise a TypeError.
    q_model = malaya.entity.transformer(model = 'bert', quantized = True)
    malay_pred = q_model.predict(clean_file)
    return malay_pred
51
+
52
#ORGANISE DATAFRAME MODEL (NO ST.COLUMNS)
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def data_model():
    """Build the global feature frame `df4` from Malaya's (word, tag) predictions.

    Produces one row per token with context features (previous/next word and
    tag), a one-hot target for the four collapsed entity classes, and
    label-encoded word features (LKATA/LSEBELUM/LSELEPAS) used by the KNN.
    Reads the global `malay_pred` set by use_malaya(); returns `df4`.
    """
    global df4
    df = pd.DataFrame(malay_pred)
    df.columns = ['kata', 'entiti']
    # BUG FIX: astype() is not in-place; the original discarded its result.
    df['kata'] = df['kata'].astype('str')
    df['entiti'] = df['entiti'].astype('str')
    df['nombor'] = df.reset_index().index
    df = df.reindex(['nombor', 'kata', 'entiti'], axis = 1)

    # Context features: shift(1) looks at the previous row, shift(-1) at the next.
    df['SEBELUM'] = df['kata'].shift(1)
    df['SELEPAS'] = df['kata'].shift(-1)
    df['TAGSEBELUM'] = df['entiti'].shift(1)
    df['TAGSELEPAS'] = df['entiti'].shift(-1)
    df.fillna("null", inplace=True)

    # Collapse Malaya's tag set into four classes (LAIN-LAIN = "other").
    # BUG FIX: the original chained df.replace() calls over the WHOLE frame,
    # so a literal word such as "law" or "person" appearing in the text
    # columns would have been corrupted too; restrict to the tag columns.
    tag_map = {
        "time": "LAIN-LAIN", "event": "LAIN-LAIN", "law": "LAIN-LAIN",
        "quantity": "LAIN-LAIN", "OTHER": "LAIN-LAIN",
        "location": "lokasi", "organization": "organisasi", "person": "manusia",
    }
    df1 = df.copy()
    tag_cols = ['entiti', 'TAGSEBELUM', 'TAGSELEPAS']
    df1[tag_cols] = df1[tag_cols].replace(tag_map)

    # ONE HOT ENCODER for the target: LAIN-LAIN, LOKASI, MANUSIA, ORGANISASI
    # (four categories expected, in alphabetical order).
    ohe = OneHotEncoder()
    ohe_entity = ohe.fit_transform(df1[['entiti']]).toarray()
    df2 = df1.join(pd.DataFrame(ohe_entity))
    df2.columns = ['nombor', 'kata', 'entiti', 'SEBELUM', 'SELEPAS', 'TAGSEBELUM',
                   'TAGSELEPAS', 'LAIN-LAIN', 'LOKASI', 'MANUSIA', 'ORGANISASI']

    # LABEL ENCODER for the word features and the flat target. Each join adds
    # one integer-named column which is immediately renamed, so later joins
    # never collide on column names.
    le = LabelEncoder()
    for src, name in (('kata', 'LKATA'), ('SEBELUM', 'LSEBELUM'),
                      ('SELEPAS', 'LSELEPAS'), ('entiti', 'LENTITI')):
        df2 = df2.join(pd.DataFrame(le.fit_transform(df1[src])))
        df2.columns = list(df2.columns[:-1]) + [name]
    df4 = df2

    # NOTE(review): the encoded features are cast to str before being fed to
    # the KNN (sklearn will coerce numeric strings back to float) — confirm
    # this cast is intentional.
    df4['LKATA'] = df4['LKATA'].astype(str)
    df4['LSEBELUM'] = df4['LSEBELUM'].astype(str)
    df4['LSELEPAS'] = df4['LSELEPAS'].astype(str)
    df4['LAIN-LAIN'] = df4['LAIN-LAIN'].astype(int)
    df4['LOKASI'] = df4['LOKASI'].astype(int)
    df4['ORGANISASI'] = df4['ORGANISASI'].astype(int)
    df4['MANUSIA'] = df4['MANUSIA'].astype(int)
    return df4
116
+
117
#TRAIN MODEL USING KNN, MULTIOUTPUTCLASSIFIER
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def train_model():
    """Train a multi-output KNN on the label-encoded features in `df4`.

    Features x are columns 11-13 (LKATA, LSEBELUM, LSELEPAS); targets y are
    columns 8-10 (the LOKASI/MANUSIA/ORGANISASI one-hot columns). Sets the
    globals x, y, y_test, y_pred, knn, classifier and model_score.
    """
    global x, y, y_test, y_pred, knn, classifier, model_score
    x = df4.iloc[:, [11,12,13]]
    y = df4.iloc[:,[8,9,10]]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 42, stratify = y)
    knn = KNeighborsClassifier(n_neighbors= 3) #default 1st time k = 3, but entity type = 4
    knn.fit(x_train, y_train)
    classifier = MultiOutputClassifier(knn, n_jobs = -1)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # BUG FIX: score(X, y) expects features and true labels; the original
    # passed (y_test.values, y_pred), scoring the model on the wrong inputs.
    model_score = classifier.score(x_test, y_test)
    return x, y, y_test, y_pred, classifier, model_score
133
+
134
#EVALUATE MODEL
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def evaluate_model():
    """Compute confusion matrix, classification report and accuracy.

    Flattens the multi-label test/prediction arrays (globals `y_test` and
    `y_pred` set by train_model()) so every binary indicator is scored as one
    sample. Sets the globals cm, cr and accuracy.
    """
    global cm, cr, accuracy
    y_test1 = y_test.to_numpy().flatten()
    y_pred1 = y_pred.flatten()
    cm = confusion_matrix(y_test1, y_pred1)
    cr = classification_report(y_test1, y_pred1)
    accuracy = accuracy_score(y_test1, y_pred1)
    return cm, cr, accuracy
144
+
145
#LOAD MODEL
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def knn_model():
    """Run the full KNN pipeline end to end.

    Fetch the Wikipedia article, clean it, tag it with Malaya, build the
    feature frame, train and evaluate the KNN. Each step communicates through
    module globals; the individual results are returned as a 6-tuple.
    """
    result1 = find_text()
    result2 = clean_data()
    result3 = use_malaya()
    result4 = data_model()
    result5 = train_model()
    result6 = evaluate_model()
    return result1, result2, result3, result4, result5, result6
155
+
156
#PREDICT WORD OUTSIDE DATA
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def ramal_kata(kata):
    """Predict an entity class for every word of a user-supplied sentence.

    Parameters
    ----------
    kata : str
        The input sentence typed by the user.

    Returns a DataFrame with columns 'kata' (word) and 'entiti' (predicted
    class); also sets the globals `perkata` and `output`.
    """
    global perkata, output
    # Tokenise and build the same context features used at training time.
    string = re.sub("[=(),:;.]", "", kata)
    words = string.split(" ")
    feats = pd.DataFrame(words, columns = ["LKATA"])
    feats['LSEBELUM'] = feats['LKATA'].shift(1)
    feats['LSELEPAS'] = feats['LKATA'].shift(-1)
    feats.fillna("null", inplace=True)

    # NOTE(review): each column is fit_transform'ed independently on the new
    # sentence, so these integer codes are NOT aligned with the encoding used
    # at training time — confirm this is the intended behaviour.
    lbl = LabelEncoder()
    feats = pd.DataFrame({
        'LKATA': lbl.fit_transform(feats['LKATA']),
        'LSEBELUM': lbl.fit_transform(feats['LSEBELUM']),
        'LSELEPAS': lbl.fit_transform(feats['LSELEPAS']),
    })

    # Run the training pipeline for its side effects: it populates the
    # globals x and y that the refit below depends on.
    knn_model()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 42, stratify = y)
    kelas = MultiOutputClassifier(KNeighborsClassifier(n_neighbors= 3), n_jobs = -1)
    kelas.fit(x_train, y_train)
    hasil = kelas.predict(feats)

    # Map each one-hot prediction row (LOKASI, MANUSIA, ORGANISASI) back to a
    # class label; anything else (all zeros or ambiguous) is LAIN-LAIN.
    label_map = {(1, 0, 0): "LOKASI", (0, 1, 0): "MANUSIA", (0, 0, 1): "ORGANISASI"}
    fin = [label_map.get(tuple(row), "LAIN-LAIN") for row in hasil]

    perkata = list(zip(words, fin))
    output = pd.DataFrame({"kata" : words, "entiti" : fin})
    return output
201
+
202
def get_data():
    """Return the most recent prediction table produced by ramal_kata()."""
    return output
malaya_file.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Import all necessary libraries
2
+ import streamlit as st
3
+ import re
4
+ import wikipediaapi
5
+ import malaya
6
+ import torch
7
+ import tensorflow
8
+ import pandas as pd
9
+ from sklearn.preprocessing import OneHotEncoder, LabelEncoder
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.neighbors import KNeighborsClassifier
12
+ from sklearn.multioutput import MultiOutputClassifier
13
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
14
+ import numpy as np
15
+ import matplotlib.pyplot as plt
16
+ import tracemalloc
17
+
18
@st.cache(allow_output_mutation=True)  # BUG FIX: original called st.cache() without '@', so it never decorated the function
def malaya_model(model_name, kata):
    """Tag a sentence with the chosen quantized Malaya transformer model.

    Parameters
    ----------
    model_name : str
        Malaya transformer identifier passed through to
        malaya.entity.transformer. NOTE(review): the app passes names such as
        "BERT"/"Tiny-BERT" — confirm Malaya accepts this casing.
    kata : str
        The input sentence.

    Returns a DataFrame with columns 'kata' (word) and 'entiti' (entity tag);
    also stored in the global `df_malaya`.
    """
    global df_malaya
    q_model = malaya.entity.transformer(model = model_name, quantized = True)
    malay_pred = q_model.predict(kata)
    df_malaya = pd.DataFrame(malay_pred, columns = ['kata', 'entiti'])
    return df_malaya
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ malaya<=5.0
2
+ matplotlib<=3.6.3
3
+ memory_profiler<=0.61.0
4
+ numpy<=1.24.1
5
+ pandas<=1.5.2
6
+ psutil<=5.9.4
7
+ scikit_learn<=1.2.0
8
+ streamlit<=1.17.0
9
+ tensorflow<=2.11.0
10
+ tensorboard<=2.11.2
11
+ tensorboard-data-server<=0.6.1
12
+ tensorboard-plugin-wit<=1.8.1
13
+ tensorflow-addons<=0.19.0
14
+ tensorflow-cpu<=2.11.0
15
+ tensorflow-estimator<=2.11.0
16
+ tensorflow-io-gcs-filesystem<=0.29.0
17
+ torch<=1.13.1
18
+ torchaudio<=0.13.1
19
+ torchvision<=0.14.1
20
+ Wikipedia_API<=0.5.8