Spaces:
Sleeping
Sleeping
#Import all neccessary libraries | |
import streamlit as st | |
import re | |
import wikipediaapi | |
import malaya | |
import torch | |
import tensorflow | |
import pandas as pd | |
from sklearn.preprocessing import OneHotEncoder, LabelEncoder | |
from sklearn.model_selection import train_test_split | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.multioutput import MultiOutputClassifier | |
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import os | |
import psutil | |
#LOAD PAGE AND GET TEXT | |
st.cache(suppress_st_warning=True) | |
def find_text(): | |
global article, link, page | |
mwiki = wikipediaapi.Wikipedia(language = 'ms', extract_format = wikipediaapi.ExtractFormat.WIKI) | |
page = mwiki.page("Pemahsyuran Kemerdekaan Tanah Melayu") | |
link = page.fullurl | |
article = page.text | |
namefile = "malaytext.txt" | |
return article, page, link | |
#CLEAN DATA | |
st.cache(suppress_st_warning=True) | |
def clean_data(): | |
global clean_file | |
file = article | |
file1 = file.strip("\n") | |
file1 = re.sub("[=(),:;.]", "", file1) | |
file1 = file1.strip() | |
file1 = re.sub("[-']", " ", file1) | |
file1 = file1.strip() | |
file1 = file1.replace("\n", " ") | |
clean_file = file1 | |
return clean_file | |
#USE MALAYA MODULE | |
st.cache(allow_output_mutation=True) | |
def use_malaya(): | |
global malay_pred | |
q_model = malaya.entity.transformer(model1 = 'bert', quantized = True) | |
malay_pred = q_model.predict(clean_file) | |
return malay_pred | |
#ORGANISE DATAFRAME MODEL (NO ST.COLUMNS) | |
st.cache(allow_output_mutation=True) | |
def data_model(): | |
global df4 #Start as LABELENCODER | |
df = pd.DataFrame(malay_pred) | |
df.columns = ['kata', 'entiti'] #1, #2 | |
df['kata'].astype('str') #KIV | |
df['entiti'].astype('str') | |
df['nombor'] = df.reset_index().index #3 | |
df = df.reindex(['nombor', 'kata', 'entiti'], axis = 1) | |
#shift(1) moves backward by 1 | |
df['SEBELUM'] = df['kata'].shift(1) #4 | |
#shift(-1) moves forward by 1 | |
df['SELEPAS'] = df['kata'].shift(-1) #5 | |
df['TAGSEBELUM'] = df['entiti'].shift(1) #6 | |
df['TAGSELEPAS'] = df['entiti'].shift(-1) #7 | |
df.fillna("null", inplace=True) | |
#Observe entity LAIN-LAIN if it is a nuisance or otherwise | |
df1 = df.copy() | |
df1.replace("time", "OTHER", inplace=True) | |
df1.replace("event", "OTHER", inplace=True) | |
df1.replace("law", "OTHER", inplace=True) | |
df1.replace("quantity", "OTHER", inplace=True) | |
df1.replace("location", "lokasi", inplace=True) | |
df1.replace("organization", "organisasi", inplace=True) | |
df1.replace("person", "manusia", inplace=True) | |
df1.replace("OTHER", "LAIN-LAIN", inplace=True) | |
#ONE HOT ENCODER for LOKASI, MANUSIA dan ORGANISASI | |
ohe = OneHotEncoder() | |
ohe_entity = ohe.fit_transform(df1[['entiti']]).toarray() #8, 9, 10, 11 Expected 4 entity type | |
ohe_entity1 = pd.DataFrame(ohe_entity) | |
df2 = df1.join(ohe_entity1) | |
df2.columns = ['nombor', 'kata', 'entiti', 'SEBELUM', 'SELEPAS', 'TAGSEBELUM', 'TAGSELEPAS', 'LAIN-LAIN', 'LOKASI', 'MANUSIA', 'ORGANISASI'] | |
#LABEL ENCODER for 'SEBELUM', 'SELEPAS', 'TAGSEBELUM', 'TAGSELEPAS', | |
le = LabelEncoder() | |
le_word = le.fit_transform(df1['kata']) | |
le_word1 = pd.DataFrame(le_word) | |
df3 = df2.join(le_word1) #COLUMNS OVERLAPPED | |
df3.columns = ['nombor', 'kata', 'entiti', 'SEBELUM', 'SELEPAS', 'TAGSEBELUM', 'TAGSELEPAS','LAIN-LAIN', 'LOKASI', 'MANUSIA', 'ORGANISASI', 'LKATA'] | |
le_before = le.fit_transform(df1['SEBELUM']) | |
le_before1 = pd.DataFrame(le_before) | |
df3 = df3.join(le_before1) | |
df3.columns = ['nombor', 'kata', 'entiti', 'SEBELUM', 'SELEPAS', 'TAGSEBELUM', 'TAGSELEPAS', 'LAIN-LAIN', 'LOKASI', 'MANUSIA', 'ORGANISASI', 'LKATA', 'LSEBELUM'] | |
le_after = le.fit_transform(df1['SELEPAS']) | |
le_after1 = pd.DataFrame(le_after) | |
df4 = df3.join(le_after1) | |
df4.columns = ['nombor', 'kata', 'entiti', 'SEBELUM', 'SELEPAS', 'TAGSEBELUM', 'TAGSELEPAS', 'LAIN-LAIN', 'LOKASI', 'MANUSIA', 'ORGANISASI', 'LKATA', 'LSEBELUM', 'LSELEPAS'] | |
le_entity = le.fit_transform(df1['entiti']) | |
le_entity1 = pd.DataFrame(le_entity) | |
df4 = df4.join(le_entity1) | |
df4.columns = ['nombor', 'kata', 'entiti', 'SEBELUM', 'SELEPAS', 'TAGSEBELUM', 'TAGSELEPAS', 'LAIN-LAIN', 'LOKASI', 'MANUSIA', 'ORGANISASI', 'LKATA', 'LSEBELUM', 'LSELEPAS', 'LENTITI'] | |
df4['LKATA'] = df4['LKATA'].astype(str) | |
df4['LSEBELUM'] = df4['LSEBELUM'].astype(str) | |
df4['LSELEPAS'] = df4['LSELEPAS'].astype(str) | |
df4['LAIN-LAIN'] = df4['LAIN-LAIN'].astype(int) | |
df4['LOKASI'] = df4['LOKASI'].astype(int) | |
df4['ORGANISASI'] = df4['ORGANISASI'].astype(int) | |
df4['MANUSIA'] = df4['MANUSIA'].astype(int) | |
return df4 | |
#TRAIN MODEL USING KNN, MULTIOUTPUTCLASSIFIER | |
st.cache(allow_output_mutation=True) | |
def train_model(): | |
global x, y, y_test, y_pred, knn, classifier, model_score | |
x = df4.iloc[:, [11,12,13]] | |
y = df4.iloc[:,[8,9,10]] | |
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 42, stratify = y) | |
knn = KNeighborsClassifier(n_neighbors= 3) #default 1st time k = 3, but entity type = 4 | |
knn.fit(x_train, y_train) | |
classifier = MultiOutputClassifier(knn, n_jobs = -1) | |
classifier.fit(x_train, y_train) | |
#datax_test = x_test.values | |
datay_test = y_test.values | |
y_pred = classifier.predict(x_test) | |
model_score = classifier.score(datay_test, y_pred) | |
return x, y, y_test, y_pred, classifier, model_score | |
#EVALUATE MODEL | |
st.cache(allow_output_mutation=True) | |
def evaluate_model(): | |
global cm, cr, accuracy | |
y_test1 = y_test.to_numpy().flatten() | |
y_pred1 = y_pred.flatten() | |
cm = confusion_matrix(y_test1, y_pred1) | |
cr = classification_report(y_test1, y_pred1) | |
accuracy = accuracy_score(y_test1, y_pred1) | |
return cm, cr, accuracy | |
#LOAD MODEL | |
st.cache(allow_output_mutation=True) | |
def knn_model(): | |
result1 = find_text() | |
result2 = clean_data() | |
result3 = use_malaya() | |
result4 = data_model() | |
result5 = train_model() | |
result6 = evaluate_model() | |
return result1, result2, result3, result4, result5, result6 | |
#PREDICT WORD OUTSIDE DATA | |
st.cache(allow_output_mutation=True) | |
def ramal_kata(kata): | |
string = re.sub("[=(),:;.]", "", kata) | |
string1 = string.split(" ") | |
string2 = pd.DataFrame(string1, columns = ["LKATA"]) | |
string2['LSEBELUM'] = string2['LKATA'].shift(1) | |
string2['LSELEPAS'] = string2['LKATA'].shift(-1) | |
string2.fillna("null", inplace=True) | |
#string1 | |
#st.table(string1[:10]) | |
lbl = LabelEncoder() | |
lbl_sen = lbl.fit_transform(string2['LKATA']) | |
lbl_bef = lbl.fit_transform(string2['LSEBELUM']) | |
lbl_aft = lbl.fit_transform(string2['LSELEPAS']) | |
string2 = pd.DataFrame({'LKATA':lbl_sen, 'LSEBELUM': lbl_bef, 'LSELEPAS' : lbl_aft}) | |
#st.dataframe(string2.head()) | |
#Train, test model | |
pred_outdata = knn_model() | |
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 42, stratify = y) | |
pred_knn = KNeighborsClassifier(n_neighbors= 3) | |
#"classifier" VARIABLE from "TEST MODEL USING TESTING DATA" | |
kelas = MultiOutputClassifier(pred_knn, n_jobs = -1) | |
kelas.fit(x_train, y_train) | |
hasil = kelas.predict(string2) | |
#st.write(hasil) | |
fin = [] | |
for z in hasil: | |
if (z == [1, 0, 0]).all(): | |
fin.append("LOKASI") | |
elif (z == [0, 1, 0]).all(): | |
fin.append("MANUSIA") | |
elif (z == [0, 0, 1]).all(): | |
fin.append("ORGANISASI") | |
else: | |
fin.append("LAIN-LAIN") | |
#st.write(fin) | |
global perkata, output | |
perkata = [(key, value) for i, (key, value) in enumerate(zip(string1, fin))] | |
output = pd.DataFrame({"kata" : string1, "entiti" : fin}) | |
#st.dataframe(output.transpose()) | |
return output | |
def get_data(): | |
ts = output | |
return ts | |