Upload prediction.py
prediction.py
ADDED
@@ -0,0 +1,82 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+from tensorflow.keras.models import load_model
+import re
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+nltk.download('wordnet')
+from nltk.stem import WordNetLemmatizer
+
+# Load the model
+loaded_model = load_model('model_rnn')
+
+# Create a dictionary to map the labels to the categories
+label_dict = {0: 'Uang Masuk', 1: 'Uang Keluar', 2: 'Pinjaman', 3: 'Tagihan', 4: 'Top Up',
+              5: 'Biaya & Lainnya', 6: 'Transportasi', 7: 'Pendidikan', 8: 'Hadiah & Amal',
+              9: 'Belanja', 10: 'Hiburan', 11: 'Makanan & Minuman', 12: 'Kesehatan',
+              13: 'Perawatan Diri', 14: 'Hobi & Gaya Hidup', 15: 'Pencairan Investasi',
+              16: 'Tabungan & Investasi'}
+
+def preprocessing(text):
+    '''
+    Preprocess text by applying lowercasing, normalization, tokenization, stopword removal, and lemmatization
+    '''
+    # Lowercase the text
+    text = text.lower()
+
+    # Normalize the text
+    text = re.sub(r'\d+', '', text)  # Remove numbers
+    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace
+
+    # Tokenize the text
+    tokens = word_tokenize(text)
+
+    # Get the Indonesian stopwords, plus a few informal/English extras
+    stop_words = set(stopwords.words('indonesian'))
+    stop_words.update(['the', 'yg', 'gk', 'nyagak', 'pake', 'pakai', 'i', 'and'])
+
+    # Remove stopwords
+    tokens = [word for word in tokens if word not in stop_words]
+
+    # Lemmatize the tokens
+    lemmatizer = WordNetLemmatizer()
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+
+    # Combine tokens back into a single string
+    text = ' '.join(tokens)
+
+    return text
+
+def run():
+    st.title('Notes Categorization')
+
+    default = "konser twice"
+
+    user_input = st.text_area("Enter the notes text here:", default, height=50)
+
+    if st.button('Predict'):
+        # Preprocess the user input
+        text_processed = preprocessing(user_input)
+
+        # The model expects input in batches, even when predicting on a single
+        # sample, so add a batch dimension with np.expand_dims
+        preprocessed_notes = np.expand_dims(text_processed, axis=0)
+
+        # Get the prediction
+        predictions = loaded_model.predict(preprocessed_notes)
+
+        # Get the class with the highest probability
+        predicted_class = np.argmax(predictions[0])
+
+        # Decode the predicted class into the original category
+        predicted_category = label_dict[predicted_class]
+
+        st.write(f'The predicted category is: {predicted_category}')
+
+if __name__ == '__main__':
+    run()
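
One detail worth flagging in the hunk above: np.expand_dims on a bare Python string just wraps it into a shape-(1,) string array, i.e. a batch of one raw text. That only feeds cleanly into loaded_model.predict if the saved model_rnn embeds its own text vectorization (for example a Keras TextVectorization layer), which this diff does not show, so treat that as an assumption. A minimal standalone check of the batching step:

    import numpy as np

    # A bare string becomes a one-element string array: a "batch" of one raw text.
    # Whether model_rnn accepts this depends on it embedding its own vectorization
    # layer, an assumption here, since the model architecture isn't part of this diff.
    batch = np.expand_dims("konser twice", axis=0)
    print(batch.shape, batch.dtype)  # (1,) <U12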
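
The argmax-plus-dictionary decode at the end can likewise be sanity-checked without the model by feeding it a made-up probability vector (the vector and the one-entry mapping below are illustrative stand-ins, not values from the app):

    import numpy as np

    # Illustrative stand-in for loaded_model.predict output: 17 classes, batch of 1.
    fake_predictions = np.zeros((1, 17))
    fake_predictions[0, 10] = 1.0                # pretend class 10 ('Hiburan') wins
    predicted_class = int(np.argmax(fake_predictions[0]))
    label_subset = {10: 'Hiburan'}               # subset of the app's label_dict
    print(label_subset[predicted_class])         # Hiburan

Running the app itself is the usual streamlit run prediction.py, assuming the saved model_rnn sits next to the script and the NLTK downloads succeed.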