didev007 committed on
Commit
3ea0cf8
1 Parent(s): 24c0483

Upload prediction.py

Browse files
Files changed (1) hide show
  1. prediction.py +82 -0
prediction.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tensorflow.keras.models import load_model
5
+ import re
6
+ import nltk
7
+ nltk.download('punkt')
8
+ from nltk.tokenize import word_tokenize
9
+ nltk.download('stopwords')
10
+ from nltk.corpus import stopwords
11
+ nltk.download('wordnet')
12
+ from nltk.stem import WordNetLemmatizer
13
+
14
+ # Load the model
15
+ loaded_model = load_model('model_rnn')
16
+
17
+ # Create a dictionary to map the labels to the categories
18
+ label_dict = {0: 'Uang Masuk', 1: 'Uang Keluar', 2: 'Pinjaman', 3: 'Tagihan', 4: 'Top Up',
19
+ 5: 'Biaya & Lainnya', 6: 'Transportasi', 7: 'Pendidikan', 8: 'Hadiah & Amal',
20
+ 9: 'Belanja', 10: 'Hiburan',11: 'Makanan & Minuman', 12: 'Kesehatan',
21
+ 13: 'Perawatan Diri', 14: 'Hobi & Gaya Hidup', 15: 'Pencairan Investasi',
22
+ 16: 'Tabungan & Investasi'}
23
+
24
+ def preprocessing(text):
25
+ '''
26
+ Preprocessing text by applying lowercasing, normalization, tokenization, stopword removal, and lemmatization
27
+ '''
28
+ # Lowercase the text
29
+ text = text.lower()
30
+
31
+ # Normalize the text
32
+ text = re.sub(r'\d+', '', text) # Remove numbers
33
+ text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
34
+ text = re.sub(r'\s+', ' ', text).strip() # Remove whitespaces
35
+
36
+ # Tokenize the text
37
+ tokens = word_tokenize(text)
38
+
39
+ # Get the English stopwords
40
+ stop_words = set(stopwords.words('indonesian'))
41
+ stop_words.update(['the', 'yg', 'gk', 'nyagak', 'pake', 'pakai', 'i', "and"])
42
+
43
+ # Remove stopwords
44
+ tokens = [word for word in tokens if word not in stop_words]
45
+
46
+ # Lemmatize the text
47
+ lemmatizer = WordNetLemmatizer()
48
+ tokens = [lemmatizer.lemmatize(word) for word in tokens]
49
+
50
+ # Combine tokens back into a single string
51
+ text = ' '.join(tokens)
52
+
53
+ return text
54
+
55
+ def run():
56
+ st.title('Notes Categorization')
57
+
58
+ default = "konser twice"
59
+
60
+ user_input = st.text_area("Enter the notes text here:", default, height=50)
61
+
62
+ if st.button('Predict'):
63
+ # Apply the function to the 'Text' column in the data
64
+ text_processed = preprocessing(user_input)
65
+
66
+ # The model expects input data in batch, even if just predicting on one sample
67
+ # So, I'll add an extra dimension with np.expand_dims
68
+ preprocessed_notes = np.expand_dims(text_processed, axis=0)
69
+
70
+ # get the prediction
71
+ predictions = loaded_model.predict(preprocessed_notes)
72
+
73
+ # get the class with the highest probability
74
+ predicted_class = np.argmax(predictions[0])
75
+
76
+ # Decode the predicted class into the original category
77
+ predicted_category = label_dict[predicted_class]
78
+
79
+ st.write(f'The predicted category is: {predicted_category}')
80
+
81
+ if __name__ == '__main__':
82
+ main()