import re

import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import load_model

# Download the NLTK resources needed for tokenization, stopwords, and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Load the model
loaded_model = load_model('model_rnn')
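# NOTE (assumption): the saved model is presumed to embed its own text
# vectorization/embedding layers, since run() below feeds it raw
# preprocessed strings rather than token IDs.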
# Create a dictionary to map the labels to the categories
label_dict = {
    0: 'Uang Masuk', 1: 'Uang Keluar', 2: 'Pinjaman', 3: 'Tagihan',
    4: 'Top Up', 5: 'Biaya & Lainnya', 6: 'Transportasi', 7: 'Pendidikan',
    8: 'Hadiah & Amal', 9: 'Belanja', 10: 'Hiburan', 11: 'Makanan & Minuman',
    12: 'Kesehatan', 13: 'Perawatan Diri', 14: 'Hobi & Gaya Hidup',
    15: 'Pencairan Investasi', 16: 'Tabungan & Investasi',
}
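# The category names are Indonesian and are kept as-is because the model was
# trained on them, e.g. 'Uang Masuk' = money in, 'Uang Keluar' = money out,
# 'Tagihan' = bills, 'Belanja' = shopping, 'Makanan & Minuman' = food & drink.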
def preprocessing(text):
    '''
    Preprocess text by lowercasing, normalization, tokenization,
    stopword removal, and lemmatization.
    '''
    # Lowercase the text
    text = text.lower()
    # Normalize the text
    text = re.sub(r'\d+', '', text)            # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()   # Collapse extra whitespace
    # Tokenize the text
    tokens = word_tokenize(text)
    # Get the Indonesian stopwords, plus a few informal and English extras
    stop_words = set(stopwords.words('indonesian'))
    stop_words.update(['the', 'yg', 'gk', 'nyagak', 'pake', 'pakai', 'i', 'and'])
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Combine tokens back into a single string
    text = ' '.join(tokens)
    return text
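# Illustrative example (not from the original): preprocessing("Bayar tagihan
# listrik 150000!") should yield roughly "bayar tagihan listrik" after number
# and punctuation removal; the English WordNet lemmatizer generally leaves
# Indonesian tokens unchanged.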
def run():
    st.title('Notes Categorization')
    default = "konser twice"
    user_input = st.text_area("Enter the notes text here:", default, height=50)
    if st.button('Predict'):
        # Preprocess the user's note text
        text_processed = preprocessing(user_input)
        # The model expects input in batches, even when predicting on a
        # single sample, so add a batch dimension with np.expand_dims
        preprocessed_notes = np.expand_dims(text_processed, axis=0)
        # Get the prediction
        predictions = loaded_model.predict(preprocessed_notes)
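        # predictions should have shape (batch_size, num_classes), i.e.
        # (1, 17) here given the 17 categories in label_dict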
        # Get the class with the highest probability
        predicted_class = np.argmax(predictions[0])
        # Decode the predicted class into the original category
        predicted_category = label_dict[predicted_class]
        st.write(f'The predicted category is: {predicted_category}')
if __name__ == '__main__':
    run()