# NOTE: this file was recovered from a Hugging Face Space page scrape
# (status header "Spaces: Sleeping" and trailing "| |" artifacts removed).
import streamlit as st
import pandas as pd
import joblib
from preprocessing import clean_text, case_folding, tokenize, normalized_term, remove_stopwords, stem_text
# Registry of saved artifacts per model choice: the trained SVM, its
# TF-IDF vectorizer, and the list of selected feature (column) names.
models = {
    'Model': {
        'svm': 'svm_model.pkl',
        'vectorizer': 'tfidf_prabowo_final.pkl',
        'features': 'prabowo_selected.pkl',
    },
}

# Model choice from the sidebar.
selected_model = st.sidebar.selectbox('Choose Model:', list(models.keys()))

# Load the artifacts for the chosen model. A separate name is used for the
# config dict so it no longer shadows the selected model key.
model_config = models[selected_model]
model = joblib.load(model_config['svm'])
vectorizer = joblib.load(model_config['vectorizer'])
features = joblib.load(model_config['features'])

st.title('Website Prediksi Analisis Sentimen Calon Presiden Indonesia 2024')

# Keyword lists that short-circuit the classifier: a text containing any of
# these words is labelled directly without running the SVM.
positive_keywords = ['menang', 'bagus', 'keren', 'mantap', 'bijak', 'berhasil']
negative_keywords = ['bangsat', 'anjing', 'hoax', 'tai', 'babi', 'bajingan',
                     'penipu', 'pembohong', 'telek', 'sialan', 'gagal',
                     'kalah', 'jelek']
def contains_positive_keywords(text, keywords):
    """Return True if *text* contains any of the given positive keywords.

    Matching is a plain substring test (case-sensitive), so e.g.
    'menang' also matches inside 'pemenang'.
    """
    return any(keyword in text for keyword in keywords)
# Checks whether the text contains any negative keyword.
def contains_negative_keywords(text, keywords):
    """Return True if *text* contains any of the given negative keywords.

    Same substring semantics as contains_positive_keywords.
    """
    return any(keyword in text for keyword in keywords)
# Input choice: free text or an uploaded file.
option = st.sidebar.selectbox('Choose Input Option:', ['Text', 'File'])


def _preprocess(text):
    """Run the full preprocessing pipeline on *text*.

    Returns every intermediate stage plus the final space-joined string
    that is fed to the TF-IDF vectorizer.
    """
    cleaned = clean_text(text)
    folded = case_folding(cleaned)
    tokens = tokenize(folded)
    normalized = normalized_term(tokens)
    no_stopwords = remove_stopwords(normalized)
    stemmed = stem_text(no_stopwords)
    return cleaned, folded, tokens, normalized, no_stopwords, stemmed, ' '.join(stemmed)


def _steps_frame(cleaned, folded, tokens, normalized, no_stopwords, stemmed):
    """Build the DataFrame displaying each preprocessing stage."""
    return pd.DataFrame({
        'Step': ['Cleaning', 'Case Folding', 'Tokenization', 'Normalization',
                 'Stopword Removal', 'Stemming'],
        'Result': [cleaned, folded, tokens, normalized, no_stopwords, stemmed],
    })


def _svm_features(final_texts):
    """Vectorize *final_texts* and keep only the columns the SVM was trained on."""
    tfidf = vectorizer.transform(final_texts).toarray()
    return pd.DataFrame(tfidf, columns=vectorizer.get_feature_names_out())[features].values


if option == 'Text':
    user_input = st.text_area('Enter Text:', '')
    if st.button('Analyze'):
        # Preprocess once, inside the button handler (the original ran the
        # whole pipeline twice, and on every rerun before Analyze was pressed).
        (cleaned_text, folded_text, tokenized_text, normalized_text,
         wstopword_text, stemmed_text, done_text) = _preprocess(user_input)
        st.subheader('Preprocessing Steps:')
        st.write(_steps_frame(cleaned_text, folded_text, tokenized_text,
                              normalized_text, wstopword_text, stemmed_text))
        # Keyword override first; fall back to the SVM otherwise.
        if contains_positive_keywords(user_input, positive_keywords):
            st.write('Sentiment: Positif')
        elif contains_negative_keywords(user_input, negative_keywords):
            st.write('Sentiment: Negatif')
        else:
            prediction = model.predict(_svm_features([done_text]))
            st.write('Sentiment:', prediction[0])
elif option == 'File':
    # Excel/CSV upload: one text per row in the 'text' column.
    uploaded_file = st.file_uploader('Upload Excel/CSV File:', type=['csv', 'xlsx'])
    if uploaded_file is not None:
        df = pd.read_excel(uploaded_file) if uploaded_file.name.endswith('xlsx') else pd.read_csv(uploaded_file)
        # Preprocess the 'text' column, keeping each stage as its own column.
        df['Cleaned'] = df['text'].apply(clean_text)
        df['Case Folded'] = df['Cleaned'].apply(case_folding)
        df['Tokenized'] = df['Case Folded'].apply(tokenize)
        df['Normalized'] = df['Tokenized'].apply(normalized_term)
        df['Stopword Removal'] = df['Normalized'].apply(remove_stopwords)
        df['Stemmed'] = df['Stopword Removal'].apply(stem_text)
        df['Text Final'] = df['Stemmed'].apply(lambda words: ' '.join(words))
        # BUG FIX: the per-row prediction loop was commented out, leaving
        # `predictions` empty, so the assignment below raised a ValueError
        # for every upload. Predict all rows in one batch, then apply the
        # same keyword override used in the Text branch.
        model_preds = model.predict(_svm_features(df['Text Final']))
        predictions = []
        for raw_text, model_pred in zip(df['text'], model_preds):
            if contains_positive_keywords(raw_text, positive_keywords):
                predictions.append('Positif')
            elif contains_negative_keywords(raw_text, negative_keywords):
                predictions.append('Negatif')
            else:
                predictions.append(model_pred)
        df['Sentiment'] = predictions
        # Label distribution, then the full preprocessing table.
        st.subheader('Sentiment Distribution:')
        st.write(df['Sentiment'].value_counts())
        st.subheader('Preprocessing Steps:')
        st.write(df)