imdb-reviews / a.py
DevBM's picture
Upload 6 files
571bf3f verified
raw
history blame
4.79 kB
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pickle
import matplotlib.pyplot as plt
# Page heading; the ":blue[...]" wrapper is Streamlit's colored-text directive.
page_title = ":blue[IMDB Dataset of 50k reviews]"
st.title(page_title)
@st.cache_data
def load_data(path='IMDB Dataset.csv'):
    """Read the IMDB reviews CSV into a DataFrame.

    The function is wrapped in ``st.cache_data``, so repeated calls with the
    same ``path`` reuse the cached result instead of reading the file again.

    Args:
        path: Location of the CSV file. Defaults to ``'IMDB Dataset.csv'``,
            resolved relative to the current working directory. The rest of
            the script reads the ``review`` and ``sentiment`` columns.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)
# Seed the session-state slots used further down, leaving any value that is
# already present untouched:
#   models     - fitted classifiers keyed by display name
#   vectorizer - the fitted TF-IDF vectorizer (None until training has run)
#   accuracy   - accuracy score per model name
#   report     - classification report per model name
for slot, initial in (
    ('models', {}),
    ('vectorizer', None),
    ('accuracy', {}),
    ('report', {}),
):
    if slot not in st.session_state:
        st.session_state[slot] = initial
# Dataset
st.header("Dataset")
df = load_data()
with st.expander("Show Data"):
    # Displayed before the label encoding below, so the raw text labels show.
    st.write(df)

# Encode the text labels as integers for the classifiers. Series.map yields
# NaN for any label that is not a key of this dict.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
X = df['review']
y = df['sentiment']

# Fixed random_state keeps the 80/20 split identical across script reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)

# NOTE: no TF-IDF fitting happens here. The training step below fits its own
# vectorizer (kept in st.session_state) and builds the train/test matrices
# itself, so fitting another vectorizer at this point would only repeat that
# work on every rerun and then throw the result away.
# Train only while no fitted models are stored in session state; afterwards
# the stored models, vectorizer and metrics are reused.
if not st.session_state.models:
    st.session_state.vectorizer = TfidfVectorizer()
    X_train_tfidf = st.session_state.vectorizer.fit_transform(X_train)
    # The test matrix does not depend on the model, so build it once here
    # instead of re-transforming the test reviews for every classifier.
    X_test_tfidf = st.session_state.vectorizer.transform(X_test)

    # models
    models = {
        # "SVM": SVC(kernel='linear'),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(),
    }
    for name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        st.session_state.models[name] = model

        y_pred = model.predict(X_test_tfidf)
        st.session_state.accuracy[name] = accuracy_score(y_test, y_pred)
        # Store the report as a table (one row per class/average) rather than
        # as preformatted text, because it is rendered with st.dataframe.
        report = classification_report(y_test, y_pred, output_dict=True)
        st.session_state.report[name] = pd.DataFrame(report).transpose()
# Show the evaluation results collected during training.
if st.session_state.accuracy:
    # Draw on an explicit Figure instead of pyplot's implicit global one, and
    # close it once rendered so figures do not accumulate across reruns.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(
        list(st.session_state.accuracy.keys()),
        list(st.session_state.accuracy.values()),
        color=['blue', 'orange', 'green'],
    )
    ax.set_ylabel('Accuracy')
    ax.set_title('Model Accuracy Comparison')
    st.pyplot(fig)
    plt.close(fig)

for name, report in st.session_state.report.items():
    st.write(f"### Classification Report for {name}:")
    if isinstance(report, str):
        # A text report is a preformatted, column-aligned block: show it
        # verbatim in fixed-width text rather than as a table.
        st.text(report)
    else:
        st.dataframe(report)
st.header("Manual Tryouts", divider='orange')
# Input text from the user
user_input = st.text_area("Enter your Review", "")
if st.button("Predict"):
    # Treat whitespace-only input the same as an empty box: there is nothing
    # to classify, so ask for a review instead of predicting on blank text.
    if user_input.strip():
        # Vectorize once with the fitted vectorizer; all models share the features.
        user_input_tfidf = st.session_state.vectorizer.transform([user_input])
        # Label 1 is 'positive' and 0 is 'negative' (see the sentiment mapping).
        predictions = {
            name: "Positive" if model.predict(user_input_tfidf)[0] == 1 else "Negative"
            for name, model in st.session_state.models.items()
        }
        # Display predictions for each model
        st.write("Predicted Sentiment:")
        for name, sentiment in predictions.items():
            st.write(f"{name}: **{sentiment}**")
    else:
        st.write("Please enter a review.")
# # Logistic Regression
# st.header('Logistic Regression',divider='orange')
# model = LogisticRegression()
# model.fit(X_train_tfidf, y_train)
# y_pred = model.predict(X_test_tfidf)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# filename = 'linear_regression_model.pkl'
# with open(filename, 'wb') as model_file:
# pickle.dump(model, model_file)
# st.write("Accuracy:", accuracy_score(y_test, y_pred))
# st.markdown(body=classification_report(y_test, y_pred),unsafe_allow_html=True)
# # Naive Bayes
# st.header("Naive Bayes",divider='orange')
# model_nb = MultinomialNB()
# model_nb.fit(X_train_tfidf, y_train)
# # Evaluate the model
# y_pred = model_nb.predict(X_test_tfidf)
# st.write("Accuracy:", accuracy_score(y_test, y_pred))
# st.markdown(body=classification_report(y_test, y_pred),unsafe_allow_html=True)
# # SVM
# st.header("Support Vector Machine")
# st.caption("Kernel type is linear.")
# model = SVC(kernel='linear') # You can also try 'rbf', 'poly', etc.
# model.fit(X_train_tfidf, y_train)
# y_pred = model.predict(X_test_tfidf)
# st.write("Accuracy:", accuracy_score(y_test, y_pred))
# st.markdown(body=classification_report(y_test, y_pred),unsafe_allow_html=True)