Spaces:
Sleeping
Sleeping
Commit
·
36395bb
1
Parent(s):
6cf1921
Upload 3 files
Browse files- app.py +90 -65
- test_data.csv +0 -0
- train_data.csv +3 -0
app.py
CHANGED
@@ -1,65 +1,90 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import pandas as pd
|
3 |
-
import
|
4 |
-
import
|
5 |
-
import
|
6 |
-
from nltk.
|
7 |
-
from
|
8 |
-
from
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
#
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
)
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
from nltk.stem import PorterStemmer
|
7 |
+
from nltk.corpus import stopwords
|
8 |
+
from nltk.stem import WordNetLemmatizer
|
9 |
+
from collections import Counter
|
10 |
+
from sklearn.model_selection import train_test_split
|
11 |
+
# Shared lemmatizer instance; preprocess_text() calls lemmatizer.lemmatize() on every token.
lemmatizer = WordNetLemmatizer()
|
12 |
+
# NOTE(review): stemmer is instantiated here but not referenced anywhere else in the visible code.
stemmer = PorterStemmer()
|
13 |
+
import re
|
14 |
+
import nltk
|
15 |
+
import joblib
|
16 |
+
from nltk.corpus import stopwords
|
17 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
18 |
+
from sklearn.linear_model import LogisticRegression
|
19 |
+
|
20 |
+
|
21 |
+
# Download the NLTK resources this script needs if they are not available.
# 'stopwords' backs custom_stopwords below; 'wordnet' is required by
# WordNetLemmatizer.lemmatize(), which otherwise raises LookupError on a
# machine that has never fetched the corpus.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): some NLTK releases also need 'omw-1.4' to load WordNet;
# downloading it is harmless where it is not required - confirm against the
# pinned NLTK version.
nltk.download('omw-1.4')
custom_stopwords = set(stopwords.words('english'))
|
24 |
+
# Paths to datasets
# The CSV files are committed next to this script, so resolve them relative
# to the script's own directory instead of a machine-specific absolute path
# (the previous E:\Projects\... location only exists on the author's PC).
from pathlib import Path

_DATA_DIR = Path(__file__).resolve().parent
train_path = str(_DATA_DIR / "train_data.csv")
test_path = str(_DATA_DIR / "test_data.csv")

# Load datasets
train_df = pd.read_csv(train_path)
|
30 |
+
|
31 |
+
# Text preprocessing function
|
32 |
+
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps, in order: strip HTML markup (only when both "<" and ">" occur),
    replace runs of non-word characters with a space, delete digits,
    lowercase, split on whitespace, drop tokens found in
    ``custom_stopwords`` and lemmatize what remains.

    Returns an empty string when ``text`` is not a ``str``.
    """
    # Guard clause: anything that is not a string yields an empty document.
    if not isinstance(text, str):
        return ""

    # Only invoke the HTML parser when the text could contain a tag.
    if "<" in text and ">" in text:
        text = BeautifulSoup(text, "html.parser").get_text()

    # Special characters -> space, then digits removed, then lowercased.
    without_symbols = re.sub(r'\W+', ' ', text)
    without_digits = re.sub(r'\d+', '', without_symbols)
    normalized = without_digits.lower()

    # Whitespace tokenization, stopword filtering and lemmatization.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in custom_stopwords
    )
    return ' '.join(lemmas)
|
53 |
+
|
54 |
+
# Apply preprocessing
# Missing reviews are filled with "" before the string cast: astype(str) on
# its own turns NaN into the literal text "nan", which would survive
# preprocessing and be learned as a real token.
train_df['cleaned_review'] = (
    train_df['review'].fillna("").astype(str).apply(preprocess_text)
)

# Train the model
# TF-IDF over unigrams and bigrams, capped at the 5000 top features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(train_df['cleaned_review'])
y_train = train_df['sentiment']

model = LogisticRegression(max_iter=500)
model.fit(X_train_tfidf, y_train)

# Save the model and vectorizer
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Load model and vectorizer for prediction
# NOTE(review): this reloads the artifacts written just above, so the
# objects used for prediction are the persisted copies of what was trained
# in this same run.
model = joblib.load("sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")
|
72 |
+
|
73 |
+
# Gradio prediction function
|
74 |
+
def predict_sentiment(review):
    """Return the model's sentiment label for ``review`` as a display string.

    The review is cleaned with ``preprocess_text``, vectorized with the
    module-level TF-IDF ``vectorizer`` and classified by the module-level
    ``model``; the first (only) prediction is embedded in the result text.
    """
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([preprocess_text(review)])
    label = model.predict(features)[0]
    return f"Predicted Sentiment: {label}"
|
79 |
+
|
80 |
+
# Gradio UI
# One text box in, one text box out, wired to predict_sentiment.
_review_input = gr.Textbox(label="Enter a Review")
_sentiment_output = gr.Textbox(label="Sentiment Prediction")

interface = gr.Interface(
    title="Movie Review Sentiment Analysis App",
    description="Enter a review, and the model will predict if it's Positive, Negative, or Neutral.",
    fn=predict_sentiment,
    inputs=_review_input,
    outputs=_sentiment_output,
)

# Launch the app
interface.launch()
|
test_data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91d028cbb0228d5f70b3175f4ae5597c11ac42de4a77df5bfbdb19f00aa5f827
|
3 |
+
size 6120687
|