ahmed-ayman committed on
Commit
36395bb
·
1 Parent(s): 6cf1921

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +90 -65
  2. test_data.csv +0 -0
  3. train_data.csv +3 -0
app.py CHANGED
@@ -1,65 +1,90 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import re
4
- import nltk
5
- import joblib
6
- from nltk.corpus import stopwords
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from sklearn.linear_model import LogisticRegression
9
-
10
- # Download stopwords if not available
11
- nltk.download('stopwords')
12
-
13
- # Paths to datasets
14
- train_path = r"train_data.csv"
15
- test_path = r"test_data.csv"
16
-
17
- # Load datasets
18
- train_df = pd.read_csv(train_path)
19
-
20
- # Text preprocessing function
21
- def preprocess_text(text):
22
- text = text.lower() # Convert to lowercase
23
- text = re.sub(r'\W', ' ', text) # Remove special characters
24
- text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
25
- stop_words = set(stopwords.words('english')) # Load stopwords
26
- text = ' '.join(word for word in text.split() if word not in stop_words) # Remove stopwords
27
- return text
28
-
29
- # Apply preprocessing
30
- train_df['cleaned_review'] = train_df['review'].astype(str).apply(preprocess_text)
31
-
32
- # Train the model
33
- vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
34
- X_train_tfidf = vectorizer.fit_transform(train_df['cleaned_review'])
35
- y_train = train_df['sentiment']
36
-
37
- model = LogisticRegression(max_iter=500)
38
- model.fit(X_train_tfidf, y_train)
39
-
40
- # Save the model and vectorizer
41
- joblib.dump(model, "sentiment_model.pkl")
42
- joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
43
-
44
- # Load model and vectorizer for prediction
45
- model = joblib.load("sentiment_model.pkl")
46
- vectorizer = joblib.load("tfidf_vectorizer.pkl")
47
-
48
- # Gradio prediction function
49
- def predict_sentiment(review):
50
- processed_review = preprocess_text(review) # Preprocess input
51
- review_tfidf = vectorizer.transform([processed_review]) # Convert to TF-IDF
52
- prediction = model.predict(review_tfidf)[0] # Get prediction
53
- return f"Predicted Sentiment: {prediction}"
54
-
55
- # Gradio UI
56
- interface = gr.Interface(
57
- fn=predict_sentiment,
58
- inputs=gr.Textbox(label="Enter a Review"),
59
- outputs=gr.Textbox(label="Sentiment Prediction"),
60
- title=" Movie Review Sentiment Analysis App ",
61
- description="Enter a review, and the model will predict if it's Positive, Negative, or Neutral."
62
- )
63
-
64
- # Launch the app
65
- interface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from bs4 import BeautifulSoup
5
+ from sklearn.model_selection import train_test_split
6
+ from nltk.stem import PorterStemmer
7
+ from nltk.corpus import stopwords
8
+ from nltk.stem import WordNetLemmatizer
9
+ from collections import Counter
10
+ from sklearn.model_selection import train_test_split
11
+ lemmatizer = WordNetLemmatizer()
12
+ stemmer = PorterStemmer()
13
+ import re
14
+ import nltk
15
+ import joblib
16
+ from nltk.corpus import stopwords
17
+ from sklearn.feature_extraction.text import TfidfVectorizer
18
+ from sklearn.linear_model import LogisticRegression
19
+
20
+
21
+ # Download stopwords if not available
22
+ nltk.download('stopwords')
23
+ custom_stopwords = set(stopwords.words('english'))
24
+ # Paths to datasets
25
+ train_path = r"E:\Projects\Sentiment Analysis Project DEPI\train_data.csv"
26
+ test_path = r"E:\Projects\Sentiment Analysis Project DEPI\test_data.csv"
27
+
28
+ # Load datasets
29
+ train_df = pd.read_csv(train_path)
30
+
31
+ # Text preprocessing function
32
+ def preprocess_text(text):
33
+ if isinstance(text, str): # Ensure text is a string
34
+ ## REMOVE HTML
35
+ if "<" in text and ">" in text:
36
+ text = BeautifulSoup(text, "html.parser").get_text()
37
+ ## CLEANING
38
+ # Remove special characters
39
+ text = re.sub(r'\W+', ' ', text)
40
+ # Remove digits
41
+ text = re.sub(r'\d+', '', text)
42
+ ## LOWERCASING
43
+ text = text.lower()
44
+ ## TOKENIZATION
45
+ words = text.split()
46
+ ## REMOVE STOPWORDS
47
+ words = [w for w in words if w not in custom_stopwords]
48
+ ## APPLY LEMMATIZATION
49
+ words = [lemmatizer.lemmatize(w) for w in words]
50
+ ## RETURN CLEANED TEXT
51
+ return ' '.join(words)
52
+ return ""
53
+
54
+ # Apply preprocessing
55
+ train_df['cleaned_review'] = train_df['review'].astype(str).apply(preprocess_text)
56
+
57
+ # Train the model
58
+ vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
59
+ X_train_tfidf = vectorizer.fit_transform(train_df['cleaned_review'])
60
+ y_train = train_df['sentiment']
61
+
62
+ model = LogisticRegression(max_iter=500)
63
+ model.fit(X_train_tfidf, y_train)
64
+
65
+ # Save the model and vectorizer
66
+ joblib.dump(model, "sentiment_model.pkl")
67
+ joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
68
+
69
+ # Load model and vectorizer for prediction
70
+ model = joblib.load("sentiment_model.pkl")
71
+ vectorizer = joblib.load("tfidf_vectorizer.pkl")
72
+
73
+ # Gradio prediction function
74
+ def predict_sentiment(review):
75
+ processed_review = preprocess_text(review) # Preprocess input
76
+ review_tfidf = vectorizer.transform([processed_review]) # Convert to TF-IDF
77
+ prediction = model.predict(review_tfidf)[0] # Get prediction
78
+ return f"Predicted Sentiment: {prediction}"
79
+
80
+ # Gradio UI
81
+ interface = gr.Interface(
82
+ fn=predict_sentiment,
83
+ inputs=gr.Textbox(label="Enter a Review"),
84
+ outputs=gr.Textbox(label="Sentiment Prediction"),
85
+ title="Movie Review Sentiment Analysis App",
86
+ description="Enter a review, and the model will predict if it's Positive, Negative, or Neutral."
87
+ )
88
+
89
+ # Launch the app
90
+ interface.launch()
test_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91d028cbb0228d5f70b3175f4ae5597c11ac42de4a77df5bfbdb19f00aa5f827
3
+ size 6120687