import gradio as gr
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
print("✓ NLTK resources downloaded")

# ============================================================================
# CRITICAL: Define TextPreprocessor class BEFORE loading the pickle file
# ============================================================================
class TextPreprocessor:
    """
    Advanced text preprocessing pipeline for sentiment analysis.

    Features:
    - Lemmatization for better word normalization
    - Custom stopword filtering (preserves negation words)
    - URL and email removal
    - Special character cleaning
    - Case normalization
    """

    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.

        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming
            remove_stopwords (bool): Remove stopwords from text
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords

        # Custom stopwords excluding important sentiment words
        self.stop_words = set(stopwords.words('english'))
        # Remove negation words as they're crucial for sentiment
        negation_words = {
            'not', 'no', 'nor', 'neither', 'never', 'none',
            'nothing', 'nowhere', "don't", "doesn't", "didn't",
            "won't", "wouldn't", "can't", "couldn't", "shouldn't",
            "wasn't", "weren't", "hasn't", "haven't", "hadn't"
        }
        self.stop_words = self.stop_words - negation_words

    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a single text string.

        Parameters:
            text (str): Raw text

        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Tokenize
        words = text.split()

        # Remove stopwords if enabled
        if self.remove_stopwords:
            words = [word for word in words if word not in self.stop_words]

        # Apply lemmatization or stemming
        if self.use_lemmatization:
            words = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
            words = [self.lemmatizer.lemmatize(word, pos='n') for word in words]
        else:
            words = [self.stemmer.stem(word) for word in words]

        return ' '.join(words)

    def fit_transform(self, texts):
        """Process multiple texts."""
        return [self.clean_text(text) for text in texts]

    def transform(self, texts):
        """Process multiple texts (alias for fit_transform)."""
        return self.fit_transform(texts)

# ============================================================================
# Load models
# ============================================================================
print("Loading models...")
try:
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    print("✓ Model loaded")

    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("✓ Vectorizer loaded")

    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)
    print("✓ Preprocessor loaded")
except Exception as e:
    print(f"✗ Error loading models: {e}")
    raise

# Feature extraction function
def extract_features(texts, original_texts):
    """Extract statistical features from texts."""
    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [
            np.mean([len(word) for word in text.split()]) if text else 0
            for text in texts
        ],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ]
    }
    return pd.DataFrame(features)
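
# Illustrative sketch (comment only) of the one-row DataFrame this returns for
# an assumed sample review; the first argument is the cleaned text, the second
# the raw text:
#
#   extract_features(["food amazing"], ["The food was AMAZING!!"])
#   #   review_length=22, word_count=2, avg_word_length=5.5,
#   #   exclamation_count=2, question_count=0, capital_ratio≈0.36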

# Prediction function
def predict_sentiment(review_text):
    """Predict sentiment for a review."""
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""

    try:
        # Preprocess
        cleaned = preprocessor.clean_text(review_text)

        # Vectorize
        vectorized = vectorizer.transform([cleaned]).toarray()

        # Extract additional features
        add_features = extract_features([cleaned], [review_text])

        # Combine features
        X_new = np.concatenate([vectorized, add_features.values], axis=1)

        # Predict
        prediction = model.predict(X_new)[0]

        # Get probabilities if available
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            prob_neg = proba[0]
            prob_pos = proba[1]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None

        # Format output (compare against None so a 0.00 probability still displays)
        sentiment = "✅ Positive" if prediction == 1 else "❌ Negative"
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"

        return sentiment, conf_str, neg_str, pos_str, cleaned
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", "", ""

# Create Gradio interface
print("Creating Gradio interface...")

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    gr.Markdown("""
    # 🍽️ Restaurant Review Sentiment Analyzer
    ### AI-Powered Sentiment Analysis with Machine Learning

    Enter a restaurant review to analyze its sentiment in real time!

    **Model:** Advanced ML Classification
    **Accuracy:** 85%+
    **Features:** TF-IDF + Statistical Text Analysis
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="Clear", size="lg")

        with gr.Column(scale=2):
            gr.Markdown("### Analysis Results")
            sentiment_output = gr.Textbox(label="Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
            with gr.Row():
                neg_prob = gr.Textbox(label="Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="Positive Probability", interactive=False)

    with gr.Accordion("Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
        **Preprocessing Steps:**
        1. Convert to lowercase
        2. Remove URLs, emails, and HTML tags
        3. Remove special characters
        4. Remove stopwords (keep negations)
        5. Apply lemmatization
        6. Extract statistical features
        """)

    gr.Markdown("---")
    gr.Markdown("### Try These Example Reviews")
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )

    gr.Markdown("""
    ---
    ### About This Model

    **Machine Learning Pipeline:**
    - **Preprocessing:** Lemmatization, stopword removal, text normalization
    - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
    - **Algorithm:** Ensemble machine learning (Random Forest / SVM / Gradient Boosting)
    - **Accuracy:** 85%+ on test data
    - **Metrics:** High precision, recall, and F1-score

    **Technologies:** Python • Scikit-learn • NLTK • Gradio • Pandas • NumPy

    **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
    """)

    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )

print("✓ Gradio interface created")
print("Launching application...")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)