import gradio as gr
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
print("✓ NLTK resources downloaded")

# ============================================================================
# CRITICAL: Define TextPreprocessor class BEFORE loading the pickle file
# ============================================================================
class TextPreprocessor:
    """
    Advanced text preprocessing pipeline for sentiment analysis.

    Features:
    - Lemmatization for better word normalization
    - Custom stopword filtering (preserves negation words)
    - URL and email removal
    - Special character cleaning
    - Case normalization
    """

    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.

        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming
            remove_stopwords (bool): Remove stopwords from text
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords

        # Custom stopwords excluding important sentiment words
        self.stop_words = set(stopwords.words('english'))
        # Remove negation words as they're crucial for sentiment
        negation_words = {
            'not', 'no', 'nor', 'neither', 'never', 'none',
            'nothing', 'nowhere', "don't", "doesn't", "didn't",
            "won't", "wouldn't", "can't", "couldn't", "shouldn't",
            "wasn't", "weren't", "hasn't", "haven't", "hadn't"
        }
        self.stop_words = self.stop_words - negation_words

    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a single text string.

        Parameters:
            text (str): Raw text

        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Tokenize
        words = text.split()

        # Remove stopwords if enabled
        if self.remove_stopwords:
            words = [word for word in words if word not in self.stop_words]

        # Apply lemmatization or stemming
        if self.use_lemmatization:
            words = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
            words = [self.lemmatizer.lemmatize(word, pos='n') for word in words]
        else:
            words = [self.stemmer.stem(word) for word in words]

        return ' '.join(words)

    def fit_transform(self, texts):
        """Process multiple texts."""
        return [self.clean_text(text) for text in texts]

    def transform(self, texts):
        """Process multiple texts (alias for fit_transform)."""
        return self.fit_transform(texts)

# ============================================================================
# Load models
# ============================================================================
print("Loading models...")
try:
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    print("✓ Model loaded")

    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("✓ Vectorizer loaded")

    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)
    print("✓ Preprocessor loaded")
except Exception as e:
    print(f"✗ Error loading models: {e}")
    raise

# Feature extraction function
def extract_features(texts, original_texts):
    """Extract statistical features from texts."""
    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [
            np.mean([len(word) for word in text.split()]) if text else 0
            for text in texts
        ],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ]
    }
    return pd.DataFrame(features)
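
# Illustrative sketch (comment only) of the one-row DataFrame this returns for
# an assumed sample review; the first argument is the cleaned text, the second
# the raw text:
#
#   extract_features(["food amazing"], ["The food was AMAZING!!"])
#   #   review_length=22, word_count=2, avg_word_length=5.5,
#   #   exclamation_count=2, question_count=0, capital_ratio≈0.36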

# Prediction function
def predict_sentiment(review_text):
    """Predict sentiment for a review."""
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""

    try:
        # Preprocess
        cleaned = preprocessor.clean_text(review_text)

        # Vectorize
        vectorized = vectorizer.transform([cleaned]).toarray()

        # Extract additional features
        add_features = extract_features([cleaned], [review_text])

        # Combine features
        X_new = np.concatenate([vectorized, add_features.values], axis=1)

        # Predict
        prediction = model.predict(X_new)[0]

        # Get probabilities if available
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            prob_neg = proba[0]
            prob_pos = proba[1]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None

        # Format output (compare against None so a 0.00 probability still displays)
        sentiment = "✅ Positive" if prediction == 1 else "❌ Negative"
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"

        return sentiment, conf_str, neg_str, pos_str, cleaned
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", "", ""

# Create Gradio interface
print("Creating Gradio interface...")

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    gr.Markdown("""
    # 🍽️ Restaurant Review Sentiment Analyzer
    ### AI-Powered Sentiment Analysis with Machine Learning

    Enter a restaurant review to analyze its sentiment in real time!

    **Model:** Advanced ML Classification
    **Accuracy:** 85%+
    **Features:** TF-IDF + Statistical Text Analysis
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="Clear", size="lg")

        with gr.Column(scale=2):
            gr.Markdown("### Analysis Results")
            sentiment_output = gr.Textbox(label="Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
            with gr.Row():
                neg_prob = gr.Textbox(label="Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="Positive Probability", interactive=False)

    with gr.Accordion("Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
        **Preprocessing Steps:**
        1. Convert to lowercase
        2. Remove URLs, emails, and HTML tags
        3. Remove special characters
        4. Remove stopwords (keep negations)
        5. Apply lemmatization
        6. Extract statistical features
        """)

    gr.Markdown("---")
    gr.Markdown("### Try These Example Reviews")
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )

    gr.Markdown("""
    ---
    ### About This Model

    **Machine Learning Pipeline:**
    - **Preprocessing:** Lemmatization, stopword removal, text normalization
    - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
    - **Algorithm:** Ensemble machine learning (Random Forest / SVM / Gradient Boosting)
    - **Accuracy:** 85%+ on test data
    - **Metrics:** High precision, recall, and F1-score

    **Technologies:** Python • Scikit-learn • NLTK • Gradio • Pandas • NumPy

    **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
    """)

    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )

print("✓ Gradio interface created")
print("Launching application...")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)