pravinai committed on
Commit
b9ab02b
·
verified ·
1 Parent(s): 3236276

Add ml_training_pipeline.py

Files changed (1)
ml_training_pipeline.py +844 -0
ml_training_pipeline.py ADDED
@@ -0,0 +1,844 @@
"""
SentilensAI - Machine Learning Training Pipeline

This module provides comprehensive machine learning capabilities for training
custom sentiment analysis models specifically optimized for AI chatbot conversations.

Features:
- Multiple ML algorithms (Random Forest, SVM, Neural Networks, XGBoost, etc.)
- Advanced feature engineering for chatbot text
- Cross-validation and hyperparameter tuning
- Model comparison and evaluation
- Production-ready model persistence
- Real-time prediction capabilities

Author: Pravin Selvamuthu
Repository: https://github.com/kernelseed/sentilens-ai
"""

import os
import json
import pickle
import logging
from typing import Dict, List, Tuple, Optional, Any, Union
from datetime import datetime
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
import joblib

# Advanced ML libraries
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False

try:
    import catboost as cb
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False

# Visualization
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    PLOTTING_AVAILABLE = True
except ImportError:
    PLOTTING_AVAILABLE = False

# LangChain integration
from langchain.schema import BaseMessage
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI

# Import our sentiment analyzer
from sentiment_analyzer import SentilensAIAnalyzer, SentimentResult

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SentilensAITrainer:
    """
    Advanced machine learning trainer for sentiment analysis models,
    specifically designed for AI chatbot conversations.
    """

    def __init__(self, model_cache_dir: str = "./model_cache"):
        """
        Initialize the SentilensAI trainer

        Args:
            model_cache_dir: Directory to cache trained models
        """
        self.model_cache_dir = Path(model_cache_dir)
        self.model_cache_dir.mkdir(exist_ok=True)

        # Initialize components
        self.analyzer = SentilensAIAnalyzer()
        self.label_encoder = LabelEncoder()
        self.scaler = RobustScaler()
        self.vectorizer = None
        self.models = {}
        self.training_data = None
        self.feature_names = None

        # Initialize available models
        self._initialize_models()

        # Feature engineering parameters
        self.feature_params = {
            'max_features': 10000,
            'ngram_range': (1, 3),
            'min_df': 2,
            'max_df': 0.95,
            'stop_words': 'english'
        }

    def _initialize_models(self):
        """Initialize available machine learning models"""
        self.models = {
            'random_forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1
            ),
            'extra_trees': ExtraTreesClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                random_state=42
            ),
            'svm': SVC(
                kernel='rbf',
                C=1.0,
                gamma='scale',
                random_state=42,
                probability=True
            ),
            'neural_network': MLPClassifier(
                hidden_layer_sizes=(100, 50),
                activation='relu',
                solver='adam',
                alpha=0.001,
                learning_rate='adaptive',
                max_iter=500,
                random_state=42
            ),
            'logistic_regression': LogisticRegression(
                random_state=42,
                max_iter=1000,
                n_jobs=-1
            ),
            'decision_tree': DecisionTreeClassifier(
                max_depth=10,
                random_state=42
            ),
            # Note: MultinomialNB requires non-negative inputs, so it can fail on
            # RobustScaler-scaled features; train_all_models() catches and reports this.
            'naive_bayes': MultinomialNB(alpha=1.0),
            'ada_boost': AdaBoostClassifier(
                n_estimators=50,
                learning_rate=1.0,
                random_state=42
            )
        }

        # Add advanced models if available
        if XGBOOST_AVAILABLE:
            self.models['xgboost'] = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1
            )

        if LIGHTGBM_AVAILABLE:
            self.models['lightgbm'] = lgb.LGBMClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1,
                verbose=-1
            )

        if CATBOOST_AVAILABLE:
            self.models['catboost'] = cb.CatBoostClassifier(
                iterations=100,
                depth=6,
                learning_rate=0.1,
                random_seed=42,
                verbose=False
            )

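    # Sketch (illustrative, not part of the original registry): because
    # self.models is a plain dict, callers can register extra estimators before
    # training. For example, a probability-calibrated linear SVM built from the
    # CalibratedClassifierCV import above:
    #
    #     trainer = SentilensAITrainer()
    #     trainer.models['calibrated_svm'] = CalibratedClassifierCV(
    #         SVC(kernel='linear', C=1.0, random_state=42), cv=3
    #     )
    #     # 'calibrated_svm' is then picked up by train_all_models()/compare_models()
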
    def create_synthetic_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """
        Create synthetic training data for sentiment analysis

        Args:
            num_samples: Number of samples to generate

        Returns:
            DataFrame with text and sentiment labels
        """
        logger.info(f"Creating {num_samples} synthetic training samples...")

        # Define sentiment categories and sample texts
        sentiment_data = {
            'positive': [
                "I love this chatbot! It's amazing and so helpful.",
                "This is exactly what I needed. Thank you so much!",
                "Great service! The bot understood me perfectly.",
                "Excellent! This chatbot is fantastic and very user-friendly.",
                "Perfect! I'm so happy with this experience.",
                "Wonderful! The bot provided exactly the right information.",
                "Outstanding service! I'm impressed with the quality.",
                "Brilliant! This is the best chatbot I've ever used.",
                "Fantastic! The response was quick and accurate.",
                "Superb! I'm delighted with the help I received."
            ],
            'negative': [
                "This chatbot is terrible. It doesn't understand anything.",
                "Worst experience ever. The bot is completely useless.",
                "This is awful. I'm frustrated and disappointed.",
                "Horrible service! The bot keeps giving wrong answers.",
                "Disgusting! This chatbot is a complete waste of time.",
                "Terrible! I hate this bot and its responses.",
                "Awful experience. The bot is stupid and unhelpful.",
                "Disappointing! This chatbot is broken and useless.",
                "Frustrating! The bot doesn't know what it's doing.",
                "Pathetic! This is the worst chatbot I've ever seen."
            ],
            'neutral': [
                "Can you help me with my account information?",
                "I need to check my order status.",
                "What are your business hours?",
                "How do I reset my password?",
                "I want to update my profile details.",
                "Can you provide more information about this product?",
                "I need assistance with my subscription.",
                "What is your return policy?",
                "How can I contact customer support?",
                "I have a question about my recent purchase."
            ]
        }

        # Generate synthetic data
        data = []
        samples_per_sentiment = num_samples // 3

        for sentiment, texts in sentiment_data.items():
            for i in range(samples_per_sentiment):
                # Select base text
                base_text = np.random.choice(texts)

                # Add variations
                variations = [
                    base_text,
                    base_text + " Please help me.",
                    "Hi, " + base_text.lower(),
                    base_text + " Thanks!",
                    "Hello, " + base_text.lower(),
                    base_text + " I appreciate it.",
                    "Hey, " + base_text.lower(),
                    base_text + " Could you assist?",
                    "Good morning, " + base_text.lower(),
                    base_text + " That would be great."
                ]

                text = np.random.choice(variations)
                data.append({
                    'text': text,
                    'sentiment': sentiment,
                    'confidence': np.random.uniform(0.6, 1.0),
                    # Neutral texts get near-zero polarity; positive/negative map to the extremes
                    'polarity': np.random.uniform(-0.3, 0.3) if sentiment == 'neutral' else (1.0 if sentiment == 'positive' else -1.0),
                    'subjectivity': np.random.uniform(0.3, 0.8),
                    'message_type': 'user' if i % 2 == 0 else 'bot',
                    'conversation_id': f'conv_{i//2}',
                    'timestamp': datetime.now()
                })

        # Add some mixed sentiment examples
        mixed_examples = [
            ("I'm not sure if this is good or bad.", "neutral"),
            ("It's okay, I guess.", "neutral"),
            ("This is fine, nothing special.", "neutral"),
            ("I have mixed feelings about this.", "neutral"),
            ("It's decent but could be better.", "neutral")
        ]

        for text, sentiment in mixed_examples:
            data.append({
                'text': text,
                'sentiment': sentiment,
                'confidence': np.random.uniform(0.4, 0.7),
                'polarity': np.random.uniform(-0.3, 0.3),
                'subjectivity': np.random.uniform(0.5, 0.9),
                'message_type': 'user',
                'conversation_id': f'mixed_{len(data)}',
                'timestamp': datetime.now()
            })

        df = pd.DataFrame(data)
        logger.info(f"Created {len(df)} training samples")
        return df

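    # The returned frame has one row per message with the columns text,
    # sentiment ('positive' / 'negative' / 'neutral'), confidence, polarity,
    # subjectivity, message_type ('user' / 'bot'), conversation_id and
    # timestamp. A minimal usage sketch:
    #
    #     df = trainer.create_synthetic_training_data(num_samples=300)
    #     df['sentiment'].value_counts()  # ~100 rows per class, plus 5 mixed extras
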
    def extract_features(self, texts: List[str]) -> np.ndarray:
        """
        Extract comprehensive features from text data

        Args:
            texts: List of text strings

        Returns:
            Feature matrix
        """
        logger.info("Extracting features from text data...")

        # Initialize vectorizer if not already done
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer(
                max_features=self.feature_params['max_features'],
                ngram_range=self.feature_params['ngram_range'],
                min_df=self.feature_params['min_df'],
                max_df=self.feature_params['max_df'],
                stop_words=self.feature_params['stop_words']
            )

        # TF-IDF features: fit the vocabulary on the first (training) call, then
        # reuse it, so prediction-time features stay dimension-compatible
        if not hasattr(self.vectorizer, 'vocabulary_'):
            tfidf_features = self.vectorizer.fit_transform(texts).toarray()
        else:
            tfidf_features = self.vectorizer.transform(texts).toarray()

        # Additional text features
        text_features = []
        for text in texts:
            features = []

            # Basic text statistics
            features.append(len(text))  # Text length
            features.append(len(text.split()))  # Word count
            features.append(len([c for c in text if c.isupper()]))  # Uppercase count
            features.append(len([c for c in text if c.isdigit()]))  # Digit count
            features.append(len([c for c in text if c in '!?']))  # Punctuation count

            # Sentiment features using our analyzer
            try:
                sentiment_result = self.analyzer.analyze_sentiment(text, method='ensemble')
                features.extend([
                    sentiment_result.polarity,
                    sentiment_result.confidence,
                    sentiment_result.subjectivity
                ])

                # Emotion features
                for emotion in ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']:
                    features.append(sentiment_result.emotions.get(emotion, 0.0))
            except Exception:
                features.extend([0.0] * 9)  # Default values if analysis fails

            # Text complexity features
            words = text.split()
            if words:
                avg_word_length = np.mean([len(word) for word in words])
                features.append(avg_word_length)
            else:
                features.append(0.0)

            text_features.append(features)

        text_features = np.array(text_features)

        # Combine all features
        all_features = np.hstack([tfidf_features, text_features])

        logger.info(f"Extracted {all_features.shape[1]} features from {len(texts)} texts")
        return all_features

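    # Feature layout of the returned matrix: the first len(vectorizer.vocabulary_)
    # columns are TF-IDF n-gram weights, followed by 15 handcrafted columns in
    # this order: text_length, word_count, uppercase_count, digit_count,
    # punctuation_count, polarity, confidence, subjectivity, the six emotion
    # scores (joy, sadness, anger, fear, surprise, disgust) and avg_word_length.
    # train_all_models() mirrors this ordering when building self.feature_names.
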
    def train_model(self, model_name: str, X: np.ndarray, y: np.ndarray,
                    optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train a specific model

        Args:
            model_name: Name of the model to train
            X: Feature matrix
            y: Target labels
            optimize_hyperparameters: Whether to optimize hyperparameters

        Returns:
            Training results dictionary
        """
        if model_name not in self.models:
            raise ValueError(f"Unknown model: {model_name}")

        logger.info(f"Training {model_name} model...")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Get base model
        model = self.models[model_name]

        # Optimize hyperparameters if requested
        if optimize_hyperparameters:
            model = self._optimize_hyperparameters(model, model_name, X_train_scaled, y_train)

        # Train model
        start_time = datetime.now()
        model.fit(X_train_scaled, y_train)
        training_time = (datetime.now() - start_time).total_seconds()

        # Make predictions
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled) if hasattr(model, 'predict_proba') else None

        # Evaluate model
        results = self._evaluate_model(y_test, y_pred, y_pred_proba, model.classes_)
        results.update({
            'model_name': model_name,
            'training_time': training_time,
            'model': model,
            'feature_importance': self._get_feature_importance(model, model_name)
        })

        # Store trained model
        self.models[model_name] = model

        logger.info(f"Training completed for {model_name}")
        return results

    def _optimize_hyperparameters(self, model, model_name: str, X: np.ndarray, y: np.ndarray):
        """Optimize hyperparameters using GridSearchCV"""
        param_grids = {
            'random_forest': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'extra_trees': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'gradient_boosting': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 6, 10]
            },
            'svm': {
                'C': [0.1, 1, 10, 100],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                'kernel': ['rbf', 'linear']
            },
            'neural_network': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant', 'adaptive']
            },
            'logistic_regression': {
                'C': [0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            },
            'decision_tree': {
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'naive_bayes': {
                'alpha': [0.1, 0.5, 1.0, 2.0]
            },
            'ada_boost': {
                'n_estimators': [25, 50, 100],
                'learning_rate': [0.5, 1.0, 1.5]
            }
        }

        if XGBOOST_AVAILABLE and model_name == 'xgboost':
            param_grids['xgboost'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if LIGHTGBM_AVAILABLE and model_name == 'lightgbm':
            param_grids['lightgbm'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if CATBOOST_AVAILABLE and model_name == 'catboost':
            param_grids['catboost'] = {
                'iterations': [50, 100, 200],
                'depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if model_name in param_grids:
            logger.info(f"Optimizing hyperparameters for {model_name}...")
            grid_search = GridSearchCV(
                model, param_grids[model_name],
                cv=3, scoring='f1_macro', n_jobs=-1, verbose=0
            )
            grid_search.fit(X, y)
            return grid_search.best_estimator_

        return model

    def _evaluate_model(self, y_true, y_pred, y_pred_proba, classes) -> Dict[str, Any]:
        """Comprehensive model evaluation"""
        results = {
            'accuracy': accuracy_score(y_true, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
            'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
            'precision_micro': precision_score(y_true, y_pred, average='micro', zero_division=0),
            'precision_weighted': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
            'recall_micro': recall_score(y_true, y_pred, average='micro', zero_division=0),
            'recall_weighted': recall_score(y_true, y_pred, average='weighted', zero_division=0),
            'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
            'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0),
            'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
            'matthews_corrcoef': matthews_corrcoef(y_true, y_pred),
            'cohen_kappa': cohen_kappa_score(y_true, y_pred),
            'classification_report': classification_report(y_true, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_true, y_pred).tolist()
        }

        # Add ROC AUC if probabilities are available
        if y_pred_proba is not None and len(classes) > 2:
            try:
                results['roc_auc'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro')
            except Exception:
                results['roc_auc'] = 0.0
        elif y_pred_proba is not None and len(classes) == 2:
            try:
                results['roc_auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
            except Exception:
                results['roc_auc'] = 0.0
        else:
            results['roc_auc'] = 0.0

        return results

    def _get_feature_importance(self, model, model_name: str) -> Optional[Dict[str, float]]:
        """Get feature importance if available"""
        try:
            if hasattr(model, 'feature_importances_'):
                importance = model.feature_importances_
                if self.feature_names is not None:
                    return dict(zip(self.feature_names, importance))
                else:
                    return {f'feature_{i}': imp for i, imp in enumerate(importance)}
            elif hasattr(model, 'coef_'):
                # For linear models, use absolute coefficients
                coef = np.abs(model.coef_[0]) if len(model.coef_.shape) > 1 else np.abs(model.coef_)
                if self.feature_names is not None:
                    return dict(zip(self.feature_names, coef))
                else:
                    return {f'feature_{i}': imp for i, imp in enumerate(coef)}
        except Exception:
            pass
        return None

    def compare_models(self, X: np.ndarray, y: np.ndarray,
                       models_to_compare: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Compare multiple models using cross-validation

        Args:
            X: Feature matrix
            y: Target labels
            models_to_compare: List of model names to compare (None for all)

        Returns:
            Comparison results
        """
        if models_to_compare is None:
            models_to_compare = list(self.models.keys())

        logger.info(f"Comparing {len(models_to_compare)} models...")

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        results = {}
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        for model_name in models_to_compare:
            if model_name not in self.models:
                continue

            logger.info(f"Evaluating {model_name}...")
            model = self.models[model_name]

            # Cross-validation scores
            cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='f1_macro')

            # Fit on the full data and score in-sample; these numbers are
            # optimistic, so prefer cv_mean/cv_std when ranking models
            model.fit(X_scaled, y)
            y_pred = model.predict(X_scaled)

            results[model_name] = {
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'cv_scores': cv_scores.tolist(),
                'accuracy': accuracy_score(y, y_pred),
                'f1_macro': f1_score(y, y_pred, average='macro', zero_division=0),
                'training_time': 0  # Could be measured if needed
            }

        # Sort by F1 score
        sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['f1_macro'], reverse=True))

        logger.info("Model comparison completed")
        return sorted_results

    def train_all_models(self, data: pd.DataFrame, optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train all available models

        Args:
            data: Training data DataFrame
            optimize_hyperparameters: Whether to optimize hyperparameters

        Returns:
            Training results for all models
        """
        logger.info("Training all available models...")

        # Prepare data
        texts = data['text'].tolist()
        labels = data['sentiment'].tolist()

        # Extract features
        X = self.extract_features(texts)
        y = self.label_encoder.fit_transform(labels)

        # Store feature names for importance analysis
        if self.vectorizer is not None:
            tfidf_features = self.vectorizer.get_feature_names_out()
            additional_features = [
                'text_length', 'word_count', 'uppercase_count', 'digit_count',
                'punctuation_count', 'polarity', 'confidence', 'subjectivity',
                'joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'avg_word_length'
            ]
            self.feature_names = list(tfidf_features) + additional_features

        # Train all models
        all_results = {}
        for model_name in self.models.keys():
            try:
                results = self.train_model(model_name, X, y, optimize_hyperparameters)
                all_results[model_name] = results
                logger.info(f"✅ {model_name}: F1={results['f1_macro']:.3f}, Accuracy={results['accuracy']:.3f}")
            except Exception as e:
                logger.error(f"❌ Failed to train {model_name}: {e}")
                all_results[model_name] = {'error': str(e)}

        # Store training data
        self.training_data = data

        logger.info("All models training completed")
        return all_results

    def predict_sentiment(self, text: str, model_name: str = 'random_forest') -> Dict[str, Any]:
        """
        Predict sentiment for a single text using a trained model

        Args:
            text: Text to analyze
            model_name: Name of the model to use

        Returns:
            Prediction results
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found. Available models: {list(self.models.keys())}")

        if self.vectorizer is None:
            raise ValueError("No trained model found. Please train a model first.")

        # Extract features
        X = self.extract_features([text])
        X_scaled = self.scaler.transform(X)

        # Make prediction
        model = self.models[model_name]
        prediction = model.predict(X_scaled)[0]
        probabilities = model.predict_proba(X_scaled)[0] if hasattr(model, 'predict_proba') else None

        # Decode prediction
        sentiment = self.label_encoder.inverse_transform([prediction])[0]

        result = {
            'text': text,
            'sentiment': sentiment,
            'confidence': float(probabilities[prediction]) if probabilities is not None else 0.0,
            'probabilities': {
                label: float(prob) for label, prob in zip(self.label_encoder.classes_, probabilities)
            } if probabilities is not None else None,
            'model_used': model_name
        }

        return result

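    # Usage sketch (assumes a model was trained in this session or loaded
    # via load_model()):
    #
    #     result = trainer.predict_sentiment("The bot fixed my issue fast!",
    #                                        model_name='random_forest')
    #     print(result['sentiment'], result['confidence'])
    #     # result['probabilities'] maps each label to its predicted probability
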
    def save_model(self, model_name: str, filepath: str):
        """Save trained model to file"""
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found")

        model_data = {
            'model': self.models[model_name],
            'label_encoder': self.label_encoder,
            'scaler': self.scaler,
            'vectorizer': self.vectorizer,
            'feature_names': self.feature_names,
            'feature_params': self.feature_params,
            'training_data_info': {
                'num_samples': len(self.training_data),
                'features': len(self.feature_names) if self.feature_names is not None else 0
            } if self.training_data is not None else None
        }

        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)

        logger.info(f"Model {model_name} saved to {filepath}")

    def load_model(self, filepath: str):
        """Load trained model from file"""
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.models['loaded'] = model_data['model']
        self.label_encoder = model_data['label_encoder']
        self.scaler = model_data['scaler']
        self.vectorizer = model_data['vectorizer']
        self.feature_names = model_data['feature_names']
        self.feature_params = model_data['feature_params']

        logger.info(f"Model loaded from {filepath}")

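    # Persistence round trip (sketch): save_model() pickles the estimator
    # together with the fitted label encoder, scaler and vectorizer, so a fresh
    # trainer can predict without retraining; load_model() registers the
    # estimator under the key 'loaded':
    #
    #     trainer.save_model('random_forest', 'rf_sentiment.pkl')
    #     fresh = SentilensAITrainer()
    #     fresh.load_model('rf_sentiment.pkl')
    #     fresh.predict_sentiment("Thanks, that helped!", model_name='loaded')
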
    def get_training_summary(self) -> Dict[str, Any]:
        """Get summary of training configuration and available models"""
        return {
            'available_models': list(self.models.keys()),
            'xgboost_available': XGBOOST_AVAILABLE,
            'lightgbm_available': LIGHTGBM_AVAILABLE,
            'catboost_available': CATBOOST_AVAILABLE,
            'plotting_available': PLOTTING_AVAILABLE,
            'feature_params': self.feature_params,
            'training_data_samples': len(self.training_data) if self.training_data is not None else 0,
            'model_cache_dir': str(self.model_cache_dir)
        }


def main():
    """Demo function to showcase SentilensAI ML training capabilities"""
    print("🤖 SentilensAI - Machine Learning Training Pipeline")
    print("=" * 60)

    # Initialize trainer
    trainer = SentilensAITrainer()

    # Get training summary
    summary = trainer.get_training_summary()
    print(f"\n📊 Training Configuration:")
    print(f"Available Models: {len(summary['available_models'])}")
    print(f"XGBoost Available: {summary['xgboost_available']}")
    print(f"LightGBM Available: {summary['lightgbm_available']}")
    print(f"CatBoost Available: {summary['catboost_available']}")
    print(f"Plotting Available: {summary['plotting_available']}")

    # Create synthetic training data
    print(f"\n🔄 Creating synthetic training data...")
    training_data = trainer.create_synthetic_training_data(num_samples=500)
    print(f"Created {len(training_data)} training samples")
    print(f"Sentiment distribution: {training_data['sentiment'].value_counts().to_dict()}")

    # Train all models
    print(f"\n🚀 Training all models...")
    results = trainer.train_all_models(training_data, optimize_hyperparameters=True)

    # Display results
    print(f"\n📈 Training Results:")
    print("-" * 60)
    for model_name, result in results.items():
        if 'error' not in result:
            print(f"{model_name:20} | F1: {result['f1_macro']:.3f} | Accuracy: {result['accuracy']:.3f} | Time: {result['training_time']:.1f}s")
        else:
            print(f"{model_name:20} | Error: {result['error']}")

    # Test prediction
    print(f"\n🔮 Testing predictions...")
    test_texts = [
        "I love this chatbot! It's amazing!",
        "This is terrible. I hate it.",
        "Can you help me with my account?"
    ]

    for text in test_texts:
        try:
            prediction = trainer.predict_sentiment(text, 'random_forest')
            print(f"Text: '{text}'")
            print(f"Prediction: {prediction['sentiment']} (confidence: {prediction['confidence']:.3f})")
        except Exception as e:
            print(f"Prediction failed: {e}")
        print()

    # Save best model
    best_model = max(results.keys(), key=lambda k: results[k].get('f1_macro', 0) if 'error' not in results[k] else 0)
    if 'error' not in results[best_model]:
        model_path = f"sentilens_ai_{best_model}_model.pkl"
        trainer.save_model(best_model, model_path)
        print(f"💾 Best model ({best_model}) saved to {model_path}")

    print("\n✅ SentilensAI ML training demo completed!")
    print("🚀 Ready for production sentiment analysis!")


if __name__ == "__main__":
    main()