pravinai committed on
Commit
b9ab02b
·
verified ·
1 Parent(s): 3236276

Add ml_training_pipeline.py

Files changed (1)
ml_training_pipeline.py +844 -0
ml_training_pipeline.py ADDED
@@ -0,0 +1,844 @@
"""
SentilensAI - Machine Learning Training Pipeline

This module provides comprehensive machine learning capabilities for training
custom sentiment analysis models specifically optimized for AI chatbot conversations.

Features:
- Multiple ML algorithms (Random Forest, SVM, Neural Networks, XGBoost, etc.)
- Advanced feature engineering for chatbot text
- Cross-validation and hyperparameter tuning
- Model comparison and evaluation
- Production-ready model persistence
- Real-time prediction capabilities

Author: Pravin Selvamuthu
Repository: https://github.com/kernelseed/sentilens-ai
"""

import os
import json
import pickle
import logging
from typing import Dict, List, Tuple, Optional, Any, Union
from datetime import datetime
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
import joblib

# Advanced ML libraries
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False

try:
    import catboost as cb
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False

# Visualization
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    PLOTTING_AVAILABLE = True
except ImportError:
    PLOTTING_AVAILABLE = False

# LangChain integration
from langchain.schema import BaseMessage
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI

# Import our sentiment analyzer
from sentiment_analyzer import SentilensAIAnalyzer, SentimentResult

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SentilensAITrainer:
    """
    Advanced machine learning trainer for sentiment analysis models,
    specifically designed for AI chatbot conversations.
    """

    def __init__(self, model_cache_dir: str = "./model_cache"):
        """
        Initialize the SentilensAI trainer

        Args:
            model_cache_dir: Directory to cache trained models
        """
        self.model_cache_dir = Path(model_cache_dir)
        self.model_cache_dir.mkdir(exist_ok=True)

        # Initialize components
        self.analyzer = SentilensAIAnalyzer()
        self.label_encoder = LabelEncoder()
        self.scaler = RobustScaler()
        self.vectorizer = None
        self.models = {}
        self.training_data = None
        self.feature_names = None

        # Initialize available models
        self._initialize_models()

        # Feature engineering parameters
        self.feature_params = {
            'max_features': 10000,
            'ngram_range': (1, 3),
            'min_df': 2,
            'max_df': 0.95,
            'stop_words': 'english'
        }

    def _initialize_models(self):
        """Initialize available machine learning models"""
        self.models = {
            'random_forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1
            ),
            'extra_trees': ExtraTreesClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                random_state=42
            ),
            'svm': SVC(
                kernel='rbf',
                C=1.0,
                gamma='scale',
                random_state=42,
                probability=True
            ),
            'neural_network': MLPClassifier(
                hidden_layer_sizes=(100, 50),
                activation='relu',
                solver='adam',
                alpha=0.001,
                learning_rate='adaptive',
                max_iter=500,
                random_state=42
            ),
            'logistic_regression': LogisticRegression(
                random_state=42,
                max_iter=1000,
                n_jobs=-1
            ),
            'decision_tree': DecisionTreeClassifier(
                max_depth=10,
                random_state=42
            ),
            # Note: MultinomialNB requires non-negative inputs, so it can fail on
            # RobustScaler-scaled features; train_all_models() catches and reports this.
            'naive_bayes': MultinomialNB(alpha=1.0),
            'ada_boost': AdaBoostClassifier(
                n_estimators=50,
                learning_rate=1.0,
                random_state=42
            )
        }

        # Add advanced models if available
        if XGBOOST_AVAILABLE:
            self.models['xgboost'] = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1
            )

        if LIGHTGBM_AVAILABLE:
            self.models['lightgbm'] = lgb.LGBMClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1,
                verbose=-1
            )

        if CATBOOST_AVAILABLE:
            self.models['catboost'] = cb.CatBoostClassifier(
                iterations=100,
                depth=6,
                learning_rate=0.1,
                random_seed=42,
                verbose=False
            )

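    # Sketch (illustrative, not part of the original registry): because
    # self.models is a plain dict, callers can register extra estimators before
    # training. For example, a probability-calibrated linear SVM built from the
    # CalibratedClassifierCV import above:
    #
    #     trainer = SentilensAITrainer()
    #     trainer.models['calibrated_svm'] = CalibratedClassifierCV(
    #         SVC(kernel='linear', C=1.0, random_state=42), cv=3
    #     )
    #     # 'calibrated_svm' is then picked up by train_all_models()/compare_models()
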
    def create_synthetic_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """
        Create synthetic training data for sentiment analysis

        Args:
            num_samples: Number of samples to generate

        Returns:
            DataFrame with text and sentiment labels
        """
        logger.info(f"Creating {num_samples} synthetic training samples...")

        # Define sentiment categories and sample texts
        sentiment_data = {
            'positive': [
                "I love this chatbot! It's amazing and so helpful.",
                "This is exactly what I needed. Thank you so much!",
                "Great service! The bot understood me perfectly.",
                "Excellent! This chatbot is fantastic and very user-friendly.",
                "Perfect! I'm so happy with this experience.",
                "Wonderful! The bot provided exactly the right information.",
                "Outstanding service! I'm impressed with the quality.",
                "Brilliant! This is the best chatbot I've ever used.",
                "Fantastic! The response was quick and accurate.",
                "Superb! I'm delighted with the help I received."
            ],
            'negative': [
                "This chatbot is terrible. It doesn't understand anything.",
                "Worst experience ever. The bot is completely useless.",
                "This is awful. I'm frustrated and disappointed.",
                "Horrible service! The bot keeps giving wrong answers.",
                "Disgusting! This chatbot is a complete waste of time.",
                "Terrible! I hate this bot and its responses.",
                "Awful experience. The bot is stupid and unhelpful.",
                "Disappointing! This chatbot is broken and useless.",
                "Frustrating! The bot doesn't know what it's doing.",
                "Pathetic! This is the worst chatbot I've ever seen."
            ],
            'neutral': [
                "Can you help me with my account information?",
                "I need to check my order status.",
                "What are your business hours?",
                "How do I reset my password?",
                "I want to update my profile details.",
                "Can you provide more information about this product?",
                "I need assistance with my subscription.",
                "What is your return policy?",
                "How can I contact customer support?",
                "I have a question about my recent purchase."
            ]
        }

        # Generate synthetic data
        data = []
        samples_per_sentiment = num_samples // 3

        for sentiment, texts in sentiment_data.items():
            for i in range(samples_per_sentiment):
                # Select base text
                base_text = np.random.choice(texts)

                # Add variations
                variations = [
                    base_text,
                    base_text + " Please help me.",
                    "Hi, " + base_text.lower(),
                    base_text + " Thanks!",
                    "Hello, " + base_text.lower(),
                    base_text + " I appreciate it.",
                    "Hey, " + base_text.lower(),
                    base_text + " Could you assist?",
                    "Good morning, " + base_text.lower(),
                    base_text + " That would be great."
                ]

                text = np.random.choice(variations)
                data.append({
                    'text': text,
                    'sentiment': sentiment,
                    'confidence': np.random.uniform(0.6, 1.0),
                    # Neutral texts get near-zero polarity; positive/negative map to the extremes
                    'polarity': np.random.uniform(-0.3, 0.3) if sentiment == 'neutral' else (1.0 if sentiment == 'positive' else -1.0),
                    'subjectivity': np.random.uniform(0.3, 0.8),
                    'message_type': 'user' if i % 2 == 0 else 'bot',
                    'conversation_id': f'conv_{i//2}',
                    'timestamp': datetime.now()
                })

        # Add some mixed sentiment examples
        mixed_examples = [
            ("I'm not sure if this is good or bad.", "neutral"),
            ("It's okay, I guess.", "neutral"),
            ("This is fine, nothing special.", "neutral"),
            ("I have mixed feelings about this.", "neutral"),
            ("It's decent but could be better.", "neutral")
        ]

        for text, sentiment in mixed_examples:
            data.append({
                'text': text,
                'sentiment': sentiment,
                'confidence': np.random.uniform(0.4, 0.7),
                'polarity': np.random.uniform(-0.3, 0.3),
                'subjectivity': np.random.uniform(0.5, 0.9),
                'message_type': 'user',
                'conversation_id': f'mixed_{len(data)}',
                'timestamp': datetime.now()
            })

        df = pd.DataFrame(data)
        logger.info(f"Created {len(df)} training samples")
        return df

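    # The returned frame has one row per message with the columns text,
    # sentiment ('positive' / 'negative' / 'neutral'), confidence, polarity,
    # subjectivity, message_type ('user' / 'bot'), conversation_id and
    # timestamp. A minimal usage sketch:
    #
    #     df = trainer.create_synthetic_training_data(num_samples=300)
    #     df['sentiment'].value_counts()  # ~100 rows per class, plus 5 mixed extras
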
    def extract_features(self, texts: List[str]) -> np.ndarray:
        """
        Extract comprehensive features from text data

        Args:
            texts: List of text strings

        Returns:
            Feature matrix
        """
        logger.info("Extracting features from text data...")

        # Initialize vectorizer if not already done
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer(
                max_features=self.feature_params['max_features'],
                ngram_range=self.feature_params['ngram_range'],
                min_df=self.feature_params['min_df'],
                max_df=self.feature_params['max_df'],
                stop_words=self.feature_params['stop_words']
            )

        # TF-IDF features: fit the vocabulary on the first (training) call, then
        # reuse it, so prediction-time features stay dimension-compatible
        if not hasattr(self.vectorizer, 'vocabulary_'):
            tfidf_features = self.vectorizer.fit_transform(texts).toarray()
        else:
            tfidf_features = self.vectorizer.transform(texts).toarray()

        # Additional text features
        text_features = []
        for text in texts:
            features = []

            # Basic text statistics
            features.append(len(text))  # Text length
            features.append(len(text.split()))  # Word count
            features.append(len([c for c in text if c.isupper()]))  # Uppercase count
            features.append(len([c for c in text if c.isdigit()]))  # Digit count
            features.append(len([c for c in text if c in '!?']))  # Punctuation count

            # Sentiment features using our analyzer
            try:
                sentiment_result = self.analyzer.analyze_sentiment(text, method='ensemble')
                features.extend([
                    sentiment_result.polarity,
                    sentiment_result.confidence,
                    sentiment_result.subjectivity
                ])

                # Emotion features
                for emotion in ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']:
                    features.append(sentiment_result.emotions.get(emotion, 0.0))
            except Exception:
                features.extend([0.0] * 9)  # Default values if analysis fails

            # Text complexity features
            words = text.split()
            if words:
                avg_word_length = np.mean([len(word) for word in words])
                features.append(avg_word_length)
            else:
                features.append(0.0)

            text_features.append(features)

        text_features = np.array(text_features)

        # Combine all features
        all_features = np.hstack([tfidf_features, text_features])

        logger.info(f"Extracted {all_features.shape[1]} features from {len(texts)} texts")
        return all_features

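    # Feature layout of the returned matrix: the first len(vectorizer.vocabulary_)
    # columns are TF-IDF n-gram weights, followed by 15 handcrafted columns in
    # this order: text_length, word_count, uppercase_count, digit_count,
    # punctuation_count, polarity, confidence, subjectivity, the six emotion
    # scores (joy, sadness, anger, fear, surprise, disgust) and avg_word_length.
    # train_all_models() mirrors this ordering when building self.feature_names.
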
    def train_model(self, model_name: str, X: np.ndarray, y: np.ndarray,
                    optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train a specific model

        Args:
            model_name: Name of the model to train
            X: Feature matrix
            y: Target labels
            optimize_hyperparameters: Whether to optimize hyperparameters

        Returns:
            Training results dictionary
        """
        if model_name not in self.models:
            raise ValueError(f"Unknown model: {model_name}")

        logger.info(f"Training {model_name} model...")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Get base model
        model = self.models[model_name]

        # Optimize hyperparameters if requested
        if optimize_hyperparameters:
            model = self._optimize_hyperparameters(model, model_name, X_train_scaled, y_train)

        # Train model
        start_time = datetime.now()
        model.fit(X_train_scaled, y_train)
        training_time = (datetime.now() - start_time).total_seconds()

        # Make predictions
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled) if hasattr(model, 'predict_proba') else None

        # Evaluate model
        results = self._evaluate_model(y_test, y_pred, y_pred_proba, model.classes_)
        results.update({
            'model_name': model_name,
            'training_time': training_time,
            'model': model,
            'feature_importance': self._get_feature_importance(model, model_name)
        })

        # Store trained model
        self.models[model_name] = model

        logger.info(f"Training completed for {model_name}")
        return results

    def _optimize_hyperparameters(self, model, model_name: str, X: np.ndarray, y: np.ndarray):
        """Optimize hyperparameters using GridSearchCV"""
        param_grids = {
            'random_forest': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'extra_trees': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'gradient_boosting': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 6, 10]
            },
            'svm': {
                'C': [0.1, 1, 10, 100],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                'kernel': ['rbf', 'linear']
            },
            'neural_network': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant', 'adaptive']
            },
            'logistic_regression': {
                'C': [0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            },
            'decision_tree': {
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'naive_bayes': {
                'alpha': [0.1, 0.5, 1.0, 2.0]
            },
            'ada_boost': {
                'n_estimators': [25, 50, 100],
                'learning_rate': [0.5, 1.0, 1.5]
            }
        }

        if XGBOOST_AVAILABLE and model_name == 'xgboost':
            param_grids['xgboost'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if LIGHTGBM_AVAILABLE and model_name == 'lightgbm':
            param_grids['lightgbm'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if CATBOOST_AVAILABLE and model_name == 'catboost':
            param_grids['catboost'] = {
                'iterations': [50, 100, 200],
                'depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if model_name in param_grids:
            logger.info(f"Optimizing hyperparameters for {model_name}...")
            grid_search = GridSearchCV(
                model, param_grids[model_name],
                cv=3, scoring='f1_macro', n_jobs=-1, verbose=0
            )
            grid_search.fit(X, y)
            return grid_search.best_estimator_

        return model

    def _evaluate_model(self, y_true, y_pred, y_pred_proba, classes) -> Dict[str, Any]:
        """Comprehensive model evaluation"""
        results = {
            'accuracy': accuracy_score(y_true, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
            'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
            'precision_micro': precision_score(y_true, y_pred, average='micro', zero_division=0),
            'precision_weighted': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
            'recall_micro': recall_score(y_true, y_pred, average='micro', zero_division=0),
            'recall_weighted': recall_score(y_true, y_pred, average='weighted', zero_division=0),
            'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
            'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0),
            'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
            'matthews_corrcoef': matthews_corrcoef(y_true, y_pred),
            'cohen_kappa': cohen_kappa_score(y_true, y_pred),
            'classification_report': classification_report(y_true, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_true, y_pred).tolist()
        }

        # Add ROC AUC if probabilities are available
        if y_pred_proba is not None and len(classes) > 2:
            try:
                results['roc_auc'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro')
            except Exception:
                results['roc_auc'] = 0.0
        elif y_pred_proba is not None and len(classes) == 2:
            try:
                results['roc_auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
            except Exception:
                results['roc_auc'] = 0.0
        else:
            results['roc_auc'] = 0.0

        return results

    def _get_feature_importance(self, model, model_name: str) -> Optional[Dict[str, float]]:
        """Get feature importance if available"""
        try:
            if hasattr(model, 'feature_importances_'):
                importance = model.feature_importances_
                if self.feature_names is not None:
                    return dict(zip(self.feature_names, importance))
                else:
                    return {f'feature_{i}': imp for i, imp in enumerate(importance)}
            elif hasattr(model, 'coef_'):
                # For linear models, use absolute coefficients
                coef = np.abs(model.coef_[0]) if len(model.coef_.shape) > 1 else np.abs(model.coef_)
                if self.feature_names is not None:
                    return dict(zip(self.feature_names, coef))
                else:
                    return {f'feature_{i}': imp for i, imp in enumerate(coef)}
        except Exception:
            pass
        return None

    def compare_models(self, X: np.ndarray, y: np.ndarray,
                       models_to_compare: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Compare multiple models using cross-validation

        Args:
            X: Feature matrix
            y: Target labels
            models_to_compare: List of model names to compare (None for all)

        Returns:
            Comparison results
        """
        if models_to_compare is None:
            models_to_compare = list(self.models.keys())

        logger.info(f"Comparing {len(models_to_compare)} models...")

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        results = {}
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        for model_name in models_to_compare:
            if model_name not in self.models:
                continue

            logger.info(f"Evaluating {model_name}...")
            model = self.models[model_name]

            # Cross-validation scores
            cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='f1_macro')

            # Fit on the full data and score in-sample; these numbers are
            # optimistic, so prefer cv_mean/cv_std when ranking models
            model.fit(X_scaled, y)
            y_pred = model.predict(X_scaled)

            results[model_name] = {
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'cv_scores': cv_scores.tolist(),
                'accuracy': accuracy_score(y, y_pred),
                'f1_macro': f1_score(y, y_pred, average='macro', zero_division=0),
                'training_time': 0  # Could be measured if needed
            }

        # Sort by F1 score
        sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['f1_macro'], reverse=True))

        logger.info("Model comparison completed")
        return sorted_results

    def train_all_models(self, data: pd.DataFrame, optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train all available models

        Args:
            data: Training data DataFrame
            optimize_hyperparameters: Whether to optimize hyperparameters

        Returns:
            Training results for all models
        """
        logger.info("Training all available models...")

        # Prepare data
        texts = data['text'].tolist()
        labels = data['sentiment'].tolist()

        # Extract features
        X = self.extract_features(texts)
        y = self.label_encoder.fit_transform(labels)

        # Store feature names for importance analysis
        if self.vectorizer is not None:
            tfidf_features = self.vectorizer.get_feature_names_out()
            additional_features = [
                'text_length', 'word_count', 'uppercase_count', 'digit_count',
                'punctuation_count', 'polarity', 'confidence', 'subjectivity',
                'joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'avg_word_length'
            ]
            self.feature_names = list(tfidf_features) + additional_features

        # Train all models
        all_results = {}
        for model_name in self.models.keys():
            try:
                results = self.train_model(model_name, X, y, optimize_hyperparameters)
                all_results[model_name] = results
                logger.info(f"✅ {model_name}: F1={results['f1_macro']:.3f}, Accuracy={results['accuracy']:.3f}")
            except Exception as e:
                logger.error(f"❌ Failed to train {model_name}: {e}")
                all_results[model_name] = {'error': str(e)}

        # Store training data
        self.training_data = data

        logger.info("All models training completed")
        return all_results

    def predict_sentiment(self, text: str, model_name: str = 'random_forest') -> Dict[str, Any]:
        """
        Predict sentiment for a single text using a trained model

        Args:
            text: Text to analyze
            model_name: Name of the model to use

        Returns:
            Prediction results
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found. Available models: {list(self.models.keys())}")

        if self.vectorizer is None:
            raise ValueError("No trained model found. Please train a model first.")

        # Extract features
        X = self.extract_features([text])
        X_scaled = self.scaler.transform(X)

        # Make prediction
        model = self.models[model_name]
        prediction = model.predict(X_scaled)[0]
        probabilities = model.predict_proba(X_scaled)[0] if hasattr(model, 'predict_proba') else None

        # Decode prediction
        sentiment = self.label_encoder.inverse_transform([prediction])[0]

        result = {
            'text': text,
            'sentiment': sentiment,
            'confidence': float(probabilities[prediction]) if probabilities is not None else 0.0,
            'probabilities': {
                label: float(prob) for label, prob in zip(self.label_encoder.classes_, probabilities)
            } if probabilities is not None else None,
            'model_used': model_name
        }

        return result

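    # Usage sketch (assumes a model was trained in this session or loaded
    # via load_model()):
    #
    #     result = trainer.predict_sentiment("The bot fixed my issue fast!",
    #                                        model_name='random_forest')
    #     print(result['sentiment'], result['confidence'])
    #     # result['probabilities'] maps each label to its predicted probability
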
    def save_model(self, model_name: str, filepath: str):
        """Save trained model to file"""
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found")

        model_data = {
            'model': self.models[model_name],
            'label_encoder': self.label_encoder,
            'scaler': self.scaler,
            'vectorizer': self.vectorizer,
            'feature_names': self.feature_names,
            'feature_params': self.feature_params,
            'training_data_info': {
                'num_samples': len(self.training_data),
                'features': len(self.feature_names) if self.feature_names is not None else 0
            } if self.training_data is not None else None
        }

        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)

        logger.info(f"Model {model_name} saved to {filepath}")

    def load_model(self, filepath: str):
        """Load trained model from file"""
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.models['loaded'] = model_data['model']
        self.label_encoder = model_data['label_encoder']
        self.scaler = model_data['scaler']
        self.vectorizer = model_data['vectorizer']
        self.feature_names = model_data['feature_names']
        self.feature_params = model_data['feature_params']

        logger.info(f"Model loaded from {filepath}")

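    # Persistence round trip (sketch): save_model() pickles the estimator
    # together with the fitted label encoder, scaler and vectorizer, so a fresh
    # trainer can predict without retraining; load_model() registers the
    # estimator under the key 'loaded':
    #
    #     trainer.save_model('random_forest', 'rf_sentiment.pkl')
    #     fresh = SentilensAITrainer()
    #     fresh.load_model('rf_sentiment.pkl')
    #     fresh.predict_sentiment("Thanks, that helped!", model_name='loaded')
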
    def get_training_summary(self) -> Dict[str, Any]:
        """Get summary of training configuration and available models"""
        return {
            'available_models': list(self.models.keys()),
            'xgboost_available': XGBOOST_AVAILABLE,
            'lightgbm_available': LIGHTGBM_AVAILABLE,
            'catboost_available': CATBOOST_AVAILABLE,
            'plotting_available': PLOTTING_AVAILABLE,
            'feature_params': self.feature_params,
            'training_data_samples': len(self.training_data) if self.training_data is not None else 0,
            'model_cache_dir': str(self.model_cache_dir)
        }


def main():
    """Demo function to showcase SentilensAI ML training capabilities"""
    print("🤖 SentilensAI - Machine Learning Training Pipeline")
    print("=" * 60)

    # Initialize trainer
    trainer = SentilensAITrainer()

    # Get training summary
    summary = trainer.get_training_summary()
    print(f"\n📊 Training Configuration:")
    print(f"Available Models: {len(summary['available_models'])}")
    print(f"XGBoost Available: {summary['xgboost_available']}")
    print(f"LightGBM Available: {summary['lightgbm_available']}")
    print(f"CatBoost Available: {summary['catboost_available']}")
    print(f"Plotting Available: {summary['plotting_available']}")

    # Create synthetic training data
    print(f"\n🔄 Creating synthetic training data...")
    training_data = trainer.create_synthetic_training_data(num_samples=500)
    print(f"Created {len(training_data)} training samples")
    print(f"Sentiment distribution: {training_data['sentiment'].value_counts().to_dict()}")

    # Train all models
    print(f"\n🚀 Training all models...")
    results = trainer.train_all_models(training_data, optimize_hyperparameters=True)

    # Display results
    print(f"\n📈 Training Results:")
    print("-" * 60)
    for model_name, result in results.items():
        if 'error' not in result:
            print(f"{model_name:20} | F1: {result['f1_macro']:.3f} | Accuracy: {result['accuracy']:.3f} | Time: {result['training_time']:.1f}s")
        else:
            print(f"{model_name:20} | Error: {result['error']}")

    # Test prediction
    print(f"\n🔮 Testing predictions...")
    test_texts = [
        "I love this chatbot! It's amazing!",
        "This is terrible. I hate it.",
        "Can you help me with my account?"
    ]

    for text in test_texts:
        try:
            prediction = trainer.predict_sentiment(text, 'random_forest')
            print(f"Text: '{text}'")
            print(f"Prediction: {prediction['sentiment']} (confidence: {prediction['confidence']:.3f})")
        except Exception as e:
            print(f"Prediction failed: {e}")
        print()

    # Save best model
    best_model = max(results.keys(), key=lambda k: results[k].get('f1_macro', 0) if 'error' not in results[k] else 0)
    if 'error' not in results[best_model]:
        model_path = f"sentilens_ai_{best_model}_model.pkl"
        trainer.save_model(best_model, model_path)
        print(f"💾 Best model ({best_model}) saved to {model_path}")

    print("\n✅ SentilensAI ML training demo completed!")
    print("🚀 Ready for production sentiment analysis!")


if __name__ == "__main__":
    main()