Ahmedik95316 committed on
Commit
34841ba
·
1 Parent(s): 719d51e

Update model/train.py

Browse files
Files changed (1) hide show
  1. model/train.py +142 -52
model/train.py CHANGED
@@ -196,21 +196,26 @@ class RobustModelTrainer:
196
  self.progress_tracker = None
197
 
198
  def setup_paths(self):
199
- """Setup all necessary paths"""
200
  self.base_dir = Path("/tmp")
201
  self.data_dir = self.base_dir / "data"
202
  self.model_dir = self.base_dir / "model"
203
  self.results_dir = self.base_dir / "results"
204
 
205
- # Create directories
206
  for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
207
  dir_path.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
208
 
209
  # File paths
210
  self.data_path = self.data_dir / "combined_dataset.csv"
211
- self.model_path = self.model_dir / "model.pkl"
212
- self.vectorizer_path = self.model_dir / "vectorizer.pkl"
213
- self.pipeline_path = self.model_dir / "pipeline.pkl"
214
  self.metadata_path = Path("/tmp/metadata.json")
215
  self.evaluation_path = self.results_dir / "evaluation_results.json"
216
 
@@ -295,9 +300,13 @@ class RobustModelTrainer:
295
  if len(unique_labels) < 2:
296
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
297
 
298
- # Check minimum sample size
299
- if len(df) < 10:
300
- return False, None, f"Insufficient samples for training: {len(df)}"
 
 
 
 
301
 
302
  # Check class balance
303
  label_counts = df['label'].value_counts()
@@ -379,29 +388,39 @@ class RobustModelTrainer:
379
  cm = confusion_matrix(y_test, y_pred)
380
  metrics['confusion_matrix'] = cm.tolist()
381
 
382
- # Cross-validation scores if training data provided
383
- if X_train is not None and y_train is not None and len(X_train) >= 50:
384
  try:
385
- cv_scores = cross_val_score(
386
- model, X_train, y_train,
387
- cv=StratifiedKFold(
388
- n_splits=min(self.cv_folds, len(X_train) // 10),
389
- shuffle=True,
390
- random_state=self.random_state
391
- ),
392
- scoring='f1_weighted',
393
- n_jobs=-1 # Parallel CV
394
- )
395
- metrics['cv_scores'] = {
396
- 'mean': float(cv_scores.mean()),
397
- 'std': float(cv_scores.std()),
398
- 'scores': cv_scores.tolist()
399
- }
 
 
 
 
 
 
 
 
 
 
400
  except Exception as e:
401
  logger.warning(f"Cross-validation failed: {e}")
402
- metrics['cv_scores'] = None
403
  else:
404
- metrics['cv_scores'] = {'note': 'Skipped for small dataset'}
405
 
406
  # Training accuracy for overfitting detection
407
  try:
@@ -426,11 +445,36 @@ class RobustModelTrainer:
426
  # Set the model in the pipeline
427
  pipeline.set_params(model=self.models[model_name]['model'])
428
 
 
 
 
 
 
 
 
 
 
 
 
429
  # Get parameter grid
430
  param_grid = self.models[model_name]['param_grid']
431
 
432
- # Adaptive CV folds based on dataset size
433
- cv_folds = min(self.cv_folds, len(X_train) // 10, 5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
  # Create GridSearchCV
436
  grid_search = GridSearchCV(
@@ -439,7 +483,7 @@ class RobustModelTrainer:
439
  cv=StratifiedKFold(n_splits=cv_folds,
440
  shuffle=True, random_state=self.random_state),
441
  scoring='f1_weighted',
442
- n_jobs=-1, # Use all cores
443
  verbose=0 # Reduce verbosity for speed
444
  )
445
 
@@ -451,6 +495,7 @@ class RobustModelTrainer:
451
  'best_params': grid_search.best_params_,
452
  'best_score': float(grid_search.best_score_),
453
  'best_estimator': grid_search.best_estimator_,
 
454
  'cv_results': {
455
  'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
456
  'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
@@ -465,12 +510,15 @@ class RobustModelTrainer:
465
  return grid_search.best_estimator_, tuning_results
466
 
467
  except Exception as e:
468
- logger.error(
469
- f"Hyperparameter tuning failed for {model_name}: {str(e)}")
470
  # Return basic model if tuning fails
471
- pipeline.set_params(model=self.models[model_name]['model'])
472
- pipeline.fit(X_train, y_train)
473
- return pipeline, {'error': str(e)}
 
 
 
 
474
 
475
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
476
  """Train and evaluate multiple models"""
@@ -543,23 +591,36 @@ class RobustModelTrainer:
543
  return best_model_name, best_model, best_metrics
544
 
545
  def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
546
- """Save model artifacts and metadata"""
547
  try:
548
  if self.progress_tracker:
549
  self.progress_tracker.update("Saving model")
550
 
551
- # Save the full pipeline
552
- joblib.dump(model, self.pipeline_path)
553
- logger.info(f"✅ Saved pipeline to {self.pipeline_path}")
 
 
 
 
 
 
 
554
 
555
  # Save individual components for backward compatibility
556
- if hasattr(model, 'named_steps') and 'model' in model.named_steps:
557
- joblib.dump(model.named_steps['model'], self.model_path)
558
- logger.info(f"✅ Saved model to {self.model_path}")
 
 
 
559
 
560
- if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
561
- joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
562
- logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
 
 
 
563
 
564
  # Generate data hash
565
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
@@ -587,16 +648,27 @@ class RobustModelTrainer:
587
  }
588
  }
589
 
590
- # Save metadata
591
- with open(self.metadata_path, 'w') as f:
592
- json.dump(metadata, f, indent=2)
 
 
 
 
593
 
594
  logger.info(f"✅ Model artifacts saved successfully")
595
  return True
596
 
597
  except Exception as e:
598
  logger.error(f"Failed to save model artifacts: {str(e)}")
599
- return False
 
 
 
 
 
 
 
600
 
601
  def train_model(self, data_path: str = None) -> Tuple[bool, str]:
602
  """Main training function with comprehensive pipeline"""
@@ -634,16 +706,34 @@ class RobustModelTrainer:
634
  X = df['text'].values
635
  y = df['label'].values
636
 
637
- # Train-test split
638
  self.progress_tracker.update("Splitting data")
 
 
 
 
 
 
 
 
 
 
 
 
639
  X_train, X_test, y_train, y_test = train_test_split(
640
  X, y,
641
- test_size=self.test_size,
642
- stratify=y if len(np.unique(y)) > 1 and len(y) > 10 else None,
643
  random_state=self.random_state
644
  )
645
 
646
  logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
 
 
 
 
 
 
647
 
648
  # Train and evaluate models
649
  results = self.train_and_evaluate_models(
 
196
  self.progress_tracker = None
197
 
198
  def setup_paths(self):
199
+ """Setup all necessary paths with proper permissions"""
200
  self.base_dir = Path("/tmp")
201
  self.data_dir = self.base_dir / "data"
202
  self.model_dir = self.base_dir / "model"
203
  self.results_dir = self.base_dir / "results"
204
 
205
+ # Create directories with proper permissions
206
  for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
207
  dir_path.mkdir(parents=True, exist_ok=True)
208
+ # Ensure write permissions
209
+ try:
210
+ dir_path.chmod(0o755)
211
+ except:
212
+ pass
213
 
214
  # File paths
215
  self.data_path = self.data_dir / "combined_dataset.csv"
216
+ self.model_path = Path("/tmp/model.pkl") # Direct path to avoid permission issues
217
+ self.vectorizer_path = Path("/tmp/vectorizer.pkl")
218
+ self.pipeline_path = Path("/tmp/pipeline.pkl")
219
  self.metadata_path = Path("/tmp/metadata.json")
220
  self.evaluation_path = self.results_dir / "evaluation_results.json"
221
 
 
300
  if len(unique_labels) < 2:
301
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
302
 
303
+ # Check minimum sample size - more lenient
304
+ if len(df) < 6:
305
+ return False, None, f"Insufficient samples for training: {len(df)} (minimum: 6)"
306
+
307
+ # Warning for small datasets
308
+ if len(df) < 50:
309
+ logger.warning(f"Small dataset detected: {len(df)} samples. Results may be unreliable.")
310
 
311
  # Check class balance
312
  label_counts = df['label'].value_counts()
 
388
  cm = confusion_matrix(y_test, y_pred)
389
  metrics['confusion_matrix'] = cm.tolist()
390
 
391
+ # Smart cross-validation based on dataset size
392
+ if X_train is not None and y_train is not None and len(X_train) >= 20:
393
  try:
394
+ # Calculate appropriate CV folds for small datasets
395
+ n_samples = len(X_train)
396
+ min_samples_per_fold = 3 # Minimum samples per fold
397
+ max_folds = n_samples // min_samples_per_fold
398
+ cv_folds = max(2, min(self.cv_folds, max_folds))
399
+
400
+ if cv_folds >= 2:
401
+ cv_scores = cross_val_score(
402
+ model, X_train, y_train,
403
+ cv=StratifiedKFold(
404
+ n_splits=cv_folds,
405
+ shuffle=True,
406
+ random_state=self.random_state
407
+ ),
408
+ scoring='f1_weighted',
409
+ n_jobs=1 # Single job for small datasets
410
+ )
411
+ metrics['cv_scores'] = {
412
+ 'mean': float(cv_scores.mean()),
413
+ 'std': float(cv_scores.std()),
414
+ 'scores': cv_scores.tolist(),
415
+ 'folds_used': cv_folds
416
+ }
417
+ else:
418
+ metrics['cv_scores'] = {'note': 'Dataset too small for reliable CV'}
419
  except Exception as e:
420
  logger.warning(f"Cross-validation failed: {e}")
421
+ metrics['cv_scores'] = {'note': f'CV failed: {str(e)}'}
422
  else:
423
+ metrics['cv_scores'] = {'note': 'Skipped for very small dataset'}
424
 
425
  # Training accuracy for overfitting detection
426
  try:
 
445
  # Set the model in the pipeline
446
  pipeline.set_params(model=self.models[model_name]['model'])
447
 
448
+ # Skip hyperparameter tuning for very small datasets
449
+ if len(X_train) < 20:
450
+ logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
451
+ pipeline.fit(X_train, y_train)
452
+ return pipeline, {
453
+ 'best_params': 'default_parameters',
454
+ 'best_score': 'not_calculated',
455
+ 'best_estimator': pipeline,
456
+ 'note': 'Hyperparameter tuning skipped for small dataset'
457
+ }
458
+
459
  # Get parameter grid
460
  param_grid = self.models[model_name]['param_grid']
461
 
462
+ # Calculate appropriate CV folds for small datasets
463
+ n_samples = len(X_train)
464
+ min_samples_per_fold = 3
465
+ max_folds = n_samples // min_samples_per_fold
466
+ cv_folds = max(2, min(self.cv_folds, max_folds))
467
+
468
+ if cv_folds < 2:
469
+ # Fallback to simple training
470
+ logger.info(f"Dataset too small for CV, using simple training for {model_name}")
471
+ pipeline.fit(X_train, y_train)
472
+ return pipeline, {
473
+ 'best_params': 'default_parameters',
474
+ 'best_score': 'not_calculated',
475
+ 'best_estimator': pipeline,
476
+ 'note': 'Simple training used due to very small dataset'
477
+ }
478
 
479
  # Create GridSearchCV
480
  grid_search = GridSearchCV(
 
483
  cv=StratifiedKFold(n_splits=cv_folds,
484
  shuffle=True, random_state=self.random_state),
485
  scoring='f1_weighted',
486
+ n_jobs=1, # Single job for small datasets
487
  verbose=0 # Reduce verbosity for speed
488
  )
489
 
 
495
  'best_params': grid_search.best_params_,
496
  'best_score': float(grid_search.best_score_),
497
  'best_estimator': grid_search.best_estimator_,
498
+ 'cv_folds_used': cv_folds,
499
  'cv_results': {
500
  'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
501
  'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
 
510
  return grid_search.best_estimator_, tuning_results
511
 
512
  except Exception as e:
513
+ logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}")
 
514
  # Return basic model if tuning fails
515
+ try:
516
+ pipeline.set_params(model=self.models[model_name]['model'])
517
+ pipeline.fit(X_train, y_train)
518
+ return pipeline, {'error': str(e), 'fallback': 'simple_training'}
519
+ except Exception as e2:
520
+ logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
521
+ raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
522
 
523
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
524
  """Train and evaluate multiple models"""
 
591
  return best_model_name, best_model, best_metrics
592
 
593
  def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
594
+ """Save model artifacts and metadata with robust error handling"""
595
  try:
596
  if self.progress_tracker:
597
  self.progress_tracker.update("Saving model")
598
 
599
+ # Save the full pipeline with error handling
600
+ try:
601
+ joblib.dump(model, self.pipeline_path)
602
+ logger.info(f"✅ Saved pipeline to {self.pipeline_path}")
603
+ except Exception as e:
604
+ logger.error(f"Failed to save pipeline: {e}")
605
+ # Try alternative path
606
+ alt_pipeline_path = Path("/tmp") / "pipeline.pkl"
607
+ joblib.dump(model, alt_pipeline_path)
608
+ logger.info(f"✅ Saved pipeline to {alt_pipeline_path}")
609
 
610
  # Save individual components for backward compatibility
611
+ try:
612
+ if hasattr(model, 'named_steps') and 'model' in model.named_steps:
613
+ joblib.dump(model.named_steps['model'], self.model_path)
614
+ logger.info(f"✅ Saved model to {self.model_path}")
615
+ except Exception as e:
616
+ logger.warning(f"Could not save model component: {e}")
617
 
618
+ try:
619
+ if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
620
+ joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
621
+ logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
622
+ except Exception as e:
623
+ logger.warning(f"Could not save vectorizer component: {e}")
624
 
625
  # Generate data hash
626
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
 
648
  }
649
  }
650
 
651
+ # Save metadata with error handling
652
+ try:
653
+ with open(self.metadata_path, 'w') as f:
654
+ json.dump(metadata, f, indent=2)
655
+ logger.info(f"✅ Saved metadata to {self.metadata_path}")
656
+ except Exception as e:
657
+ logger.warning(f"Could not save metadata: {e}")
658
 
659
  logger.info(f"✅ Model artifacts saved successfully")
660
  return True
661
 
662
  except Exception as e:
663
  logger.error(f"Failed to save model artifacts: {str(e)}")
664
+ # Try to save at least the core pipeline
665
+ try:
666
+ joblib.dump(model, Path("/tmp/pipeline_backup.pkl"))
667
+ logger.info("✅ Saved backup pipeline")
668
+ return True
669
+ except Exception as e2:
670
+ logger.error(f"Failed to save backup pipeline: {str(e2)}")
671
+ return False
672
 
673
  def train_model(self, data_path: str = None) -> Tuple[bool, str]:
674
  """Main training function with comprehensive pipeline"""
 
706
  X = df['text'].values
707
  y = df['label'].values
708
 
709
+ # Train-test split with smart handling for small datasets
710
  self.progress_tracker.update("Splitting data")
711
+
712
+ # Ensure minimum test size for very small datasets
713
+ if len(X) < 10:
714
+ test_size = max(0.1, 1/len(X)) # At least 1 sample for test
715
+ else:
716
+ test_size = self.test_size
717
+
718
+ # Check if stratification is possible
719
+ label_counts = pd.Series(y).value_counts()
720
+ min_class_count = label_counts.min()
721
+ can_stratify = min_class_count >= 2 and len(y) >= 4
722
+
723
  X_train, X_test, y_train, y_test = train_test_split(
724
  X, y,
725
+ test_size=test_size,
726
+ stratify=y if can_stratify else None,
727
  random_state=self.random_state
728
  )
729
 
730
  logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
731
+
732
+ # Additional validation for very small datasets
733
+ if len(X_train) < 3:
734
+ logger.warning(f"Very small training set: {len(X_train)} samples. Results may be unreliable.")
735
+ if len(X_test) < 1:
736
+ return False, "Cannot create test set. Dataset too small."
737
 
738
  # Train and evaluate models
739
  results = self.train_and_evaluate_models(