Commit 34841ba
Parent(s): 719d51e

Update model/train.py

model/train.py CHANGED (+142 -52)
@@ -196,21 +196,26 @@ class RobustModelTrainer:
         self.progress_tracker = None
 
     def setup_paths(self):
-        """Setup all necessary paths"""
+        """Setup all necessary paths with proper permissions"""
        self.base_dir = Path("/tmp")
        self.data_dir = self.base_dir / "data"
        self.model_dir = self.base_dir / "model"
        self.results_dir = self.base_dir / "results"
 
-        # Create directories
+        # Create directories with proper permissions
        for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
            dir_path.mkdir(parents=True, exist_ok=True)
+            # Ensure write permissions
+            try:
+                dir_path.chmod(0o755)
+            except:
+                pass
 
        # File paths
        self.data_path = self.data_dir / "combined_dataset.csv"
-        self.model_path =
-        self.vectorizer_path =
-        self.pipeline_path =
+        self.model_path = Path("/tmp/model.pkl")  # Direct path to avoid permission issues
+        self.vectorizer_path = Path("/tmp/vectorizer.pkl")
+        self.pipeline_path = Path("/tmp/pipeline.pkl")
        self.metadata_path = Path("/tmp/metadata.json")
        self.evaluation_path = self.results_dir / "evaluation_results.json"
 
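The chmod guard is the commit's main fix for permission errors under /tmp. Below is a minimal standalone sketch of the same pattern, assuming only the standard library; the paths mirror the diff, and the try/except exists because chmod can legitimately fail on restricted or read-only filesystems:

    from pathlib import Path

    base_dir = Path("/tmp")
    for dir_path in [base_dir / "data", base_dir / "model", base_dir / "results"]:
        dir_path.mkdir(parents=True, exist_ok=True)  # idempotent creation
        try:
            dir_path.chmod(0o755)  # rwxr-xr-x: owner writes, everyone reads
        except OSError:
            pass  # tolerate filesystems that reject chmod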
@@ -295,9 +300,13 @@ class RobustModelTrainer:
        if len(unique_labels) < 2:
            return False, None, f"Need at least 2 classes, found: {unique_labels}"
 
-        # Check minimum sample size
-        if len(df) <
-            return False, None, f"Insufficient samples for training: {len(df)}"
+        # Check minimum sample size - more lenient
+        if len(df) < 6:
+            return False, None, f"Insufficient samples for training: {len(df)} (minimum: 6)"
+
+        # Warning for small datasets
+        if len(df) < 50:
+            logger.warning(f"Small dataset detected: {len(df)} samples. Results may be unreliable.")
 
        # Check class balance
        label_counts = df['label'].value_counts()
@@ -379,29 +388,39 @@ class RobustModelTrainer:
        cm = confusion_matrix(y_test, y_pred)
        metrics['confusion_matrix'] = cm.tolist()
 
-        #
-        if X_train is not None and y_train is not None and len(X_train) >=
+        # Smart cross-validation based on dataset size
+        if X_train is not None and y_train is not None and len(X_train) >= 20:
            try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Calculate appropriate CV folds for small datasets
+                n_samples = len(X_train)
+                min_samples_per_fold = 3  # Minimum samples per fold
+                max_folds = n_samples // min_samples_per_fold
+                cv_folds = max(2, min(self.cv_folds, max_folds))
+
+                if cv_folds >= 2:
+                    cv_scores = cross_val_score(
+                        model, X_train, y_train,
+                        cv=StratifiedKFold(
+                            n_splits=cv_folds,
+                            shuffle=True,
+                            random_state=self.random_state
+                        ),
+                        scoring='f1_weighted',
+                        n_jobs=1  # Single job for small datasets
+                    )
+                    metrics['cv_scores'] = {
+                        'mean': float(cv_scores.mean()),
+                        'std': float(cv_scores.std()),
+                        'scores': cv_scores.tolist(),
+                        'folds_used': cv_folds
+                    }
+                else:
+                    metrics['cv_scores'] = {'note': 'Dataset too small for reliable CV'}
            except Exception as e:
                logger.warning(f"Cross-validation failed: {e}")
-                metrics['cv_scores'] =
+                metrics['cv_scores'] = {'note': f'CV failed: {str(e)}'}
        else:
-            metrics['cv_scores'] = {'note': 'Skipped for small dataset'}
+            metrics['cv_scores'] = {'note': 'Skipped for very small dataset'}
 
        # Training accuracy for overfitting detection
        try:
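The fold arithmetic above is worth tracing by hand. A minimal sketch of the same logic (the function name adaptive_folds is illustrative, not from the commit):

    def adaptive_folds(n_samples: int, configured_folds: int = 5,
                       min_samples_per_fold: int = 3) -> int:
        # Cap the configured fold count so each fold keeps at least
        # min_samples_per_fold samples, but never drop below 2 folds.
        max_folds = n_samples // min_samples_per_fold
        return max(2, min(configured_folds, max_folds))

    assert adaptive_folds(20) == 5   # 20 // 3 = 6, so the configured 5 stands
    assert adaptive_folds(7) == 2    # 7 // 3 = 2, clamped to the floor of 2
    assert adaptive_folds(100) == 5  # ample data: configured value wins

Note that under the surrounding len(X_train) >= 20 guard, max_folds is at least 6, so cv_folds is always >= 2 and the inner else branch is purely defensive.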
@@ -426,11 +445,36 @@ class RobustModelTrainer:
        # Set the model in the pipeline
        pipeline.set_params(model=self.models[model_name]['model'])
 
+        # Skip hyperparameter tuning for very small datasets
+        if len(X_train) < 20:
+            logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
+            pipeline.fit(X_train, y_train)
+            return pipeline, {
+                'best_params': 'default_parameters',
+                'best_score': 'not_calculated',
+                'best_estimator': pipeline,
+                'note': 'Hyperparameter tuning skipped for small dataset'
+            }
+
        # Get parameter grid
        param_grid = self.models[model_name]['param_grid']
 
-        #
-
+        # Calculate appropriate CV folds for small datasets
+        n_samples = len(X_train)
+        min_samples_per_fold = 3
+        max_folds = n_samples // min_samples_per_fold
+        cv_folds = max(2, min(self.cv_folds, max_folds))
+
+        if cv_folds < 2:
+            # Fallback to simple training
+            logger.info(f"Dataset too small for CV, using simple training for {model_name}")
+            pipeline.fit(X_train, y_train)
+            return pipeline, {
+                'best_params': 'default_parameters',
+                'best_score': 'not_calculated',
+                'best_estimator': pipeline,
+                'note': 'Simple training used due to very small dataset'
+            }
 
        # Create GridSearchCV
        grid_search = GridSearchCV(
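For context, a hedged end-to-end sketch of the same small-data guard around a grid search, assuming scikit-learn; the step names 'vectorize' and 'model' match the pipeline referenced elsewhere in the diff, while the parameter grid, threshold, and function name are illustrative:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.pipeline import Pipeline

    def fit_with_optional_tuning(X_train, y_train, cv_folds=3, random_state=42):
        pipeline = Pipeline([
            ("vectorize", TfidfVectorizer()),
            ("model", LogisticRegression(max_iter=1000)),
        ])
        # Small datasets: a grid search over tiny folds is mostly noise, so just fit.
        if len(X_train) < 20 or cv_folds < 2:
            pipeline.fit(X_train, y_train)
            return pipeline, {"note": "tuning skipped"}
        grid = GridSearchCV(
            pipeline,
            param_grid={"model__C": [0.1, 1.0, 10.0]},  # illustrative grid
            cv=StratifiedKFold(n_splits=cv_folds, shuffle=True,
                               random_state=random_state),
            scoring="f1_weighted",
            n_jobs=1,
        )
        grid.fit(X_train, y_train)
        return grid.best_estimator_, {"best_params": grid.best_params_}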
@@ -439,7 +483,7 @@ class RobustModelTrainer:
            cv=StratifiedKFold(n_splits=cv_folds,
                               shuffle=True, random_state=self.random_state),
            scoring='f1_weighted',
-            n_jobs
+            n_jobs=1,  # Single job for small datasets
            verbose=0  # Reduce verbosity for speed
        )
 
@@ -451,6 +495,7 @@ class RobustModelTrainer:
            'best_params': grid_search.best_params_,
            'best_score': float(grid_search.best_score_),
            'best_estimator': grid_search.best_estimator_,
+            'cv_folds_used': cv_folds,
            'cv_results': {
                'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
                'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
@@ -465,12 +510,15 @@ class RobustModelTrainer:
            return grid_search.best_estimator_, tuning_results
 
        except Exception as e:
-            logger.error(
-                f"Hyperparameter tuning failed for {model_name}: {str(e)}")
+            logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}")
            # Return basic model if tuning fails
-
-
-
+            try:
+                pipeline.set_params(model=self.models[model_name]['model'])
+                pipeline.fit(X_train, y_train)
+                return pipeline, {'error': str(e), 'fallback': 'simple_training'}
+            except Exception as e2:
+                logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
+                raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
 
    def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
        """Train and evaluate multiple models"""
@@ -543,23 +591,36 @@ class RobustModelTrainer:
        return best_model_name, best_model, best_metrics
 
    def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
-        """Save model artifacts and metadata"""
+        """Save model artifacts and metadata with robust error handling"""
        try:
            if self.progress_tracker:
                self.progress_tracker.update("Saving model")
 
-            # Save the full pipeline
-
-
+            # Save the full pipeline with error handling
+            try:
+                joblib.dump(model, self.pipeline_path)
+                logger.info(f"✅ Saved pipeline to {self.pipeline_path}")
+            except Exception as e:
+                logger.error(f"Failed to save pipeline: {e}")
+                # Try alternative path
+                alt_pipeline_path = Path("/tmp") / "pipeline.pkl"
+                joblib.dump(model, alt_pipeline_path)
+                logger.info(f"✅ Saved pipeline to {alt_pipeline_path}")
 
            # Save individual components for backward compatibility
-
-
-
+            try:
+                if hasattr(model, 'named_steps') and 'model' in model.named_steps:
+                    joblib.dump(model.named_steps['model'], self.model_path)
+                    logger.info(f"✅ Saved model to {self.model_path}")
+            except Exception as e:
+                logger.warning(f"Could not save model component: {e}")
 
-
-
-
+            try:
+                if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
+                    joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
+                    logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
+            except Exception as e:
+                logger.warning(f"Could not save vectorizer component: {e}")
 
            # Generate data hash
            data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
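The component extraction above relies on scikit-learn's named_steps mapping on fitted pipelines. A condensed sketch of the same save path, assuming joblib and the /tmp paths from setup_paths:

    import joblib
    from pathlib import Path

    def save_artifacts(pipeline) -> None:
        joblib.dump(pipeline, Path("/tmp/pipeline.pkl"))  # full pipeline first
        steps = getattr(pipeline, "named_steps", {})      # {} for non-pipelines
        if "model" in steps:
            joblib.dump(steps["model"], Path("/tmp/model.pkl"))
        if "vectorize" in steps:
            joblib.dump(steps["vectorize"], Path("/tmp/vectorizer.pkl"))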
@@ -587,16 +648,27 @@ class RobustModelTrainer:
                }
            }
 
-            # Save metadata
-
-
+            # Save metadata with error handling
+            try:
+                with open(self.metadata_path, 'w') as f:
+                    json.dump(metadata, f, indent=2)
+                logger.info(f"✅ Saved metadata to {self.metadata_path}")
+            except Exception as e:
+                logger.warning(f"Could not save metadata: {e}")
 
            logger.info(f"✅ Model artifacts saved successfully")
            return True
 
        except Exception as e:
            logger.error(f"Failed to save model artifacts: {str(e)}")
-
+            # Try to save at least the core pipeline
+            try:
+                joblib.dump(model, Path("/tmp/pipeline_backup.pkl"))
+                logger.info("✅ Saved backup pipeline")
+                return True
+            except Exception as e2:
+                logger.error(f"Failed to save backup pipeline: {str(e2)}")
+                return False
 
    def train_model(self, data_path: str = None) -> Tuple[bool, str]:
        """Main training function with comprehensive pipeline"""
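The metadata write follows the same log-and-continue policy as the other saves. A minimal sketch with illustrative metadata values:

    import json
    from pathlib import Path

    metadata = {"model_name": "example_model", "version": 1}  # illustrative
    try:
        with open(Path("/tmp/metadata.json"), "w") as f:
            json.dump(metadata, f, indent=2)
    except OSError as e:
        print(f"Could not save metadata: {e}")  # non-fatal: log and continue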
@@ -634,16 +706,34 @@ class RobustModelTrainer:
            X = df['text'].values
            y = df['label'].values
 
-            # Train-test split
+            # Train-test split with smart handling for small datasets
            self.progress_tracker.update("Splitting data")
+
+            # Ensure minimum test size for very small datasets
+            if len(X) < 10:
+                test_size = max(0.1, 1/len(X))  # At least 1 sample for test
+            else:
+                test_size = self.test_size
+
+            # Check if stratification is possible
+            label_counts = pd.Series(y).value_counts()
+            min_class_count = label_counts.min()
+            can_stratify = min_class_count >= 2 and len(y) >= 4
+
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
-                test_size=
-                stratify=y if
+                test_size=test_size,
+                stratify=y if can_stratify else None,
                random_state=self.random_state
            )
 
            logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
+
+            # Additional validation for very small datasets
+            if len(X_train) < 3:
+                logger.warning(f"Very small training set: {len(X_train)} samples. Results may be unreliable.")
+            if len(X_test) < 1:
+                return False, "Cannot create test set. Dataset too small."
 
            # Train and evaluate models
            results = self.train_and_evaluate_models(
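Finally, the stratification guard: scikit-learn refuses to stratify a split when any class has fewer than two members, which is exactly what can_stratify screens for. A worked example with a toy label vector:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    y = [0, 0, 1, 1, 1]                  # class 0 twice, class 1 three times
    min_class_count = pd.Series(y).value_counts().min()
    can_stratify = min_class_count >= 2 and len(y) >= 4  # True here

    X_train, X_test, y_train, y_test = train_test_split(
        list(range(len(y))), y,
        test_size=0.4,                   # 2 of 5 samples held out
        stratify=y if can_stratify else None,
        random_state=42,
    )
    # With y = [0, 1, 1] instead, min_class_count would be 1 and the
    # split would silently fall back to an unstratified shuffle.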