Ahmedik95316 committed on
Commit ce7aca5 · 1 Parent(s): 04e5963

Update app/streamlit_app.py

Files changed (1)
  1. app/streamlit_app.py +233 -35
app/streamlit_app.py CHANGED
 
@@ -281,6 +281,50 @@ def create_prediction_history_chart():
     return fig


+def estimate_detailed_training_time(dataset_size: int, enable_tuning: bool, cv_folds: int, num_models: int, max_features: int) -> str:
+    """Estimate training time based on detailed parameters"""
+
+    # Base time per sample (in seconds)
+    base_time_per_sample = 0.01
+
+    # Feature complexity multiplier
+    feature_multiplier = max_features / 5000  # Normalized to 5000 features
+
+    # Cross-validation multiplier
+    cv_multiplier = cv_folds
+
+    # Hyperparameter tuning multiplier
+    tuning_multiplier = 8 if enable_tuning else 1
+
+    # Model count multiplier
+    model_multiplier = num_models
+
+    # Calculate total time
+    total_seconds = (
+        dataset_size *
+        base_time_per_sample *
+        feature_multiplier *
+        cv_multiplier *
+        tuning_multiplier *
+        model_multiplier
+    )
+
+    # Add base overhead
+    total_seconds += 10  # Base overhead
+
+    # Format time
+    if total_seconds < 60:
+        return f"{int(total_seconds)} seconds"
+    elif total_seconds < 3600:
+        minutes = int(total_seconds // 60)
+        seconds = int(total_seconds % 60)
+        return f"{minutes}:{seconds:02d}"
+    else:
+        hours = int(total_seconds // 3600)
+        minutes = int((total_seconds % 3600) // 60)
+        return f"{hours}:{minutes:02d}:00"
+
+
 def estimate_training_time_streamlit(dataset_size: int) -> dict:
     """Estimate training time for Streamlit display"""
     if estimate_training_time:
 
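To get a sense of what the new estimator returns, here is a small standalone sanity check (illustrative inputs only; the function is exactly the one added above):

```python
# Quick sanity check of estimate_detailed_training_time (illustrative inputs).
# 1,000 samples, tuning on, 5-fold CV, 2 models, 5,000 features:
#   1000 * 0.01 * (5000/5000) * 5 * 8 * 2 + 10 = 810s
print(estimate_detailed_training_time(1000, True, 5, 2, 5000))   # "13:30"

# 50 samples, tuning off, 3-fold CV, 1 model, 2,000 features:
#   50 * 0.01 * (2000/5000) * 3 * 1 * 1 + 10 = 10.6s
print(estimate_detailed_training_time(50, False, 3, 1, 2000))    # "10 seconds"
```

Note that the hour branch formats as `{hours}:{minutes:02d}:00`, so any residual seconds beyond the minute are always rendered as `00`.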
@@ -351,42 +395,170 @@ def render_enhanced_training_section(df_train):
         st.plotly_chart(fig_labels, use_container_width=True)

     # Training configuration
-    with st.expander("⚙️ Training Configuration"):
+    with st.expander("⚙️ Training Configuration", expanded=True):
+        st.markdown("**Configure your training parameters:**")
+
         col1, col2 = st.columns(2)

         with col1:
-            if dataset_size < 20:
-                st.warning("⚠️ Very small dataset: Hyperparameter tuning will be skipped")
-                st.info("• Simple training only")
-                st.info("• Minimal cross-validation")
-            elif dataset_size < 50:
-                st.info("ℹ️ Small dataset: Limited hyperparameter tuning")
-                st.info("• Reduced parameter grids")
-                st.info("• 2-3 fold cross-validation")
-            else:
-                st.success("✅ Standard dataset: Full training pipeline")
-                st.info("• Complete hyperparameter tuning")
-                st.info("• 3-fold cross-validation")
-                st.info("• Model comparison")
-
+            st.markdown("##### Core Settings")
+
+            # Test size slider
+            test_size = st.slider(
+                "Test Set Size (%)",
+                min_value=10,
+                max_value=50,
+                value=20,
+                step=5,
+                help="Percentage of data reserved for testing"
+            )
+
+            # Cross-validation folds
+            cv_folds = st.slider(
+                "Cross-Validation Folds",
+                min_value=2,
+                max_value=10,
+                value=3 if dataset_size < 100 else 5,
+                step=1,
+                help="Number of folds for cross-validation"
+            )
+
+            # Hyperparameter tuning toggle
+            enable_tuning = st.checkbox(
+                "Enable Hyperparameter Tuning",
+                value=dataset_size >= 50,
+                help="Enable grid search for optimal parameters (recommended for 50+ samples)"
+            )
+
         with col2:
-            st.write("**Expected Features:**")
-            st.write(f"• TF-IDF vectorization")
-            st.write(f"• Feature selection")
-            st.write(f"• Logistic Regression")
-            if dataset_size >= 50:
-                st.write(f"• Random Forest comparison")
-            st.write(f"• Performance evaluation")
+            st.markdown("##### Advanced Options")
+
+            # Model selection
+            available_models = st.multiselect(
+                "Models to Train",
+                options=["Logistic Regression", "Random Forest"],
+                default=["Logistic Regression"] if dataset_size < 50 else ["Logistic Regression", "Random Forest"],
+                help="Select which models to train and compare"
+            )
+
+            # Feature engineering options
+            max_features = st.selectbox(
+                "Max TF-IDF Features",
+                options=[1000, 2000, 5000, 10000, 20000],
+                index=2 if dataset_size >= 100 else 1,
+                help="Maximum number of TF-IDF features to extract"
+            )
+
+            # N-gram range
+            ngram_option = st.selectbox(
+                "N-gram Range",
+                options=["Unigrams (1,1)", "Unigrams + Bigrams (1,2)", "Unigrams + Bigrams + Trigrams (1,3)"],
+                index=1,
+                help="Range of n-grams to include in feature extraction"
+            )
+
+        # Convert selections to parameters
+        ngram_map = {
+            "Unigrams (1,1)": (1, 1),
+            "Unigrams + Bigrams (1,2)": (1, 2),
+            "Unigrams + Bigrams + Trigrams (1,3)": (1, 3)
+        }
+        ngram_range = ngram_map[ngram_option]
+
+        model_map = {
+            "Logistic Regression": "logistic_regression",
+            "Random Forest": "random_forest"
+        }
+        selected_models = [model_map[model] for model in available_models]
+
+        # Training summary
+        st.markdown("---")
+        st.markdown("##### 📋 Training Summary")
+
+        summary_col1, summary_col2, summary_col3 = st.columns(3)
+
+        with summary_col1:
+            st.info(f"**Data Split:** {100-test_size}% train, {test_size}% test")
+            st.info(f"**Cross-Validation:** {cv_folds} folds")
+
+        with summary_col2:
+            tuning_status = "✅ Enabled" if enable_tuning else "❌ Disabled"
+            st.info(f"**Hyperparameter Tuning:** {tuning_status}")
+            st.info(f"**Models:** {len(selected_models)} selected")
+
+        with summary_col3:
+            st.info(f"**Max Features:** {max_features:,}")
+            st.info(f"**N-grams:** {ngram_range}")
+
+        # Warnings and recommendations
+        if dataset_size < 20:
+            st.warning("⚠️ **Very small dataset detected:**")
+            st.warning("• Hyperparameter tuning automatically disabled")
+            st.warning("• Results may be unreliable")
+            st.warning("• Consider using more data for better performance")
+
+        elif dataset_size < 50:
+            if enable_tuning:
+                st.warning("⚠️ **Small dataset with hyperparameter tuning:**")
+                st.warning("• Training may take longer")
+                st.warning("• Risk of overfitting")
+            else:
+                st.info("ℹ️ **Small dataset - good configuration**")
+
+        else:
+            if not enable_tuning:
+                st.info("ℹ️ **Large dataset without hyperparameter tuning:**")
+                st.info("• Training will be faster")
+                st.info("• Consider enabling tuning for better performance")
+            else:
+                st.success("✅ **Optimal configuration for your dataset size**")
+
+        # Estimated training time with new parameters
+        estimated_time = estimate_detailed_training_time(
+            dataset_size, enable_tuning, cv_folds, len(selected_models), max_features
+        )
+
+        st.markdown("---")
+        st.markdown(f"##### ⏱️ **Estimated Training Time: {estimated_time}**")

     # Training button and execution
     if st.button("🏃‍♂️ Start Training", type="primary", use_container_width=True):
-        # Save training data
+        # Validate configuration
+        if not selected_models:
+            st.error("❌ Please select at least one model to train!")
+            return
+
+        if dataset_size < 6:
+            st.error("❌ Dataset too small! Minimum 6 samples required.")
+            return
+
+        # Save training data with metadata
         app_manager.paths['custom_data'].parent.mkdir(parents=True, exist_ok=True)
         df_train.to_csv(app_manager.paths['custom_data'], index=False)

+        # Save training configuration
+        training_config = {
+            'test_size': test_size / 100,  # Convert percentage to decimal
+            'cv_folds': cv_folds,
+            'enable_tuning': enable_tuning,
+            'selected_models': selected_models,
+            'max_features': max_features,
+            'ngram_range': ngram_range,
+            'dataset_size': dataset_size
+        }
+
+        config_path = Path("/tmp/training_config.json")
+        with open(config_path, 'w') as f:
+            json.dump(training_config, f, indent=2)
+
         st.markdown("---")
         st.markdown("### 🔄 Training Progress")

+        # Show final configuration
+        st.info(f"🎯 **Training Configuration:** {len(selected_models)} model(s), "
+                f"{test_size}% test split, {cv_folds}-fold CV, "
+                f"{'with' if enable_tuning else 'without'} hyperparameter tuning")
+
         # Progress containers
         progress_col1, progress_col2 = st.columns([3, 1])

 
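One detail in the config handoff above is worth spelling out: JSON has no tuple type, so the `(1, 2)` ngram_range written by the UI comes back as a list when the file is re-read. A minimal round-trip sketch (the values are illustrative defaults, not taken from the source):

```python
import json

# Illustrative config mirroring the keys written to /tmp/training_config.json
training_config = {
    'test_size': 0.20,                 # 20% slider value / 100
    'cv_folds': 5,
    'enable_tuning': True,
    'selected_models': ['logistic_regression', 'random_forest'],
    'max_features': 5000,
    'ngram_range': (1, 2),             # tuple on the Streamlit side
    'dataset_size': 120,
}

# json.dump serializes the tuple as an array
restored = json.loads(json.dumps(training_config))
print(restored['ngram_range'])          # [1, 2]
print(tuple(restored['ngram_range']))   # (1, 2)
```

Whatever consumes the file on the training side should coerce `ngram_range` back to a tuple if its vectorizer expects one.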
 
@@ -402,17 +574,30 @@ def render_enhanced_training_section(df_train):

         if DIRECT_TRAINING_AVAILABLE:
             # Method 1: Direct function call (shows progress in real-time)
-            status_text.text("Status: Initializing direct training...")
+            status_text.text("Status: Initializing training with custom config...")
             progress_bar.progress(5)

             try:
                 # Create output capture
                 output_buffer = io.StringIO()

-                with st.spinner("Training model (direct method)..."):
+                with st.spinner("Training model with custom configuration..."):
+                    # Create trainer with custom config
+                    trainer = RobustModelTrainer()
+
+                    # Apply custom configuration
+                    trainer.test_size = training_config['test_size']
+                    trainer.cv_folds = training_config['cv_folds']
+                    trainer.max_features = training_config['max_features']
+                    trainer.ngram_range = training_config['ngram_range']
+
+                    # Filter models based on selection
+                    if len(selected_models) < len(trainer.models):
+                        all_models = trainer.models.copy()
+                        trainer.models = {k: v for k, v in all_models.items() if k in selected_models}
+
                     # Redirect stdout to capture progress
                     with contextlib.redirect_stdout(output_buffer):
-                        trainer = RobustModelTrainer()
                         success, message = trainer.train_model(
                             data_path=str(app_manager.paths['custom_data'])
                         )
 
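The direct path above configures the trainer by assigning instance attributes after construction, and filters `trainer.models` by the `logistic_regression` / `random_forest` keys produced by `model_map`. That only works if `RobustModelTrainer` exposes matching attribute and key names; a hypothetical sketch of the interface this diff assumes (the real class lives in `model/train.py` and is not shown in this commit):

```python
# Hypothetical interface sketch -- the real RobustModelTrainer is in model/train.py.
class RobustModelTrainer:
    def __init__(self):
        # Defaults that the Streamlit code overwrites after construction
        self.test_size = 0.2
        self.cv_folds = 3
        self.max_features = 5000
        self.ngram_range = (1, 2)
        # Keys must match model_map's values, or the selection filter drops nothing
        self.models = {
            "logistic_regression": None,  # placeholder for an estimator
            "random_forest": None,
        }

    def train_model(self, data_path: str):
        # Must read the instance attributes set above, not hard-coded values
        return True, f"trained {len(self.models)} model(s) on {data_path}"
```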
@@ -431,6 +616,10 @@ def render_enhanced_training_section(df_train):
                     st.success("🎉 **Training Completed Successfully!**")
                     st.info(f"📊 **{message}**")

+                    # Show configuration used
+                    with st.expander("⚙️ Configuration Used"):
+                        st.json(training_config)
+
                     # Show captured progress if available
                     if captured_output:
                         with st.expander("📈 Training Progress Details"):
 
@@ -451,18 +640,22 @@ def render_enhanced_training_section(df_train):
             progress_bar.progress(10)

             try:
-                # Simulate progress during subprocess execution
+                # Calculate progress steps based on configuration
+                num_steps = len(selected_models) * (8 if enable_tuning else 2) * cv_folds
                 progress_steps = [
                     (20, "Loading and validating data..."),
-                    (40, "Creating preprocessing pipeline..."),
-                    (60, "Training models..."),
-                    (80, "Evaluating performance..."),
+                    (30, f"Configuring {len(selected_models)} model(s)..."),
+                    (50, f"Training with {cv_folds}-fold cross-validation..."),
+                    (70, "Performing hyperparameter tuning..." if enable_tuning else "Training models..."),
+                    (85, "Evaluating performance..."),
                     (95, "Saving model artifacts...")
                 ]

-                # Start subprocess
+                # Start subprocess with config
                 process = subprocess.Popen(
-                    [sys.executable, "model/train.py", "--data_path", str(app_manager.paths['custom_data'])],
+                    [sys.executable, "model/train.py",
+                     "--data_path", str(app_manager.paths['custom_data']),
+                     "--config_path", str(config_path)],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.STDOUT,
                     universal_newlines=True
 
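The subprocess path now passes a `--config_path` flag to `model/train.py`. That script is outside this diff, so its handling of the flag is assumed; a minimal sketch of the argument parsing the flag implies:

```python
# Hypothetical sketch of how model/train.py might consume --config_path;
# the actual script is not part of this commit.
import argparse
import json
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", required=True)
parser.add_argument("--config_path", default=None)
args = parser.parse_args()

config = {}
if args.config_path and Path(args.config_path).exists():
    with open(args.config_path) as f:
        config = json.load(f)

# ngram_range round-trips through JSON as a list; coerce if a tuple is expected
ngram_range = tuple(config.get("ngram_range", (1, 2)))
```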
@@ -474,9 +667,9 @@ def render_enhanced_training_section(df_train):
                     elapsed = time.time() - start_time
                     time_display.text(f"Elapsed: {timedelta(seconds=int(elapsed))}")

-                    # Update progress based on elapsed time
+                    # Update progress based on elapsed time and configuration
                     if step_idx < len(progress_steps):
-                        expected_time = dataset_size * 0.1  # Rough estimate
+                        expected_time = dataset_size * 0.05 * (2 if enable_tuning else 1)
                         if elapsed > expected_time * (step_idx + 1) / len(progress_steps):
                             progress, status = progress_steps[step_idx]
                             progress_bar.progress(progress)
 
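The pacing heuristic advances one step each time the elapsed time crosses the next fraction of `expected_time`, so the six steps above fire at evenly spaced thresholds. A worked example (arithmetic only, using the formula from the diff):

```python
# Worked example of the progress pacing (arithmetic only).
dataset_size, enable_tuning = 1000, True
num_progress_steps = 6                 # the six (percent, status) steps above

expected_time = dataset_size * 0.05 * (2 if enable_tuning else 1)   # 100.0 seconds
for step_idx in range(num_progress_steps):
    threshold = expected_time * (step_idx + 1) / num_progress_steps
    print(f"step {step_idx} fires after ~{threshold:.1f}s")
# step 0 after ~16.7s, step 1 after ~33.3s, ..., step 5 after ~100.0s
```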
@@ -498,6 +691,10 @@ def render_enhanced_training_section(df_train):
                 if process.returncode == 0:
                     st.success("🎉 **Training Completed Successfully!**")

+                    # Show configuration used
+                    with st.expander("⚙️ Configuration Used"):
+                        st.json(training_config)
+
                     # Extract performance info from output
                     if stdout:
                         lines = stdout.strip().split('\n')
 
@@ -514,7 +711,8 @@ def render_enhanced_training_section(df_train):

                 else:
                     st.error("❌ **Training Failed**")
-                    st.code(stdout)
+                    with st.expander("🔍 Error Details"):
+                        st.code(stdout)

             except Exception as e:
                 st.error(f"❌ **Training Error:** {str(e)}")