Ahmedik95316 committed on
Commit ce7aca5 · 1 Parent(s): 04e5963

Update app/streamlit_app.py

Files changed (1)
  1. app/streamlit_app.py +233 -35
app/streamlit_app.py CHANGED
 
@@ -281,6 +281,50 @@ def create_prediction_history_chart():
     return fig


+def estimate_detailed_training_time(dataset_size: int, enable_tuning: bool, cv_folds: int, num_models: int, max_features: int) -> str:
+    """Estimate training time based on detailed parameters"""
+
+    # Base time per sample (in seconds)
+    base_time_per_sample = 0.01
+
+    # Feature complexity multiplier
+    feature_multiplier = max_features / 5000  # Normalized to 5000 features
+
+    # Cross-validation multiplier
+    cv_multiplier = cv_folds
+
+    # Hyperparameter tuning multiplier
+    tuning_multiplier = 8 if enable_tuning else 1
+
+    # Model count multiplier
+    model_multiplier = num_models
+
+    # Calculate total time
+    total_seconds = (
+        dataset_size *
+        base_time_per_sample *
+        feature_multiplier *
+        cv_multiplier *
+        tuning_multiplier *
+        model_multiplier
+    )
+
+    # Add base overhead
+    total_seconds += 10  # Base overhead
+
+    # Format time
+    if total_seconds < 60:
+        return f"{int(total_seconds)} seconds"
+    elif total_seconds < 3600:
+        minutes = int(total_seconds // 60)
+        seconds = int(total_seconds % 60)
+        return f"{minutes}:{seconds:02d}"
+    else:
+        hours = int(total_seconds // 3600)
+        minutes = int((total_seconds % 3600) // 60)
+        return f"{hours}:{minutes:02d}:00"
+
+
 def estimate_training_time_streamlit(dataset_size: int) -> dict:
     """Estimate training time for Streamlit display"""
     if estimate_training_time:
 
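To get a sense of what the new estimator returns, here is a small standalone sanity check (illustrative inputs only; the function is exactly the one added above):

```python
# Quick sanity check of estimate_detailed_training_time (illustrative inputs).
# 1,000 samples, tuning on, 5-fold CV, 2 models, 5,000 features:
#   1000 * 0.01 * (5000/5000) * 5 * 8 * 2 + 10 = 810s
print(estimate_detailed_training_time(1000, True, 5, 2, 5000))   # "13:30"

# 50 samples, tuning off, 3-fold CV, 1 model, 2,000 features:
#   50 * 0.01 * (2000/5000) * 3 * 1 * 1 + 10 = 10.6s
print(estimate_detailed_training_time(50, False, 3, 1, 2000))    # "10 seconds"
```

Note that the hour branch formats as `{hours}:{minutes:02d}:00`, so any residual seconds beyond the minute are always rendered as `00`.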
@@ -351,42 +395,170 @@ def render_enhanced_training_section(df_train):
         st.plotly_chart(fig_labels, use_container_width=True)

     # Training configuration
-    with st.expander("⚙️ Training Configuration"):
+    with st.expander("⚙️ Training Configuration", expanded=True):
+        st.markdown("**Configure your training parameters:**")
+
         col1, col2 = st.columns(2)

         with col1:
-            if dataset_size < 20:
-                st.warning("⚠️ Very small dataset: Hyperparameter tuning will be skipped")
-                st.info("• Simple training only")
-                st.info("• Minimal cross-validation")
-            elif dataset_size < 50:
-                st.info("ℹ️ Small dataset: Limited hyperparameter tuning")
-                st.info("• Reduced parameter grids")
-                st.info("• 2-3 fold cross-validation")
-            else:
-                st.success("✅ Standard dataset: Full training pipeline")
-                st.info("• Complete hyperparameter tuning")
-                st.info("• 3-fold cross-validation")
-                st.info("• Model comparison")
-
+            st.markdown("##### Core Settings")
+
+            # Test size slider
+            test_size = st.slider(
+                "Test Set Size (%)",
+                min_value=10,
+                max_value=50,
+                value=20,
+                step=5,
+                help="Percentage of data reserved for testing"
+            )
+
+            # Cross-validation folds
+            cv_folds = st.slider(
+                "Cross-Validation Folds",
+                min_value=2,
+                max_value=10,
+                value=3 if dataset_size < 100 else 5,
+                step=1,
+                help="Number of folds for cross-validation"
+            )
+
+            # Hyperparameter tuning toggle
+            enable_tuning = st.checkbox(
+                "Enable Hyperparameter Tuning",
+                value=dataset_size >= 50,
+                help="Enable grid search for optimal parameters (recommended for 50+ samples)"
+            )
+
         with col2:
-            st.write("**Expected Features:**")
-            st.write(f"• TF-IDF vectorization")
-            st.write(f"• Feature selection")
-            st.write(f"• Logistic Regression")
-            if dataset_size >= 50:
-                st.write(f"• Random Forest comparison")
-            st.write(f"• Performance evaluation")
+            st.markdown("##### Advanced Options")
+
+            # Model selection
+            available_models = st.multiselect(
+                "Models to Train",
+                options=["Logistic Regression", "Random Forest"],
+                default=["Logistic Regression"] if dataset_size < 50 else ["Logistic Regression", "Random Forest"],
+                help="Select which models to train and compare"
+            )
+
+            # Feature engineering options
+            max_features = st.selectbox(
+                "Max TF-IDF Features",
+                options=[1000, 2000, 5000, 10000, 20000],
+                index=2 if dataset_size >= 100 else 1,
+                help="Maximum number of TF-IDF features to extract"
+            )
+
+            # N-gram range
+            ngram_option = st.selectbox(
+                "N-gram Range",
+                options=["Unigrams (1,1)", "Unigrams + Bigrams (1,2)", "Unigrams + Bigrams + Trigrams (1,3)"],
+                index=1,
+                help="Range of n-grams to include in feature extraction"
+            )
+
+        # Convert selections to parameters
+        ngram_map = {
+            "Unigrams (1,1)": (1, 1),
+            "Unigrams + Bigrams (1,2)": (1, 2),
+            "Unigrams + Bigrams + Trigrams (1,3)": (1, 3)
+        }
+        ngram_range = ngram_map[ngram_option]
+
+        model_map = {
+            "Logistic Regression": "logistic_regression",
+            "Random Forest": "random_forest"
+        }
+        selected_models = [model_map[model] for model in available_models]
+
+        # Training summary
+        st.markdown("---")
+        st.markdown("##### 📋 Training Summary")
+
+        summary_col1, summary_col2, summary_col3 = st.columns(3)
+
+        with summary_col1:
+            st.info(f"**Data Split:** {100-test_size}% train, {test_size}% test")
+            st.info(f"**Cross-Validation:** {cv_folds} folds")
+
+        with summary_col2:
+            tuning_status = "✅ Enabled" if enable_tuning else "❌ Disabled"
+            st.info(f"**Hyperparameter Tuning:** {tuning_status}")
+            st.info(f"**Models:** {len(selected_models)} selected")
+
+        with summary_col3:
+            st.info(f"**Max Features:** {max_features:,}")
+            st.info(f"**N-grams:** {ngram_range}")
+
+        # Warnings and recommendations
+        if dataset_size < 20:
+            st.warning("⚠️ **Very small dataset detected:**")
+            st.warning("• Hyperparameter tuning automatically disabled")
+            st.warning("• Results may be unreliable")
+            st.warning("• Consider using more data for better performance")
+
+        elif dataset_size < 50:
+            if enable_tuning:
+                st.warning("⚠️ **Small dataset with hyperparameter tuning:**")
+                st.warning("• Training may take longer")
+                st.warning("• Risk of overfitting")
+            else:
+                st.info("ℹ️ **Small dataset - good configuration**")
+
+        else:
+            if not enable_tuning:
+                st.info("ℹ️ **Large dataset without hyperparameter tuning:**")
+                st.info("• Training will be faster")
+                st.info("• Consider enabling tuning for better performance")
+            else:
+                st.success("✅ **Optimal configuration for your dataset size**")
+
+        # Estimated training time with new parameters
+        estimated_time = estimate_detailed_training_time(
+            dataset_size, enable_tuning, cv_folds, len(selected_models), max_features
+        )
+
+        st.markdown("---")
+        st.markdown(f"##### ⏱️ **Estimated Training Time: {estimated_time}**")

     # Training button and execution
     if st.button("🏃‍♂️ Start Training", type="primary", use_container_width=True):
-        # Save training data
+        # Validate configuration
+        if not selected_models:
+            st.error("❌ Please select at least one model to train!")
+            return
+
+        if dataset_size < 6:
+            st.error("❌ Dataset too small! Minimum 6 samples required.")
+            return
+
+        # Save training data with metadata
         app_manager.paths['custom_data'].parent.mkdir(parents=True, exist_ok=True)
         df_train.to_csv(app_manager.paths['custom_data'], index=False)

+        # Save training configuration
+        training_config = {
+            'test_size': test_size / 100,  # Convert percentage to decimal
+            'cv_folds': cv_folds,
+            'enable_tuning': enable_tuning,
+            'selected_models': selected_models,
+            'max_features': max_features,
+            'ngram_range': ngram_range,
+            'dataset_size': dataset_size
+        }
+
+        config_path = Path("/tmp/training_config.json")
+        with open(config_path, 'w') as f:
+            json.dump(training_config, f, indent=2)
+
         st.markdown("---")
         st.markdown("### 🔄 Training Progress")

+        # Show final configuration
+        st.info(f"🎯 **Training Configuration:** {len(selected_models)} model(s), "
+                f"{test_size}% test split, {cv_folds}-fold CV, "
+                f"{'with' if enable_tuning else 'without'} hyperparameter tuning")
+
         # Progress containers
         progress_col1, progress_col2 = st.columns([3, 1])

 
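One detail in the config handoff above is worth spelling out: JSON has no tuple type, so the `(1, 2)` ngram_range written by the UI comes back as a list when the file is re-read. A minimal round-trip sketch (the values are illustrative defaults, not taken from the source):

```python
import json

# Illustrative config mirroring the keys written to /tmp/training_config.json
training_config = {
    'test_size': 0.20,                 # 20% slider value / 100
    'cv_folds': 5,
    'enable_tuning': True,
    'selected_models': ['logistic_regression', 'random_forest'],
    'max_features': 5000,
    'ngram_range': (1, 2),             # tuple on the Streamlit side
    'dataset_size': 120,
}

# json.dump serializes the tuple as an array
restored = json.loads(json.dumps(training_config))
print(restored['ngram_range'])          # [1, 2]
print(tuple(restored['ngram_range']))   # (1, 2)
```

Whatever consumes the file on the training side should coerce `ngram_range` back to a tuple if its vectorizer expects one.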
 
@@ -402,17 +574,30 @@ def render_enhanced_training_section(df_train):

         if DIRECT_TRAINING_AVAILABLE:
             # Method 1: Direct function call (shows progress in real-time)
-            status_text.text("Status: Initializing direct training...")
+            status_text.text("Status: Initializing training with custom config...")
             progress_bar.progress(5)

             try:
                 # Create output capture
                 output_buffer = io.StringIO()

-                with st.spinner("Training model (direct method)..."):
+                with st.spinner("Training model with custom configuration..."):
+                    # Create trainer with custom config
+                    trainer = RobustModelTrainer()
+
+                    # Apply custom configuration
+                    trainer.test_size = training_config['test_size']
+                    trainer.cv_folds = training_config['cv_folds']
+                    trainer.max_features = training_config['max_features']
+                    trainer.ngram_range = training_config['ngram_range']
+
+                    # Filter models based on selection
+                    if len(selected_models) < len(trainer.models):
+                        all_models = trainer.models.copy()
+                        trainer.models = {k: v for k, v in all_models.items() if k in selected_models}
+
                     # Redirect stdout to capture progress
                     with contextlib.redirect_stdout(output_buffer):
-                        trainer = RobustModelTrainer()
                         success, message = trainer.train_model(
                             data_path=str(app_manager.paths['custom_data'])
                         )
 
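The direct path above configures the trainer by assigning instance attributes after construction, and filters `trainer.models` by the `logistic_regression` / `random_forest` keys produced by `model_map`. That only works if `RobustModelTrainer` exposes matching attribute and key names; a hypothetical sketch of the interface this diff assumes (the real class lives in `model/train.py` and is not shown in this commit):

```python
# Hypothetical interface sketch -- the real RobustModelTrainer is in model/train.py.
class RobustModelTrainer:
    def __init__(self):
        # Defaults that the Streamlit code overwrites after construction
        self.test_size = 0.2
        self.cv_folds = 3
        self.max_features = 5000
        self.ngram_range = (1, 2)
        # Keys must match model_map's values, or the selection filter drops nothing
        self.models = {
            "logistic_regression": None,  # placeholder for an estimator
            "random_forest": None,
        }

    def train_model(self, data_path: str):
        # Must read the instance attributes set above, not hard-coded values
        return True, f"trained {len(self.models)} model(s) on {data_path}"
```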
@@ -431,6 +616,10 @@ def render_enhanced_training_section(df_train):
                     st.success("🎉 **Training Completed Successfully!**")
                     st.info(f"📊 **{message}**")

+                    # Show configuration used
+                    with st.expander("⚙️ Configuration Used"):
+                        st.json(training_config)
+
                     # Show captured progress if available
                     if captured_output:
                         with st.expander("📈 Training Progress Details"):
 
@@ -451,18 +640,22 @@ def render_enhanced_training_section(df_train):
             progress_bar.progress(10)

             try:
-                # Simulate progress during subprocess execution
+                # Calculate progress steps based on configuration
+                num_steps = len(selected_models) * (8 if enable_tuning else 2) * cv_folds
                 progress_steps = [
                     (20, "Loading and validating data..."),
-                    (40, "Creating preprocessing pipeline..."),
-                    (60, "Training models..."),
-                    (80, "Evaluating performance..."),
+                    (30, f"Configuring {len(selected_models)} model(s)..."),
+                    (50, f"Training with {cv_folds}-fold cross-validation..."),
+                    (70, "Performing hyperparameter tuning..." if enable_tuning else "Training models..."),
+                    (85, "Evaluating performance..."),
                     (95, "Saving model artifacts...")
                 ]

-                # Start subprocess
+                # Start subprocess with config
                 process = subprocess.Popen(
-                    [sys.executable, "model/train.py", "--data_path", str(app_manager.paths['custom_data'])],
+                    [sys.executable, "model/train.py",
+                     "--data_path", str(app_manager.paths['custom_data']),
+                     "--config_path", str(config_path)],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.STDOUT,
                     universal_newlines=True
 
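The subprocess path now passes a `--config_path` flag to `model/train.py`. That script is outside this diff, so its handling of the flag is assumed; a minimal sketch of the argument parsing the flag implies:

```python
# Hypothetical sketch of how model/train.py might consume --config_path;
# the actual script is not part of this commit.
import argparse
import json
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", required=True)
parser.add_argument("--config_path", default=None)
args = parser.parse_args()

config = {}
if args.config_path and Path(args.config_path).exists():
    with open(args.config_path) as f:
        config = json.load(f)

# ngram_range round-trips through JSON as a list; coerce if a tuple is expected
ngram_range = tuple(config.get("ngram_range", (1, 2)))
```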
@@ -474,9 +667,9 @@ def render_enhanced_training_section(df_train):
                     elapsed = time.time() - start_time
                     time_display.text(f"Elapsed: {timedelta(seconds=int(elapsed))}")

-                    # Update progress based on elapsed time
+                    # Update progress based on elapsed time and configuration
                     if step_idx < len(progress_steps):
-                        expected_time = dataset_size * 0.1  # Rough estimate
+                        expected_time = dataset_size * 0.05 * (2 if enable_tuning else 1)
                         if elapsed > expected_time * (step_idx + 1) / len(progress_steps):
                             progress, status = progress_steps[step_idx]
                             progress_bar.progress(progress)
 
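The pacing heuristic advances one step each time the elapsed time crosses the next fraction of `expected_time`, so the six steps above fire at evenly spaced thresholds. A worked example (arithmetic only, using the formula from the diff):

```python
# Worked example of the progress pacing (arithmetic only).
dataset_size, enable_tuning = 1000, True
num_progress_steps = 6                 # the six (percent, status) steps above

expected_time = dataset_size * 0.05 * (2 if enable_tuning else 1)   # 100.0 seconds
for step_idx in range(num_progress_steps):
    threshold = expected_time * (step_idx + 1) / num_progress_steps
    print(f"step {step_idx} fires after ~{threshold:.1f}s")
# step 0 after ~16.7s, step 1 after ~33.3s, ..., step 5 after ~100.0s
```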
@@ -498,6 +691,10 @@ def render_enhanced_training_section(df_train):
                 if process.returncode == 0:
                     st.success("🎉 **Training Completed Successfully!**")

+                    # Show configuration used
+                    with st.expander("⚙️ Configuration Used"):
+                        st.json(training_config)
+
                     # Extract performance info from output
                     if stdout:
                         lines = stdout.strip().split('\n')
 
@@ -514,7 +711,8 @@ def render_enhanced_training_section(df_train):

                 else:
                     st.error("❌ **Training Failed**")
-                    st.code(stdout)
+                    with st.expander("🔍 Error Details"):
+                        st.code(stdout)

             except Exception as e:
                 st.error(f"❌ **Training Error:** {str(e)}")