Commit 34841ba
Parent(s): 719d51e

Update model/train.py

model/train.py CHANGED (+142 -52)
@@ -196,21 +196,26 @@ class RobustModelTrainer:
         self.progress_tracker = None
 
     def setup_paths(self):
-        """Setup all necessary paths"""
+        """Setup all necessary paths with proper permissions"""
        self.base_dir = Path("/tmp")
        self.data_dir = self.base_dir / "data"
        self.model_dir = self.base_dir / "model"
        self.results_dir = self.base_dir / "results"
 
-        # Create directories
+        # Create directories with proper permissions
        for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
            dir_path.mkdir(parents=True, exist_ok=True)
+            # Ensure write permissions
+            try:
+                dir_path.chmod(0o755)
+            except:
+                pass
 
        # File paths
        self.data_path = self.data_dir / "combined_dataset.csv"
-        self.model_path =
-        self.vectorizer_path =
-        self.pipeline_path =
+        self.model_path = Path("/tmp/model.pkl")  # Direct path to avoid permission issues
+        self.vectorizer_path = Path("/tmp/vectorizer.pkl")
+        self.pipeline_path = Path("/tmp/pipeline.pkl")
        self.metadata_path = Path("/tmp/metadata.json")
        self.evaluation_path = self.results_dir / "evaluation_results.json"
 
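The chmod guard is the commit's main fix for permission errors under /tmp. Below is a minimal standalone sketch of the same pattern, assuming only the standard library; the paths mirror the diff, and the try/except exists because chmod can legitimately fail on restricted or read-only filesystems:

    from pathlib import Path

    base_dir = Path("/tmp")
    for dir_path in [base_dir / "data", base_dir / "model", base_dir / "results"]:
        dir_path.mkdir(parents=True, exist_ok=True)  # idempotent creation
        try:
            dir_path.chmod(0o755)  # rwxr-xr-x: owner writes, everyone reads
        except OSError:
            pass  # tolerate filesystems that reject chmod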
@@ -295,9 +300,13 @@ class RobustModelTrainer:
        if len(unique_labels) < 2:
            return False, None, f"Need at least 2 classes, found: {unique_labels}"
 
-        # Check minimum sample size
-        if len(df) <
-            return False, None, f"Insufficient samples for training: {len(df)}"
+        # Check minimum sample size - more lenient
+        if len(df) < 6:
+            return False, None, f"Insufficient samples for training: {len(df)} (minimum: 6)"
+
+        # Warning for small datasets
+        if len(df) < 50:
+            logger.warning(f"Small dataset detected: {len(df)} samples. Results may be unreliable.")
 
        # Check class balance
        label_counts = df['label'].value_counts()
@@ -379,29 +388,39 @@ class RobustModelTrainer:
        cm = confusion_matrix(y_test, y_pred)
        metrics['confusion_matrix'] = cm.tolist()
 
-        #
-        if X_train is not None and y_train is not None and len(X_train) >=
+        # Smart cross-validation based on dataset size
+        if X_train is not None and y_train is not None and len(X_train) >= 20:
            try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Calculate appropriate CV folds for small datasets
+                n_samples = len(X_train)
+                min_samples_per_fold = 3  # Minimum samples per fold
+                max_folds = n_samples // min_samples_per_fold
+                cv_folds = max(2, min(self.cv_folds, max_folds))
+
+                if cv_folds >= 2:
+                    cv_scores = cross_val_score(
+                        model, X_train, y_train,
+                        cv=StratifiedKFold(
+                            n_splits=cv_folds,
+                            shuffle=True,
+                            random_state=self.random_state
+                        ),
+                        scoring='f1_weighted',
+                        n_jobs=1  # Single job for small datasets
+                    )
+                    metrics['cv_scores'] = {
+                        'mean': float(cv_scores.mean()),
+                        'std': float(cv_scores.std()),
+                        'scores': cv_scores.tolist(),
+                        'folds_used': cv_folds
+                    }
+                else:
+                    metrics['cv_scores'] = {'note': 'Dataset too small for reliable CV'}
            except Exception as e:
                logger.warning(f"Cross-validation failed: {e}")
-                metrics['cv_scores'] =
+                metrics['cv_scores'] = {'note': f'CV failed: {str(e)}'}
        else:
-            metrics['cv_scores'] = {'note': 'Skipped for small dataset'}
+            metrics['cv_scores'] = {'note': 'Skipped for very small dataset'}
 
        # Training accuracy for overfitting detection
        try:
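The fold arithmetic above is worth tracing by hand. A minimal sketch of the same logic (the function name adaptive_folds is illustrative, not from the commit):

    def adaptive_folds(n_samples: int, configured_folds: int = 5,
                       min_samples_per_fold: int = 3) -> int:
        # Cap the configured fold count so each fold keeps at least
        # min_samples_per_fold samples, but never drop below 2 folds.
        max_folds = n_samples // min_samples_per_fold
        return max(2, min(configured_folds, max_folds))

    assert adaptive_folds(20) == 5   # 20 // 3 = 6, so the configured 5 stands
    assert adaptive_folds(7) == 2    # 7 // 3 = 2, clamped to the floor of 2
    assert adaptive_folds(100) == 5  # ample data: configured value wins

Note that under the surrounding len(X_train) >= 20 guard, max_folds is at least 6, so cv_folds is always >= 2 and the inner else branch is purely defensive.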
@@ -426,11 +445,36 @@ class RobustModelTrainer:
        # Set the model in the pipeline
        pipeline.set_params(model=self.models[model_name]['model'])
 
+        # Skip hyperparameter tuning for very small datasets
+        if len(X_train) < 20:
+            logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
+            pipeline.fit(X_train, y_train)
+            return pipeline, {
+                'best_params': 'default_parameters',
+                'best_score': 'not_calculated',
+                'best_estimator': pipeline,
+                'note': 'Hyperparameter tuning skipped for small dataset'
+            }
+
        # Get parameter grid
        param_grid = self.models[model_name]['param_grid']
 
-        #
-
+        # Calculate appropriate CV folds for small datasets
+        n_samples = len(X_train)
+        min_samples_per_fold = 3
+        max_folds = n_samples // min_samples_per_fold
+        cv_folds = max(2, min(self.cv_folds, max_folds))
+
+        if cv_folds < 2:
+            # Fallback to simple training
+            logger.info(f"Dataset too small for CV, using simple training for {model_name}")
+            pipeline.fit(X_train, y_train)
+            return pipeline, {
+                'best_params': 'default_parameters',
+                'best_score': 'not_calculated',
+                'best_estimator': pipeline,
+                'note': 'Simple training used due to very small dataset'
+            }
 
        # Create GridSearchCV
        grid_search = GridSearchCV(
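For context, a hedged end-to-end sketch of the same small-data guard around a grid search, assuming scikit-learn; the step names 'vectorize' and 'model' match the pipeline referenced elsewhere in the diff, while the parameter grid, threshold, and function name are illustrative:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.pipeline import Pipeline

    def fit_with_optional_tuning(X_train, y_train, cv_folds=3, random_state=42):
        pipeline = Pipeline([
            ("vectorize", TfidfVectorizer()),
            ("model", LogisticRegression(max_iter=1000)),
        ])
        # Small datasets: a grid search over tiny folds is mostly noise, so just fit.
        if len(X_train) < 20 or cv_folds < 2:
            pipeline.fit(X_train, y_train)
            return pipeline, {"note": "tuning skipped"}
        grid = GridSearchCV(
            pipeline,
            param_grid={"model__C": [0.1, 1.0, 10.0]},  # illustrative grid
            cv=StratifiedKFold(n_splits=cv_folds, shuffle=True,
                               random_state=random_state),
            scoring="f1_weighted",
            n_jobs=1,
        )
        grid.fit(X_train, y_train)
        return grid.best_estimator_, {"best_params": grid.best_params_}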
@@ -439,7 +483,7 @@ class RobustModelTrainer:
            cv=StratifiedKFold(n_splits=cv_folds,
                               shuffle=True, random_state=self.random_state),
            scoring='f1_weighted',
-            n_jobs
+            n_jobs=1,  # Single job for small datasets
            verbose=0  # Reduce verbosity for speed
        )
 
@@ -451,6 +495,7 @@ class RobustModelTrainer:
            'best_params': grid_search.best_params_,
            'best_score': float(grid_search.best_score_),
            'best_estimator': grid_search.best_estimator_,
+            'cv_folds_used': cv_folds,
            'cv_results': {
                'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
                'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
@@ -465,12 +510,15 @@ class RobustModelTrainer:
            return grid_search.best_estimator_, tuning_results
 
        except Exception as e:
-            logger.error(
-                f"Hyperparameter tuning failed for {model_name}: {str(e)}")
+            logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}")
            # Return basic model if tuning fails
-
-
-
+            try:
+                pipeline.set_params(model=self.models[model_name]['model'])
+                pipeline.fit(X_train, y_train)
+                return pipeline, {'error': str(e), 'fallback': 'simple_training'}
+            except Exception as e2:
+                logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
+                raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
 
    def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
        """Train and evaluate multiple models"""
@@ -543,23 +591,36 @@ class RobustModelTrainer:
        return best_model_name, best_model, best_metrics
 
    def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
-        """Save model artifacts and metadata"""
+        """Save model artifacts and metadata with robust error handling"""
        try:
            if self.progress_tracker:
                self.progress_tracker.update("Saving model")
 
-            # Save the full pipeline
-
-
+            # Save the full pipeline with error handling
+            try:
+                joblib.dump(model, self.pipeline_path)
+                logger.info(f"✅ Saved pipeline to {self.pipeline_path}")
+            except Exception as e:
+                logger.error(f"Failed to save pipeline: {e}")
+                # Try alternative path
+                alt_pipeline_path = Path("/tmp") / "pipeline.pkl"
+                joblib.dump(model, alt_pipeline_path)
+                logger.info(f"✅ Saved pipeline to {alt_pipeline_path}")
 
            # Save individual components for backward compatibility
-
-
-
+            try:
+                if hasattr(model, 'named_steps') and 'model' in model.named_steps:
+                    joblib.dump(model.named_steps['model'], self.model_path)
+                    logger.info(f"✅ Saved model to {self.model_path}")
+            except Exception as e:
+                logger.warning(f"Could not save model component: {e}")
 
-
-
-
+            try:
+                if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
+                    joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
+                    logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
+            except Exception as e:
+                logger.warning(f"Could not save vectorizer component: {e}")
 
            # Generate data hash
            data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
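The component extraction above relies on scikit-learn's named_steps mapping on fitted pipelines. A condensed sketch of the same save path, assuming joblib and the /tmp paths from setup_paths:

    import joblib
    from pathlib import Path

    def save_artifacts(pipeline) -> None:
        joblib.dump(pipeline, Path("/tmp/pipeline.pkl"))  # full pipeline first
        steps = getattr(pipeline, "named_steps", {})      # {} for non-pipelines
        if "model" in steps:
            joblib.dump(steps["model"], Path("/tmp/model.pkl"))
        if "vectorize" in steps:
            joblib.dump(steps["vectorize"], Path("/tmp/vectorizer.pkl"))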
@@ -587,16 +648,27 @@ class RobustModelTrainer:
                }
            }
 
-            # Save metadata
-
-
+            # Save metadata with error handling
+            try:
+                with open(self.metadata_path, 'w') as f:
+                    json.dump(metadata, f, indent=2)
+                logger.info(f"✅ Saved metadata to {self.metadata_path}")
+            except Exception as e:
+                logger.warning(f"Could not save metadata: {e}")
 
            logger.info(f"✅ Model artifacts saved successfully")
            return True
 
        except Exception as e:
            logger.error(f"Failed to save model artifacts: {str(e)}")
-
+            # Try to save at least the core pipeline
+            try:
+                joblib.dump(model, Path("/tmp/pipeline_backup.pkl"))
+                logger.info("✅ Saved backup pipeline")
+                return True
+            except Exception as e2:
+                logger.error(f"Failed to save backup pipeline: {str(e2)}")
+                return False
 
    def train_model(self, data_path: str = None) -> Tuple[bool, str]:
        """Main training function with comprehensive pipeline"""
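The metadata write follows the same log-and-continue policy as the other saves. A minimal sketch with illustrative metadata values:

    import json
    from pathlib import Path

    metadata = {"model_name": "example_model", "version": 1}  # illustrative
    try:
        with open(Path("/tmp/metadata.json"), "w") as f:
            json.dump(metadata, f, indent=2)
    except OSError as e:
        print(f"Could not save metadata: {e}")  # non-fatal: log and continue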
@@ -634,16 +706,34 @@ class RobustModelTrainer:
            X = df['text'].values
            y = df['label'].values
 
-            # Train-test split
+            # Train-test split with smart handling for small datasets
            self.progress_tracker.update("Splitting data")
+
+            # Ensure minimum test size for very small datasets
+            if len(X) < 10:
+                test_size = max(0.1, 1/len(X))  # At least 1 sample for test
+            else:
+                test_size = self.test_size
+
+            # Check if stratification is possible
+            label_counts = pd.Series(y).value_counts()
+            min_class_count = label_counts.min()
+            can_stratify = min_class_count >= 2 and len(y) >= 4
+
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
-                test_size=
-                stratify=y if
+                test_size=test_size,
+                stratify=y if can_stratify else None,
                random_state=self.random_state
            )
 
            logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
+
+            # Additional validation for very small datasets
+            if len(X_train) < 3:
+                logger.warning(f"Very small training set: {len(X_train)} samples. Results may be unreliable.")
+            if len(X_test) < 1:
+                return False, "Cannot create test set. Dataset too small."
 
            # Train and evaluate models
            results = self.train_and_evaluate_models(
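Finally, the stratification guard: scikit-learn refuses to stratify a split when any class has fewer than two members, which is exactly what can_stratify screens for. A worked example with a toy label vector:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    y = [0, 0, 1, 1, 1]                  # class 0 twice, class 1 three times
    min_class_count = pd.Series(y).value_counts().min()
    can_stratify = min_class_count >= 2 and len(y) >= 4  # True here

    X_train, X_test, y_train, y_test = train_test_split(
        list(range(len(y))), y,
        test_size=0.4,                   # 2 of 5 samples held out
        stratify=y if can_stratify else None,
        random_state=42,
    )
    # With y = [0, 1, 1] instead, min_class_count would be 1 and the
    # split would silently fall back to an unstratified shuffle.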