# Copied from a Hugging Face Spaces file view (Space status: "Runtime error").
# File size: 4,245 bytes; commit 0d541e6.
# filepath: /workspaces/internship1/train.py
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from pathlib import Path
# --- Local Imports ---
# Ensure utils.py has the clean_text_for_classification function
try:
    from utils import clean_text_for_classification
except ImportError as exc:
    print("Error: Could not import clean_text_for_classification from utils.")
    print("Make sure utils.py exists and the function is defined.")
    # A missing utils.py and a utils.py without the function both end up
    # here; printing the exception text tells the two cases apart.
    print(f"Import failure details: {exc}")

    # Basic stand-in so the script can still run for testing. The import
    # above should be fixed rather than relying on this.
    def clean_text_for_classification(text: str) -> str:
        """Return ``text`` lowercased with surrounding whitespace removed."""
        return text.lower().strip()
# --- Configuration ---
# !! ADJUST THESE PATHS AND COLUMN NAMES !!
# Input CSV; a relative path, so it is resolved against the working directory.
DATASET_PATH = Path("combined_emails_with_natural_pii.csv")

# Names of the dataset columns read during training: the text to classify
# and the label to predict. Adjust these if the CSV uses different headers.
email_body_column = "email"
category_column = "type"

# Output directory and file for the fitted pipeline.
MODEL_DIR = Path("saved_models")
MODEL_PATH = MODEL_DIR.joinpath("email_classifier_pipeline.pkl")
# --- Main Training Function ---
def train_model(data_path: Path, model_save_path: Path):
    """Load the dataset, train the classification pipeline, and save it.

    Reads the CSV at ``data_path``, drops rows whose email body or category
    is missing, cleans the text with ``clean_text_for_classification``, fits
    a TF-IDF + Multinomial Naive Bayes pipeline on a stratified 80/20 split,
    prints the accuracy on the held-out 20%, and writes the fitted pipeline
    to ``model_save_path`` with joblib.

    A failure in any stage is reported on stdout and ends the run early. The
    one exception is evaluation: a failure there is reported, but the fitted
    model is still saved.

    Args:
        data_path: CSV file containing the columns named by the module-level
            ``email_body_column`` and ``category_column`` settings.
        model_save_path: Destination file for the serialized pipeline. Its
            parent directory is created if it does not exist.

    Returns:
        None.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        # on_bad_lines='skip' drops malformed rows instead of aborting the load.
        df = pd.read_csv(data_path, engine='python', on_bad_lines='skip')
        print("Dataset loaded. Note: Bad lines may have been skipped.")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # --- Data Validation ---
    required_columns = (
        ("Email body", email_body_column),
        ("Category", category_column),
    )
    for label, column in required_columns:
        if column not in df.columns:
            print(f"Error: {label} column '{column}' not found in the dataset.")
            print(f"Available columns: {df.columns.tolist()}")
            return

    # Rows missing either the text or the label cannot be used for training.
    df = df.dropna(subset=[email_body_column, category_column])
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        df['cleaned_text'] = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return

    print("Splitting data...")
    X = df['cleaned_text']
    y = df[category_column]
    try:
        # Stratify so both splits keep the class proportions of y.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError as e:
        # Stratified splitting rejects classes with fewer than two rows and
        # datasets too small to fill both splits; report it like the other
        # stages instead of letting the traceback escape.
        print(f"Error splitting data: {e}")
        return

    # --- Model Pipeline ---
    pipeline = Pipeline([
        # Ignore terms found in more than 95% of documents or in fewer than 2.
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB()),  # Using Naive Bayes as a starting point
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # A failed evaluation is reported but does not block saving the model.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    try:
        # Directory creation sits inside the try so that a permissions or
        # path error is reported the same way as a failed dump.
        model_save_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")
# --- Script Execution ---
if __name__ == "__main__":
    # Create the model directory up front so it exists even if training
    # stops before the save step (other code may expect the directory).
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    train_model(DATASET_PATH, MODEL_PATH)