File size: 4,245 Bytes
0d541e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# filepath: /workspaces/internship1/train.py
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from pathlib import Path

# --- Local Imports ---
# Ensure utils.py has the clean_text_for_classification function
try:
    from utils import clean_text_for_classification
except ImportError:
    print("Error: Could not import clean_text_for_classification from utils.")
    print("Make sure utils.py exists and the function is defined.")

    # Last-resort stand-in so the script can still run end-to-end; the real
    # fix is restoring utils.py with the full cleaning implementation.
    def clean_text_for_classification(text: str) -> str:
        """Minimal fallback cleaner: trim surrounding whitespace and lowercase."""
        return text.strip().lower()

# --- Configuration ---
# !! ADJUST THESE PATHS AND COLUMN NAMES !!
# Input CSV, resolved relative to the current working directory.
DATASET_PATH = Path("combined_emails_with_natural_pii.csv")
# Directory and full path where the fitted pipeline is persisted via joblib.
MODEL_DIR = Path("saved_models")
MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
# CSV column names: raw email body text and its target category label.
email_body_column = 'email'      # <<< Ensure this is 'email'
category_column = 'type'         # <<< Ensure this is 'type'

# --- Main Training Function ---
def _load_dataset(data_path: Path):
    """Load the CSV at *data_path*; return a DataFrame, or None on any failure.

    Failures (missing file, unparseable CSV) are reported to stdout rather
    than raised, matching the script's print-and-return error style.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return None

    print(f"Loading dataset from {data_path}...")
    try:
        # engine='python' + on_bad_lines='skip' tolerates malformed rows
        # in the exported CSV instead of aborting the whole load.
        df = pd.read_csv(data_path, engine='python', on_bad_lines='skip')
        print("Dataset loaded. Note: Bad lines may have been skipped.")
        return df
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None


def _has_required_columns(df) -> bool:
    """Return True if the configured text/label columns exist in *df*."""
    if email_body_column not in df.columns:
        print(f"Error: Email body column '{email_body_column}' not found in the dataset.")
        print(f"Available columns: {df.columns.tolist()}")
        return False
    if category_column not in df.columns:
        print(f"Error: Category column '{category_column}' not found in the dataset.")
        print(f"Available columns: {df.columns.tolist()}")
        return False
    return True


def _build_pipeline() -> Pipeline:
    """TF-IDF features feeding a Multinomial Naive Bayes classifier."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB())  # Using Naive Bayes as a starting point
    ])


def _save_pipeline(pipeline, model_save_path: Path):
    """Persist the fitted *pipeline* to *model_save_path* with joblib."""
    print(f"Saving model pipeline to {model_save_path}...")
    model_save_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    try:
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")


def train_model(data_path: Path, model_save_path: Path):
    """Loads data, trains the model pipeline, and saves it.

    Args:
        data_path: CSV file containing the email text and category columns.
        model_save_path: Destination for the serialized sklearn Pipeline.

    All errors are reported via print and abort the run early; nothing is
    raised to the caller.
    """
    df = _load_dataset(data_path)
    if df is None:
        return
    if not _has_required_columns(df):
        return

    # Drop rows missing either the text or the label; only complete rows train.
    # (Assignment instead of inplace=True — same result, pandas-recommended.)
    df = df.dropna(subset=[email_body_column, category_column])
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        df['cleaned_text'] = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return

    print("Splitting data...")
    X = df['cleaned_text']
    y = df[category_column]
    # stratify=y keeps class proportions the same in the train and test splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipeline = _build_pipeline()

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # Evaluation failure is non-fatal: the trained model is still saved below.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    _save_pipeline(pipeline, model_save_path)


# --- Script Execution ---
def _main() -> None:
    """CLI entry point: ensure the model directory exists, then train."""
    # MODEL_DIR is created up front in case it is needed before saving;
    # train_model also creates the save path's parent directory itself.
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    train_model(DATASET_PATH, MODEL_PATH)


if __name__ == "__main__":
    _main()