# NOTE(review): the three lines below are paste/extraction residue, not Python.
# Commented out so the file parses; original text preserved:
# Spaces:
# Runtime error
# Runtime error
# filepath: /workspaces/internship1/train.py | |
import pandas as pd | |
import joblib | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.pipeline import Pipeline | |
from pathlib import Path | |
# --- Local Imports ---
# utils.py is expected to provide clean_text_for_classification; when the
# import fails, a minimal stand-in is installed so the script can still run.
try:
    from utils import clean_text_for_classification
except ImportError:
    print("Error: Could not import clean_text_for_classification from utils.")
    print("Make sure utils.py exists and the function is defined.")

    def clean_text_for_classification(text: str) -> str:
        """Fallback cleaner: trim surrounding whitespace and lowercase."""
        trimmed = text.strip()
        return trimmed.lower()
# --- Configuration ---
# !! ADJUST THESE PATHS AND COLUMN NAMES !!
# Input CSV; path is relative to the current working directory.
DATASET_PATH = Path("combined_emails_with_natural_pii.csv")
# Where the fitted sklearn pipeline is persisted (dir created if missing).
MODEL_DIR = Path("saved_models")
MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
email_body_column = 'email'  # CSV column holding the raw email text
category_column = 'type'  # CSV column holding the target category label
# --- Main Training Function ---
def train_model(data_path: Path, model_save_path: Path,
                body_column=None, label_column=None):
    """Load data, train a TF-IDF + Multinomial Naive Bayes pipeline, save it.

    Parameters
    ----------
    data_path : Path
        CSV file with one email per row.
    model_save_path : Path
        Destination ``.pkl`` file for the fitted pipeline.
    body_column : str, optional
        Column containing the email text. Defaults to the module-level
        ``email_body_column``.
    label_column : str, optional
        Column containing the category label. Defaults to the module-level
        ``category_column``.

    Returns
    -------
    Pipeline or None
        The fitted pipeline on success, ``None`` if any step failed
        (a diagnostic message is printed on every failure path).
    """
    # Resolve column names lazily so the module-level defaults remain the
    # single source of truth while callers may override them per call.
    body_column = email_body_column if body_column is None else body_column
    label_column = category_column if label_column is None else label_column

    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return None

    print(f"Loading dataset from {data_path}...")
    try:
        # engine='python' + on_bad_lines='skip' tolerates malformed rows.
        df = pd.read_csv(data_path, engine='python', on_bad_lines='skip')
        # Fixed: this message had a pointless f-string prefix (no placeholders).
        print("Dataset loaded. Note: Bad lines may have been skipped.")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None

    # --- Data Validation ---
    if body_column not in df.columns:
        print(f"Error: Email body column '{body_column}' not found in the dataset.")
        print(f"Available columns: {df.columns.tolist()}")
        return None
    if label_column not in df.columns:
        print(f"Error: Category column '{label_column}' not found in the dataset.")
        print(f"Available columns: {df.columns.tolist()}")
        return None

    # Drop rows missing either the text or the label. Reassignment instead of
    # inplace=True: same result, avoids the discouraged in-place mutation.
    df = df.dropna(subset=[body_column, label_column])
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return None

    print("Applying text cleaning...")
    try:
        df['cleaned_text'] = df[body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return None

    print("Splitting data...")
    X = df['cleaned_text']
    y = df[label_column]
    # stratify=y keeps the train/test class balance identical to the corpus.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # --- Model Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB())  # simple, strong baseline for text
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return None

    # --- Evaluation (non-fatal: a scoring error should not lose the model) ---
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    model_save_path.parent.mkdir(parents=True, exist_ok=True)  # ensure dir exists
    try:
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")

    # Return the fitted pipeline so callers can use it without reloading.
    return pipeline
# --- Script Execution ---
def main() -> None:
    """Entry point: ensure the output directory exists, then train."""
    # train_model also creates the parent directory before saving; this
    # keeps MODEL_DIR available for any other use of the path.
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    train_model(DATASET_PATH, MODEL_PATH)


if __name__ == "__main__":
    main()