Spaces:
Runtime error
Runtime error
File size: 9,291 Bytes
c2d8eab ce67727 0d541e6 b3fad56 c2d8eab ce67727 0d541e6 c2d8eab 0d541e6 23e9906 0d541e6 ce67727 0d541e6 ce67727 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab b20f676 b3fad56 c2d8eab b3fad56 c2d8eab b3fad56 c2d8eab b3fad56 c2d8eab b3fad56 c2d8eab b3fad56 b20f676 b3fad56 b20f676 b3fad56 b20f676 b3fad56 c2d8eab b3fad56 b20f676 b3fad56 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 c2d8eab 0d541e6 74a755c b3fad56 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 |
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from typing import Tuple, Any, Optional, List, Dict
from pathlib import Path
import re
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import spacy
import pickle
# --- Constants ---
# Directory, relative to the current working directory, that holds saved models.
MODEL_DIR = Path("saved_models")
# File the classification pipeline is read from (joblib.load) and, when passed
# to train_model as the save path, written to (joblib.dump).
MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
# Runs at import time: make sure the model directory exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# --- FastAPI App ---
# Application object that the route decorators further down register handlers on.
app = FastAPI()
# --- Pydantic Models for Request/Response ---
class EmailInput(BaseModel):
    """Request body accepted by the /classify/ endpoint."""

    # Raw email text; it is PII-masked first and then classified.
    email_body: str
class MaskedEntity(BaseModel):
    """Description of a single entity that mask_pii replaced with a placeholder."""

    # [start, end] character offsets of the entity within the input text.
    position: List[int]
    # Lower-cased entity label (mask_pii emits "person" or "email").
    classification: str
    # The original text of the entity, before masking.
    entity: str
class ClassificationOutput(BaseModel):
    """Response body returned by the /classify/ endpoint."""

    # The email text exactly as it was submitted.
    input_email_body: str
    # One record per entity that was masked in the email.
    list_of_masked_entities: List[MaskedEntity]
    # The email text with masked entities replaced by placeholders.
    masked_email: str
    # Category predicted by the classification pipeline for the masked text.
    category_of_the_email: str
# --- Model Pipeline Handle ---
# Module-level slot for the trained classification pipeline. It is initialised
# to None here; nothing at import time loads the model, so it stays None until
# some other code assigns it. The /classify/ endpoint refuses requests with a
# 503 when no pipeline is available.
model_pipeline: Optional[Pipeline] = None
# --- Model Loading ---
def load_model_pipeline() -> Optional[Pipeline]:
    """Read the trained pipeline from MODEL_PATH.

    Returns:
        The object deserialized by joblib, or None when the file is missing
        or cannot be loaded. Problems are reported with print, never raised.
    """
    loaded = None

    # Guard clause: without a file there is nothing to deserialize.
    if not MODEL_PATH.exists():
        print(f"Model pipeline not found at {MODEL_PATH}.")
        print("Please train and save the model pipeline first.")
        return loaded

    # NOTE: joblib.load unpickles the file, so MODEL_PATH must be a trusted artifact.
    try:
        loaded = joblib.load(MODEL_PATH)
        print(f"Model pipeline loaded successfully from {MODEL_PATH}")
    except Exception as err:
        print(f"Error loading model pipeline from {MODEL_PATH}: {err}")
    return loaded
# --- Text Cleaning Function ---
def clean_text_for_classification(text: str) -> str:
    """Normalize raw text into the form the classifier is fed.

    The text is lower-cased, then three regex substitutions run in order:
    anything between '<' and '>' is dropped, every character that is neither
    a-z nor whitespace is dropped, and whitespace runs collapse to one space.
    Leading and trailing whitespace is stripped last.
    """
    # (pattern, replacement) pairs; the order matters because tag removal
    # relies on '<' and '>' still being present.
    substitutions = (
        (r'<.*?>', ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
# --- Mask PII Function ---
def mask_pii(
    text: str,
    nlp_model: "spacy.language.Language",
) -> Tuple[str, List[Dict[str, Any]]]:
    """Find PII entities with the given spaCy model and mask them in the text.

    Only entities labelled "PERSON" or "EMAIL" are masked. Each one is
    replaced at its own character span, so text outside an entity is never
    altered even when it contains the same substring (masking the name "Al"
    must not rewrite the word "Also").

    Args:
        text: The input email body.
        nlp_model: Callable that turns text into a document exposing ``ents``;
            every entity must provide ``label_``, ``text``, ``start_char``
            and ``end_char``.

    Returns:
        A tuple of:
        - the email body with each masked entity replaced by its lower-cased
          label in square brackets (e.g. "[person]", "[email]");
        - a list of dicts, one per masked entity, of the form
          {"position": [start, end], "classification": label, "entity": text},
          where the offsets refer to the original, unmasked text.
    """
    pii_labels = ("PERSON", "EMAIL")

    # Log sizes only: echoing the raw text or the entities would leak the very
    # PII this function exists to hide.
    print(f"Executing mask_pii for text of length {len(text)}")

    doc = nlp_model(text)
    entities: List[Dict[str, Any]] = []
    pieces: List[str] = []
    cursor = 0  # offset in `text` up to which output has been emitted

    # Relies on doc.ents being in document order and non-overlapping.
    for ent in doc.ents:
        if ent.label_ not in pii_labels:
            continue
        label = ent.label_.lower()
        entities.append(
            {
                "position": [ent.start_char, ent.end_char],
                "classification": label,
                "entity": ent.text,
            }
        )
        # Copy the untouched text before the entity, then the placeholder.
        pieces.append(text[cursor:ent.start_char])
        pieces.append(f"[{label}]")
        cursor = ent.end_char

    pieces.append(text[cursor:])
    masked_text = "".join(pieces)

    print(f"mask_pii result - masked {len(entities)} entities")
    return masked_text, entities
# --- Prediction Function ---
def predict_category(text: str, pipeline: Pipeline) -> str:
    """Predict the category of an email with the given classification pipeline.

    The text is passed through clean_text_for_classification before it is
    handed to ``pipeline.predict``.

    Args:
        text: Email text (the API passes the PII-masked body).
        pipeline: Object with a ``predict`` method that accepts a list of
            strings and returns a sequence of labels.

    Returns:
        The first predicted label converted to ``str``; "Prediction failed"
        when the pipeline returns an empty result; "Prediction Error" when
        cleaning or prediction raises.
    """
    print(f"Executing predict_category for text: '{text[:50]}...'")
    try:
        cleaned_text = clean_text_for_classification(text)
        print(f"Cleaned text for prediction: '{cleaned_text[:50]}...'")
        prediction = pipeline.predict([cleaned_text])
        # Check the length explicitly. The truth value of a NumPy array is
        # that of its single element (so a label such as 0 or "" would be
        # treated as "no prediction") and is an error for other sizes.
        if len(prediction) > 0:
            category = str(prediction[0])
        else:
            category = "Prediction failed"
    except Exception as e:
        # Best-effort boundary: report the problem and return a sentinel label
        # instead of propagating the failure to the caller.
        print(f"Error during prediction: {e}")
        category = "Prediction Error"
    print(f"predict_category result: {category}")
    return category
# --- Training Function ---
def train_model(data_path: Path, model_save_path: Path):
    """Train the email classification pipeline from a CSV file and save it.

    Steps: load the CSV, check that the 'body' and 'category' columns exist,
    drop rows missing either value, clean the text, make a stratified 80/20
    train/test split, fit a TF-IDF + Multinomial Naive Bayes pipeline, print
    its accuracy on the test split and write it with joblib.

    Failures are reported with ``print`` rather than raised. Every failure
    except an evaluation error stops the run before anything is saved.

    Args:
        data_path: CSV dataset containing 'body' and 'category' columns.
        model_save_path: File the fitted pipeline is written to; missing
            parent directories are created.

    Returns:
        None in every case.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        df = pd.read_csv(data_path)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # --- Data Validation ---
    email_body_column = 'body'  # Column name for email text in the CSV
    category_column = 'category'  # Column name for the category label in the CSV
    required_columns = (
        ("Email body", email_body_column),
        ("Category", category_column),
    )
    for description, column in required_columns:
        if column not in df.columns:
            print(f"Error: {description} column '{column}' not found in the dataset.")
            print(f"Available columns: {df.columns.tolist()}")
            return

    # Rows lacking either the text or the label cannot be used for training.
    df.dropna(subset=[email_body_column, category_column], inplace=True)
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        df['cleaned_text'] = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return

    print("Splitting data...")
    X = df['cleaned_text']
    y = df[category_column]
    # A stratified split raises ValueError when a class has too few samples
    # (or the dataset is too small to split); report it like every other
    # step instead of crashing with a traceback.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError as e:
        print(f"Error splitting data: {e}")
        return

    # --- Model Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB()),  # Using Naive Bayes as a starting point
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # A failed evaluation is only reported; the fitted model is still saved.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    model_save_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    try:
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")
# --- API Endpoints ---
@app.get("/")
def read_root():
    """Return a short status message that points clients at /classify/."""
    status_message = "Email Classification API is running. Use the /classify/ endpoint."
    return {"message": status_message}
# Name of the spaCy pipeline used to detect PII in incoming emails.
# NOTE(review): nothing in this file states which model was intended.
# "en_core_web_sm" is assumed here; mask_pii also looks for an "EMAIL" label,
# which may need a custom model or entity ruler - confirm before deploying.
_SPACY_MODEL_NAME = "en_core_web_sm"
_nlp_model: Optional[spacy.language.Language] = None


def _get_nlp_model() -> spacy.language.Language:
    """Return the spaCy model used for masking, loading it on first use."""
    global _nlp_model
    if _nlp_model is None:
        _nlp_model = spacy.load(_SPACY_MODEL_NAME)
    return _nlp_model


@app.post("/classify/", response_model=ClassificationOutput)
async def classify_email(email_input: EmailInput):
    """Mask PII in the submitted email and classify the masked text.

    Args:
        email_input: Request body carrying the raw email text.

    Returns:
        A ClassificationOutput with the original text, the masked entities,
        the masked text and the predicted category.

    Raises:
        HTTPException: 503 when the classification pipeline or the spaCy
            model cannot be loaded.
    """
    global model_pipeline

    # The module-level pipeline starts as None and nothing loads it at import
    # time, so load it on first use; otherwise the endpoint could never serve
    # a request when run under an ASGI server.
    if model_pipeline is None:
        model_pipeline = load_model_pipeline()
    if model_pipeline is None:
        raise HTTPException(status_code=503, detail="Model not loaded. API is not ready.")

    # mask_pii needs a spaCy model as its second argument.
    try:
        nlp_model = _get_nlp_model()
    except OSError as exc:  # spacy.load raises OSError when the model is missing
        raise HTTPException(
            status_code=503,
            detail="PII masking model not loaded. API is not ready.",
        ) from exc

    input_email = email_input.email_body

    # 1. Mask PII
    masked_text, masked_entities_list = mask_pii(input_email, nlp_model)
    # mask_pii returns plain dicts; turn them into response-model objects.
    formatted_entities = [MaskedEntity(**entity) for entity in masked_entities_list]

    # 2. Predict the category from the masked text
    predicted_category = predict_category(masked_text, model_pipeline)

    # 3. Construct and return the response
    return ClassificationOutput(
        input_email_body=input_email,
        list_of_masked_entities=formatted_entities,
        masked_email=masked_text,
        category_of_the_email=predicted_category,
    )
# Example Usage (if you run this file directly for testing/training)
if __name__ == "__main__":
    # Manual smoke test: load the saved pipeline and classify one sample email.
    print("Running models.py directly...")

    # Sample data kept for experimentation; the smoke test below does not read it.
    dummy_emails = [
        "Subject: Billing Issue My account [full_name] was charged twice for order [order_id]. Please refund.",
        "Subject: Help needed Cannot login. My email is [email]. Reset password link broken.",
        "Subject: Account Management Request to close my account [account_num]. User [full_name]."
    ]
    dummy_labels = ["Billing Issues", "Technical Support", "Account Management"]

    print("Attempting to load model and predict...")
    model_pipeline = load_model_pipeline()
    if not model_pipeline:
        print("Cannot perform prediction as model pipeline failed to load.")
    else:
        test_email = "my login is not working help required email [email]"
        category = predict_category(test_email, model_pipeline)
        print(f"Test Email: '{test_email}'")
        print(f"Predicted Category: {category}")
#hi i am siddharth
#hi i am siddharth |