# utils.py — PII masking and email-classification helpers.
# History note (commit b20f676, siddharth786): "Fix circular import and NameError issues".
import re
import spacy
from typing import List, Dict, Tuple, Optional, Union
import pickle
from pathlib import Path
import os
# --- Define/Import Pipeline Type FIRST ---
# `Pipeline` must be bound before the `Optional[Pipeline]` annotations below;
# fall back to `object` so this module still imports without scikit-learn.
try:
    from sklearn.pipeline import Pipeline
except ImportError:
    Pipeline = object  # type: ignore
# --- Import from models.py ---
# Guarded import: per the commit note this avoids a hard crash from a
# circular import / missing models.py. On failure, a stub keeps the name
# defined and returns a sentinel string instead of raising.
try:
    from models import predict_category  # Keep this import
    print("Successfully imported predict_category from models.py")
except ImportError as e:
    print(f"ERROR in utils.py: Could not import predict_category from models.py. Details: {e}")
    def predict_category(text, pipeline): return "Classification failed"
# --- Model Loading ---
MODEL_DIR = Path("saved_models")  # directory containing trained artifacts
MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"  # pickled classifier pipeline
# Lazily populated caches, filled by load_spacy_model() / load_model_pipeline().
NLP_MODEL: Optional[spacy.language.Language] = None
MODEL_PIPELINE: Optional[Pipeline] = None  # Now Pipeline is defined
def load_spacy_model() -> Optional[spacy.language.Language]:
    """Load and cache the spaCy 'en_core_web_sm' model.

    Returns the cached model when available. Otherwise tries to load it,
    and on an OSError attempts a one-time download before retrying.

    Returns:
        The loaded spaCy Language object, or None if loading (and the
        download fallback) failed.
    """
    global NLP_MODEL
    # Guard clause: reuse the already-loaded model.
    if NLP_MODEL is not None:
        return NLP_MODEL
    try:
        NLP_MODEL = spacy.load("en_core_web_sm")
        print("spaCy model 'en_core_web_sm' loaded successfully.")
    except OSError:
        # Model package not installed; try downloading it (may fail in
        # restricted environments).
        print("Error loading spaCy model 'en_core_web_sm'. Make sure it's downloaded.")
        try:
            print("Attempting to download spaCy model...")
            spacy.cli.download("en_core_web_sm")
            NLP_MODEL = spacy.load("en_core_web_sm")
            print("spaCy model 'en_core_web_sm' downloaded and loaded successfully.")
        except Exception as download_e:
            print(f"Failed to download or load spaCy model: {download_e}")
            NLP_MODEL = None  # Ensure it remains None if loading fails
    return NLP_MODEL
def load_model_pipeline() -> Optional[Pipeline]:  # Now Pipeline is defined
    """Load and cache the classification pipeline pickled at MODEL_PATH.

    Returns:
        The unpickled pipeline, or None if the file is missing or
        unpickling fails.
    """
    global MODEL_PIPELINE
    # Guard clause: serve the cached pipeline when already loaded.
    if MODEL_PIPELINE is not None:
        return MODEL_PIPELINE
    if not MODEL_PATH.exists():
        print(f"Model pipeline not found at {MODEL_PATH}. Please train and save the model pipeline first.")
        return None
    try:
        # NOTE(review): pickle.load runs arbitrary code — only load trusted files.
        with MODEL_PATH.open("rb") as f:
            MODEL_PIPELINE = pickle.load(f)
        print("Model pipeline loaded successfully.")
    except Exception as e:
        print(f"Error loading model pipeline from {MODEL_PATH}: {e}")
        MODEL_PIPELINE = None  # Ensure it remains None if loading fails
    return MODEL_PIPELINE
# --- PII Detection Regex Patterns ---
# Regex patterns for PII entities not easily caught by NER.
# (Refine these patterns carefully for accuracy.)
# Fix: the email TLD class was `[A-Z|a-z]`, which treats `|` as a literal
# character inside the class and so accepted TLDs like "a|b". Use [A-Za-z].
REGEX_PATTERNS = {
    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    "phone_number": r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
    "credit_debit_no": r'\b(?:\d[ -]*?){13,16}\b',  # Basic pattern, needs refinement
    "cvv_no": r'\b\d{3,4}\b',  # Often needs context to differentiate
    "expiry_no": r'\b(0[1-9]|1[0-2])\/?([0-9]{4}|[0-9]{2})\b',  # MM/YY or MM/YYYY
    "aadhar_num": r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b',
    # DOB might be harder with regex alone, consider context or NER patterns
    "dob": r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b'  # Basic DOB patterns
}
# --- PII Masking Function (Defined within utils.py) ---
def mask_pii(text: str, nlp: spacy.language.Language) -> Tuple[str, List[Dict]]:
    """
    Detect and mask PII in the input text using spaCy NER and Regex.

    Args:
        text: The input email body string.
        nlp: The spaCy language model.

    Returns:
        A tuple containing:
        - masked_email (str): The email body with PII replaced by placeholders.
        - list_of_masked_entities (List[Dict]): One dict per masked entity
          with 'position' (original indices), 'classification', and 'entity'.
    """
    # Collected spans as (start, end, entity_type, original_value),
    # all in ORIGINAL string coordinates.
    spans = []

    # 1. spaCy NER: multi-word PERSON entities become "full_name".
    for ent in nlp(text).ents:
        if ent.label_ == "PERSON" and len(ent.text.split()) > 1:
            spans.append((ent.start_char, ent.end_char, "full_name", ent.text))

    # 2. Regex-based PII, skipping any match that overlaps an earlier span.
    for pii_type, pattern in REGEX_PATTERNS.items():
        for m in re.finditer(pattern, text):
            overlaps = any(
                s < m.end() and m.start() < e
                for s, e, _, _ in spans
            )
            if overlaps:
                continue
            # Add basic context checks here if needed (e.g., for CVV).
            spans.append((m.start(), m.end(), pii_type, m.group(0)))

    # 3. Sort by start so masking proceeds left to right.
    spans.sort(key=lambda s: s[0])

    # 4. Rebuild the text from untouched segments plus placeholders,
    #    recording each entity with its original position.
    pieces = []
    entities = []
    cursor = 0
    for start, end, pii_type, value in spans:
        pieces.append(text[cursor:start])
        pieces.append(f"[{pii_type}]")
        cursor = end
        entities.append({
            "position": [start, end],  # Use ORIGINAL indices
            "classification": pii_type,
            "entity": value,
        })
    pieces.append(text[cursor:])

    # Keep the entity list ordered by original start position.
    entities.sort(key=lambda item: item["position"][0])
    return "".join(pieces), entities
# --- Main Processing Function (Defined within utils.py) ---
def process_email_request(email_body: str) -> dict:
    """
    Process the input email body: mask PII, then classify the masked text.

    Loads the spaCy model and the classification pipeline lazily on first
    call. Returns a response dict on success, or a dict with an "error"
    key when a model is unavailable or processing fails.
    """
    print("Processing email request...")  # Add log

    nlp = load_spacy_model()
    pipeline = load_model_pipeline()

    # Bail out early with an error payload when either model is missing.
    if nlp is None:
        return {"error": "spaCy model not loaded.", "input_email_body": email_body}
    if pipeline is None:
        return {"error": "Classification pipeline not loaded.", "input_email_body": email_body}

    try:
        # 1. Mask PII using the loaded spaCy model.
        masked_body, masked_entities = mask_pii(email_body, nlp)
        print(f"PII Masking complete. Found {len(masked_entities)} entities.")  # Add log

        # 2. Classify the masked email using the loaded pipeline.
        category = predict_category(masked_body, pipeline)
        print(f"Classification complete. Predicted class: {category}")  # Add log

        # 3. Construct the response dictionary.
        result = {
            "input_email_body": email_body,
            "list_of_masked_entities": masked_entities,
            "masked_email": masked_body,
            "category_of_the_email": category,
        }
        print("Response constructed successfully.")  # Add log
        return result
    except Exception as e:
        print(f"Error during email processing: {e}")  # Log the specific error
        # Consider logging the full traceback here for debugging.
        return {
            "error": f"An error occurred during processing: {str(e)}",
            "input_email_body": email_body,
        }