Spaces:

siddharth786
/

email-pii-classifier-v2

Runtime error

File size: 8,398 Bytes

c2d8eab
 
b20f676
b2c11bb
 
b20f676
c2d8eab
b20f676
c2d8eab
b20f676
 
 
 
 
 
 
 
0897779
b20f676
b2c11bb
 
 
 
 
b20f676
 
b2c11bb
b20f676
b2c11bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b20f676
b2c11bb
 
 
 
 
 
 
 
 
 
 
 
 
 
c2d8eab
 
 
 
 
 
 
b2c11bb
 
 
c2d8eab
 
b2c11bb
c2d8eab
 
b20f676
 
c2d8eab
 
 
 
 
b20f676
c2d8eab
 
 
 
 
 
 
 
 
b2c11bb
c2d8eab
 
 
 
 
 
b2c11bb
 
c2d8eab
 
 
 
 
 
 
 
 
 
b2c11bb
 
 
c2d8eab
 
 
 
 
b2c11bb
c2d8eab
 
 
 
 
 
 
 
 
 
 
 
 
b2c11bb
c2d8eab
 
 
 
 
 
 
 
 
b20f676
b2c11bb

import re
import spacy
from typing import List, Dict, Tuple, Optional, Union
import pickle
from pathlib import Path
import os

# --- Resolve the Pipeline type before any annotation refers to it ---
# scikit-learn is treated as optional at import time: when it is missing,
# `Pipeline` degrades to plain `object` so the annotations further down
# still evaluate and the module can be imported.
try:
    from sklearn.pipeline import Pipeline as _SklearnPipeline
except ImportError:
    Pipeline = object  # type: ignore
else:
    Pipeline = _SklearnPipeline

# --- Bring in predict_category from models.py (with a safe fallback) ---
try:
    from models import predict_category  # Keep this import
except ImportError as import_error:
    print(f"ERROR in utils.py: Could not import predict_category from models.py. Details: {import_error}")

    def predict_category(text, pipeline):
        """Stand-in used when models.py is unavailable; always reports failure."""
        return "Classification failed"
else:
    print("Successfully imported predict_category from models.py")

# --- Model locations and lazily populated caches ---
# Directory holding serialized artifacts, and the pickled classifier inside it.
MODEL_DIR = Path("saved_models")
MODEL_PATH = MODEL_DIR.joinpath("email_classifier_pipeline.pkl")

# Both caches start empty; load_spacy_model() / load_model_pipeline() fill
# them on first use and hand back the cached object afterwards.
NLP_MODEL: Optional[spacy.language.Language] = None
MODEL_PIPELINE: Optional[Pipeline] = None  # Pipeline is resolved above

def load_spacy_model() -> Optional[spacy.language.Language]:
    """Return the cached spaCy ``en_core_web_sm`` model, loading it on first use.

    The first call tries ``spacy.load``. If the model package is not installed
    (``OSError``), a one-off download is attempted and the load is retried.
    Any failure of that fallback is reported on stdout and leaves the cache
    empty, so a later call will try again.

    Returns:
        The loaded spaCy language model, or ``None`` if it could not be
        loaded or downloaded.
    """
    global NLP_MODEL
    if NLP_MODEL is not None:
        return NLP_MODEL

    try:
        NLP_MODEL = spacy.load("en_core_web_sm")
        print("spaCy model 'en_core_web_sm' loaded successfully.")
    except OSError:
        print("Error loading spaCy model 'en_core_web_sm'. Make sure it's downloaded.")
        # Attempt to download if not found (might fail in restricted envs)
        try:
            print("Attempting to download spaCy model...")
            # Imported explicitly (and under an alias, so the module-level
            # `spacy` name is not shadowed) instead of relying on
            # `import spacy` having loaded the `spacy.cli` submodule.
            from spacy.cli import download as download_spacy_model

            download_spacy_model("en_core_web_sm")
            NLP_MODEL = spacy.load("en_core_web_sm")
            print("spaCy model 'en_core_web_sm' downloaded and loaded successfully.")
        except (Exception, SystemExit) as download_e:
            # spaCy's CLI helpers can terminate through sys.exit() when the
            # underlying install command fails. SystemExit is not an
            # Exception subclass, so it is caught explicitly to keep this
            # fallback best-effort instead of killing the whole process.
            print(f"Failed to download or load spaCy model: {download_e}")
            NLP_MODEL = None  # Ensure it remains None if loading fails
    return NLP_MODEL

def load_model_pipeline() -> Optional[Pipeline]:  # Now Pipeline is defined
    """Return the cached classification pipeline, unpickling it on first use.

    Returns:
        The object stored in ``MODEL_PATH``, or ``None`` when the file is
        missing or cannot be unpickled (the reason is printed to stdout).
    """
    global MODEL_PIPELINE

    # Already loaded on an earlier call: reuse it.
    if MODEL_PIPELINE is not None:
        return MODEL_PIPELINE

    if not MODEL_PATH.exists():
        print(f"Model pipeline not found at {MODEL_PATH}. Please train and save the model pipeline first.")
        return None

    try:
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file, so MODEL_PATH must only ever point at a trusted artifact.
        with MODEL_PATH.open("rb") as model_file:
            MODEL_PIPELINE = pickle.load(model_file)
        print("Model pipeline loaded successfully.")
    except Exception as load_error:
        print(f"Error loading model pipeline from {MODEL_PATH}: {load_error}")
        MODEL_PIPELINE = None  # Ensure it remains None if loading fails
    return MODEL_PIPELINE

# --- PII Detection Regex Patterns ---
# Regex patterns for PII entities that NER does not catch reliably.
# Keys are the entity labels used in placeholders and in the reported
# entity list; mask_pii() iterates the patterns in insertion order.
# These are deliberately simple and favour recall over precision
# (refine them carefully for accuracy).
REGEX_PATTERNS = {
    # The TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a
    # literal '|' inside the top-level domain.
    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    # Optional 1-3 digit country prefix followed by a 3-3-4 digit number.
    "phone_number": r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
    # 13-16 digits, optionally separated by spaces or hyphens.
    "credit_debit_no": r'\b(?:\d[ -]*?){13,16}\b',  # Basic pattern, needs refinement
    # Matches ANY standalone 3-4 digit number, not only CVVs.
    "cvv_no": r'\b\d{3,4}\b',  # Often needs context to differentiate
    "expiry_no": r'\b(0[1-9]|1[0-2])\/?([0-9]{4}|[0-9]{2})\b',  # MM/YY or MM/YYYY
    # Twelve digits in groups of four, optionally separated by space or hyphen.
    "aadhar_num": r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b',
    # DOB might be harder with regex alone, consider context or NER patterns
    "dob": r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',  # Basic DOB patterns
}

# --- PII Masking Function (Defined within utils.py) ---
def mask_pii(text: str, nlp: spacy.language.Language) -> Tuple[str, List[Dict]]:
    """
    Detects and masks PII in the input text using spaCy NER and Regex.

    Candidate spans come from two sources: multi-word spaCy ``PERSON``
    entities (reported as ``full_name``) and every match of every pattern in
    ``REGEX_PATTERNS``. When candidates overlap, the longest span is kept;
    equally long spans are resolved in discovery order (NER first, then the
    regex patterns in dictionary order). Preferring the longest span stops a
    short generic match (e.g. the 4-digit year of a date matching ``cvv_no``)
    from blocking the longer match that covers the whole value, which would
    otherwise leave part of the PII unmasked and mislabelled.

    Args:
        text: The input email body string.
        nlp: The spaCy language model.

    Returns:
        A tuple containing:
        - masked_email (str): The email body with PII replaced by
          ``[entity_type]`` placeholders.
        - list_of_masked_entities (List[Dict]): One dictionary per masked
          entity, ordered by start position, with keys ``position``
          (``[start, end]`` offsets into the ORIGINAL text),
          ``classification`` and ``entity`` (the original value).
    """
    # Each candidate is (start, end, entity_type, original_value).
    candidates = []

    # 1. Use spaCy for Named Entity Recognition (PERSON for full_name)
    doc = nlp(text)
    for ent in doc.ents:
        # Only multi-word PERSON entities are treated as full names.
        if ent.label_ == "PERSON" and len(ent.text.split()) > 1:
            candidates.append((ent.start_char, ent.end_char, "full_name", ent.text))

    # 2. Use Regex for other PII types
    for entity_type, pattern in REGEX_PATTERNS.items():
        for match in re.finditer(pattern, text):
            # A zero-width match would insert a placeholder without masking
            # anything, so it is ignored.
            if match.end() > match.start():
                candidates.append((match.start(), match.end(), entity_type, match.group(0)))

    # 3. Resolve overlaps: longest span first. sorted() is stable, so spans of
    #    equal length keep their discovery order.
    selected = []
    for start, end, entity_type, original_value in sorted(
        candidates, key=lambda span: span[0] - span[1]
    ):
        if all(end <= kept[0] or start >= kept[1] for kept in selected):
            selected.append((start, end, entity_type, original_value))

    # 4. Walk the surviving spans left to right, stitching the untouched text
    #    and the placeholders together in one pass.
    selected.sort(key=lambda span: span[0])
    pieces = []
    list_of_masked_entities = []
    cursor = 0
    for start, end, entity_type, original_value in selected:
        pieces.append(text[cursor:start])
        pieces.append(f"[{entity_type}]")
        cursor = end
        list_of_masked_entities.append({
            "position": [start, end],  # Use ORIGINAL indices
            "classification": entity_type,
            "entity": original_value,
        })
    pieces.append(text[cursor:])

    return "".join(pieces), list_of_masked_entities

# --- Main Processing Function (Defined within utils.py) ---
def process_email_request(email_body: str) -> dict:
    """
    Mask PII in an email body and classify the masked text.

    Both models are obtained through their loader functions, which load them
    on the first call and reuse them afterwards.

    Args:
        email_body: The raw email text to process.

    Returns:
        On success, a dict with ``input_email_body``,
        ``list_of_masked_entities``, ``masked_email`` and
        ``category_of_the_email``. If a model is unavailable or processing
        raises, a dict with ``error`` and ``input_email_body`` instead.
    """
    print("Processing email request...")  # Add log
    spacy_model = load_spacy_model()
    classifier = load_model_pipeline()

    # Both loaders have run by now; report the first missing resource,
    # checking spaCy before the classifier.
    required = (
        (spacy_model, "spaCy model not loaded."),
        (classifier, "Classification pipeline not loaded."),
    )
    for resource, message in required:
        if resource is None:
            return {"error": message, "input_email_body": email_body}

    try:
        # Step 1: replace PII with placeholders and collect what was masked.
        masked_body, masked_entities = mask_pii(email_body, spacy_model)
        print(f"PII Masking complete. Found {len(masked_entities)} entities.")  # Add log

        # Step 2: classify the masked text, never the raw email.
        category = predict_category(masked_body, classifier)
        print(f"Classification complete. Predicted class: {category}")  # Add log

        # Step 3: assemble the response payload.
        result = {
            "input_email_body": email_body,
            "list_of_masked_entities": masked_entities,
            "masked_email": masked_body,
            "category_of_the_email": category,
        }
        print("Response constructed successfully.")  # Add log
        return result

    except Exception as error:
        # Top-level boundary: any failure becomes an error payload for the
        # caller instead of propagating.
        print(f"Error during email processing: {error}")  # Log the specific error
        return {
            "error": f"An error occurred during processing: {error!s}",
            "input_email_body": email_body,
        }