Smart-inbox / key_info.py
imenayadi's picture
Gradio
5207833
raw
history blame
1.09 kB
import spacy
from transformers import pipeline
import re
from dateutil.parser import parse
# Matches English dates written as "<Month> <day>[st|nd|rd|th], <year>",
# e.g. "Jan 5th, 2023" or "September 12, 2024". The comma is required.
# Compiled once at import time instead of on every call.
_DATE_PATTERN = re.compile(
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:th|st|nd|rd)?,\s+\d{4}\b'
)


def extract_entities(email_text, nlp, ner_pipeline, *, min_score=0.75):
    """Extract named entities and dates from the text of an email.

    Args:
        email_text: The text to analyse.
        nlp: Callable invoked as ``nlp(email_text)``; the result must expose
            ``.ents``, an iterable of objects with ``.text`` and ``.label_``
            (presumably a loaded spaCy pipeline -- confirm against caller).
        ner_pipeline: Callable invoked as ``ner_pipeline(email_text)``; must
            return an iterable of mappings with ``'word'``, ``'score'`` and
            either ``'entity'`` or ``'entity_group'`` (presumably a Hugging
            Face token-classification pipeline -- confirm against caller).
        min_score: Only predictions from ``ner_pipeline`` whose score is
            strictly greater than this value are kept.

    Returns:
        A dict with three keys:
        ``"spaCy Entities"``: list of ``{"Text", "Type"}`` dicts;
        ``"Transformer Entities"``: list of ``{"Text", "Type", "Score"}``
        dicts; ``"Dates"``: list of ``YYYY-MM-DD`` strings for every date
        matched by the regex that could be parsed.
    """
    # First pass: entities reported by the `nlp` callable.
    doc = nlp(email_text)
    spacy_entities = [
        {"Text": ent.text, "Type": ent.label_} for ent in doc.ents
    ]

    # Second pass: model predictions, filtered by confidence.
    transformer_entities = []
    for ent in ner_pipeline(email_text):
        if ent['score'] > min_score:
            # Aggregated pipeline output uses 'entity_group' in place of
            # 'entity'; accept either so both configurations work.
            label = ent['entity'] if 'entity' in ent else ent['entity_group']
            transformer_entities.append(
                {"Text": ent['word'], "Type": label, "Score": ent['score']}
            )

    # Third pass: regex-matched dates, normalised to ISO format.
    dates = []
    for candidate in _DATE_PATTERN.findall(email_text):
        try:
            parsed = parse(candidate)
        except (ValueError, OverflowError):
            # The regex checks only the shape, so strings such as
            # "February 30, 2023" can match; skip them rather than fail
            # the whole extraction.
            continue
        dates.append(parsed.strftime('%Y-%m-%d'))

    return {
        "spaCy Entities": spacy_entities,
        "Transformer Entities": transformer_entities,
        "Dates": dates,
    }