# email-parser / app.py
# Author: Nikhil Singh — commit adc4ff3 ("more fixes")
import gradio as gr
from mailparser import parse_from_string
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
import spacy
import re
import os
import en_core_web_sm
# spaCy English pipeline used for sentence segmentation; loaded once at import time.
nlp = en_core_web_sm.load()
# nlp = spacy.load("en_core_web_sm")
# Per-process cache of loaded GLiNER models, keyed by model name (see get_model).
_MODEL = {}
# Optional model download cache directory; None lets the hub pick its default.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
def accept_mail(email_content):
    """Parse a raw email string into a mailparser message object."""
    return parse_from_string(email_content)
def clean_email(email):
    """Drop <style>/<link> tags from the email body and collapse all whitespace."""
    soup = BeautifulSoup(email.body, 'html.parser')
    for unwanted in soup(['style', 'link']):
        unwanted.decompose()
    # get_text with a space separator, then split/join to normalize runs of whitespace
    return ' '.join(soup.get_text(separator=' ').split())
def remove_special_characters(text):
    """Delete runs of '=', '_' and '-' characters (quoted-printable / divider artifacts)."""
    return re.sub(r'[=_-]+', '', text)
def get_sentences(further_cleaned_text):
    """Segment text into sentence strings using the module-level spaCy pipeline."""
    return [sentence.text for sentence in nlp(further_cleaned_text).sents]
# doc = nlp(text)
# entities = []
# for ent in doc.ents:
# if ent.label_ in labels:
# entities.append((ent.text, ent.label_))
# return entities
def get_model(model_name: str = None):
    """Return a cached GLiNER model, loading and caching it on first request.

    Args:
        model_name: GLiNER checkpoint name; None selects "urchade/gliner_base".
    """
    name = "urchade/gliner_base" if model_name is None else model_name
    if name not in _MODEL:
        _MODEL[name] = GLiNER.from_pretrained(name, cache_dir=_CACHE_DIR)
    return _MODEL[name]
def parse_query(sentences: List[str], labels: Union[str, list], threshold: float = 0.3, nested_ner: bool = False, model_name: str = None) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER entity extraction over each sentence.

    Args:
        sentences: Sentences to scan for entities.
        labels: Candidate entity labels, either a list or a comma-separated string.
        threshold: Minimum confidence score for a predicted entity.
        nested_ner: If True, allow overlapping (nested) entity spans.
        model_name: GLiNER checkpoint name; None selects "urchade/gliner_base".

    Returns:
        One dict per sentence: {"sentence": <text>, "entities": [<entity dict>, ...]}.
    """
    model = get_model(model_name)
    if isinstance(labels, str):
        labels = [i.strip() for i in labels.split(",")]
    results = []
    for sentence in sentences:
        # BUG FIX: nested_ner was previously accepted but never forwarded to the
        # model. GLiNER's predict_entities exposes this as flat_ner, which is the
        # inverse of nested_ner. (The old element-by-element copy of the returned
        # list was also pointless and has been removed.)
        entities = model.predict_entities(
            sentence, labels, flat_ner=not nested_ner, threshold=threshold
        )
        results.append({"sentence": sentence, "entities": entities})
    return results
def present(email_content, labels):
    """Parse, clean and NER-tag an email; return the fields shown in the Gradio UI.

    Returns a list in output order: subject, sender, recipients, date,
    cleaned body text, and the extracted-entity results.
    """
    parsed = accept_mail(email_content)
    body = remove_special_characters(clean_email(parsed))
    entity_results = parse_query(
        get_sentences(body),
        labels,
        threshold=0.3,
        nested_ner=False,
        model_name="urchade/gliner_base",
    )
    email_info = {
        "Subject": parsed.subject,
        "From": parsed.from_,
        "To": parsed.to,
        "Date": parsed.date,
        "Cleaned Body": body,
        "Extracted Entities": entity_results,
    }
    return list(email_info.values())
# Entity labels offered in the UI; forwarded to GLiNER as candidate types.
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER",
          "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY",
          "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.Textbox(label="Email Content"),
        # BUG FIX: Gradio 3+ takes the initial selection via `value=`;
        # `default=` is the removed 1.x/2.x keyword and raises TypeError
        # on current Gradio versions.
        gr.components.CheckboxGroup(label="Labels to Detect", choices=labels, value=labels)
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Textbox(label="Cleaned Body"),
        gr.components.Textbox(label="Extracted Entities")
    ],
    title="Email Info",
    description="Enter the email content below to view its details and detected entities."
)
demo.launch()