# HF Space page residue preserved as a comment (was not valid Python):
# email-parser / app.py — Nikhil Singh — "email upload" — commit d90af9d — raw / history / blame — 3.62 kB
import gradio as gr
from mailparser import parse_from_file
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
import spacy
import re
import os
import en_core_web_sm
# Small English spaCy pipeline, loaded once at import time; used only for
# sentence segmentation in get_sentences().
nlp = en_core_web_sm.load()
# Per-checkpoint cache of loaded GLiNER models (model name -> model instance),
# populated lazily by get_model().
_MODEL = {}
# Optional download cache directory for model weights; when CACHE_DIR is not
# set in the environment this stays None and the library default is used.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
def accept_mail(file_path):
    """Parse an .eml file at *file_path* and return the parsed mail object."""
    return parse_from_file(file_path)
def clean_email(email):
    """Return the email body as plain text with style/link tags removed
    and all runs of whitespace collapsed to single spaces."""
    markup = BeautifulSoup(email.body, 'html.parser')
    # Presentation-only tags carry no content worth extracting.
    for presentational in markup.find_all(['style', 'link']):
        presentational.decompose()
    # get_text with a space separator, then normalise whitespace runs.
    return ' '.join(markup.get_text(separator=' ').split())
def remove_special_characters(text):
    """Delete runs of '=', '_' and '-' characters (common quoted-printable
    and divider noise in email bodies) and return the result."""
    return re.sub(r'[=_-]+', '', text)
def get_sentences(further_cleaned_text):
    """Segment *further_cleaned_text* into sentences with the module-level
    spaCy pipeline and return them as a list of strings."""
    parsed = nlp(further_cleaned_text)
    return [span.text for span in parsed.sents]
def get_model(model_name: str = None, multilingual: bool = False):
    """Return a GLiNER model, loading and memoising it on first request.

    When *model_name* is None, the checkpoint is chosen from the
    *multilingual* flag; otherwise *multilingual* is ignored.
    """
    global _MODEL
    if model_name is None:
        model_name = (
            "urchade/gliner_multilingual" if multilingual else "urchade/gliner_base"
        )
    cached = _MODEL.get(model_name)
    if cached is None:
        # First request for this checkpoint: load it and keep it for reuse.
        cached = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
        _MODEL[model_name] = cached
    return cached
def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3, nested_ner: bool = False, model_name: str = None, multilingual: bool = False) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER entity extraction over each sentence.

    Args:
        sentences: Sentences to scan for entities.
        labels: Entity labels to detect.
        threshold: Minimum confidence for a predicted span to be kept.
        nested_ner: Allow overlapping (nested) entity spans.
        model_name: Explicit checkpoint name; when None the checkpoint is
            chosen by get_model() from *multilingual*.
        multilingual: Use the multilingual checkpoint when no name is given.

    Returns:
        A flat list of {"text": ..., "label": ...} dicts across all sentences.
    """
    model = get_model(model_name, multilingual=multilingual)
    results = []
    for sentence in sentences:
        # BUG FIX: nested_ner was previously accepted but never forwarded,
        # so it had no effect. GLiNER's predict_entities expresses nesting
        # via flat_ner (flat_ner=False permits nested spans).
        _entities = model.predict_entities(
            sentence, labels, flat_ner=not nested_ner, threshold=threshold
        )
        results.extend(
            {"text": entity["text"], "label": entity["label"]} for entity in _entities
        )
    return results
def present(email_file, labels, multilingual=False):
    """Gradio handler: parse an uploaded .eml file and extract entities.

    Args:
        email_file: Path to the uploaded .eml file.
        labels: Entity labels selected in the UI.
        multilingual: Whether to use the multilingual GLiNER checkpoint.

    Returns:
        [subject, from, to, date, entity_rows] matching the interface's five
        output components, where entity_rows is a list of [text, label] pairs
        for the Dataframe output.
    """
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    # BUG FIX: model_name was hard-coded to "urchade/gliner_base", which made
    # the "Use Multilingual Model" checkbox a no-op (get_model only consults
    # the multilingual flag when model_name is None).
    entities = parse_query(
        sentence_list,
        labels,
        threshold=0.3,
        nested_ner=False,
        model_name=None,
        multilingual=multilingual,
    )
    # Convert the entity dicts into [text, label] rows for the Dataframe.
    entities_data = [[entity['text'], entity['label']] for entity in entities]
    return [email.subject, email.from_, email.to, email.date, entities_data]
# Entity labels offered in the UI; all are pre-selected by default.
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]

# Three inputs (file, label selection, multilingual toggle) map onto
# present()'s parameters; five outputs map onto its returned list.
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.File(label="Upload Email (.eml file)"),
        gr.components.CheckboxGroup(
            choices=labels,
            label="Labels to Detect",
            value=labels,  # every label ticked by default
        ),
        gr.components.Checkbox(label="Use Multilingual Model"),
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities.",
)
demo.launch(share=True)