# Hugging Face Space: Email Info Extractor (GLiNER + spaCy + Gradio).
# NOTE(review): the original "Spaces: / Sleeping / Sleeping" lines were
# page-scrape residue, not code.
# Third-party dependencies: Gradio (UI), mail-parser (.eml parsing),
# BeautifulSoup (HTML stripping), GLiNER (zero-shot NER), spaCy (sentencizer).
import gradio as gr
from mailparser import parse_from_file
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
import spacy
import re
import os
import en_core_web_sm

# Small English spaCy pipeline, loaded once at import time; used only for
# sentence splitting in get_sentences().
nlp = en_core_web_sm.load()
# Process-wide cache of loaded GLiNER models, keyed by checkpoint name.
_MODEL = {}
# Optional model download cache directory (None -> library default location).
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
def accept_mail(file_path):
    """Parse an .eml file from disk and return the mail-parser message object."""
    return parse_from_file(file_path)
def clean_email(email):
    """Extract readable text from an email body.

    Strips ``<style>``, ``<link>`` and ``<script>`` elements and collapses all
    whitespace runs to single spaces.

    Parameters:
        email: parsed message object exposing a ``body`` attribute
            (HTML or plain text).

    Returns:
        str: whitespace-normalized visible text of the body.
    """
    soup = BeautifulSoup(email.body, 'html.parser')
    # Bug fix: <script> was previously left in place, so get_text() leaked
    # raw JavaScript source into the "cleaned" output alongside CSS-free text.
    for tag in soup.find_all(['style', 'link', 'script']):
        tag.decompose()
    cleaned_text = ' '.join(soup.get_text(separator=' ').split())
    return cleaned_text
def remove_special_characters(text):
    """Delete every '=', '_' and '-' character from *text* and return the result."""
    # A translate() deletion table removes the same characters as
    # re.sub(r'[=_-]+', '', text) in a single C-level pass.
    return text.translate(str.maketrans('', '', '=_-'))
def get_sentences(further_cleaned_text):
    """Split *further_cleaned_text* into sentence strings.

    Uses the module-level spaCy pipeline ``nlp`` for sentence boundary
    detection.
    """
    return [sentence.text for sentence in nlp(further_cleaned_text).sents]
def get_model(model_name: str = None, multilingual: bool = False):
    """Return a cached GLiNER model, loading it on first use.

    When *model_name* is None, the default checkpoint is chosen from
    *multilingual* (``urchade/gliner_multilingual`` vs ``urchade/gliner_base``).
    Loaded models are memoized in the module-level ``_MODEL`` dict.
    """
    global _MODEL
    if model_name is None:
        model_name = "urchade/gliner_multilingual" if multilingual else "urchade/gliner_base"
    if model_name not in _MODEL:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]
def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3, nested_ner: bool = False, model_name: str = None, multilingual: bool = False) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER entity extraction over each sentence.

    Parameters:
        sentences: texts to scan, one prediction call per sentence.
        labels: entity labels to detect.
        threshold: minimum confidence for a span to be kept.
        nested_ner: allow overlapping (nested) entity spans.
        model_name: explicit checkpoint name; None selects a default based on
            *multilingual* (see get_model).
        multilingual: choose the multilingual default checkpoint when
            model_name is None.

    Returns:
        Flat list of ``{"text": ..., "label": ...}`` dicts across all sentences.
    """
    model = get_model(model_name, multilingual=multilingual)
    results: List[Dict[str, Union[str, list]]] = []
    for sentence in sentences:
        # Bug fix: nested_ner was accepted but never used; GLiNER exposes the
        # inverse flag flat_ner (flat_ner=False permits nested spans).
        _entities = model.predict_entities(sentence, labels, flat_ner=not nested_ner, threshold=threshold)
        results.extend({"text": entity["text"], "label": entity["label"]} for entity in _entities)
    return results
def present(email_file, labels, multilingual=False):
    """Gradio callback: parse an uploaded .eml file and extract entities.

    Parameters:
        email_file: path of the uploaded .eml file.
        labels: entity labels selected in the UI.
        multilingual: use the multilingual GLiNER checkpoint.

    Returns:
        ``[subject, from, to, date, entities]`` where *entities* is a list of
        ``[text, label]`` rows for the Dataframe output component.
    """
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    # Bug fix: model_name was hard-coded to "urchade/gliner_base", which
    # overrode the multilingual flag and made the "Use Multilingual Model"
    # checkbox a no-op. Pass None so get_model() selects the checkpoint.
    entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name=None, multilingual=multilingual)
    # Dataframe output expects a list of [text, label] rows.
    entities_data = [[entity['text'], entity['label']] for entity in entities]
    # Same output shape as before; the intermediate dict whose
    # "Extracted Entities" key was never read has been removed.
    return [email.subject, email.from_, email.to, email.date, entities_data]
# Entity labels offered in the UI; all are selected by default.
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]

# Gradio UI: upload an .eml file, pick labels and model, view parsed header
# fields plus a table of extracted entities.
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.File(label="Upload Email (.eml file)"),
        gr.components.CheckboxGroup(
            choices=labels,
            label="Labels to Detect",
            value=labels,  # Default all selected
        ),
        gr.components.Checkbox(label="Use Multilingual Model")
    ],
    outputs=[
        # Order must match the list returned by present().
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities")
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities."
)
# share=True publishes a temporary public gradio.live URL in addition to the
# local server.
demo.launch(share=True)