"""Gradio app: parse a raw email, clean its HTML body, split it into sentences,
and extract named entities with GLiNER."""

import os
import re
from typing import Dict, List, Union

import en_core_web_sm
import gradio as gr
from bs4 import BeautifulSoup
from gliner import GLiNER
from mailparser import parse_from_string

# spaCy English pipeline, used only for sentence segmentation.
nlp = en_core_web_sm.load()

# Lazily-populated cache of GLiNER models, keyed by model name.
_MODEL = {}
# Optional Hugging Face cache directory; None falls back to the default cache.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)


def accept_mail(email_content):
    """Parse a raw email string into a mailparser message object."""
    return parse_from_string(email_content)


def clean_email(email):
    """Return the email body as plain text.

    Drops <style>/<link> tags entirely, strips all remaining HTML,
    and collapses runs of whitespace into single spaces.
    """
    soup = BeautifulSoup(email.body, 'html.parser')
    for tag in soup.find_all(['style', 'link']):
        tag.decompose()
    return ' '.join(soup.get_text(separator=' ').split())


def remove_special_characters(text):
    """Strip runs of '=', '_' and '-' (quoted-printable / separator artifacts)."""
    return re.sub(r'[=_-]+', '', text)


def get_sentences(further_cleaned_text):
    """Split text into a list of sentence strings using spaCy."""
    doc = nlp(further_cleaned_text)
    return [sent.text for sent in doc.sents]


def get_model(model_name: str = None):
    """Return a cached GLiNER model, loading it on first request.

    Defaults to "urchade/gliner_base". Honors the CACHE_DIR environment
    variable for model downloads.
    """
    if model_name is None:
        model_name = "urchade/gliner_base"
    if _MODEL.get(model_name) is None:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]


def parse_query(sentences: List[str],
                labels: Union[str, list],
                threshold: float = 0.3,
                nested_ner: bool = False,
                model_name: str = None) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER entity extraction over each sentence.

    Args:
        sentences: sentence strings to tag.
        labels: entity labels, either a list or a comma-separated string.
        threshold: minimum confidence for a predicted entity.
        nested_ner: allow overlapping (nested) entity spans.
        model_name: GLiNER model to use; None selects the default.

    Returns:
        One dict per sentence: {"sentence": str, "entities": list}.
    """
    model = get_model(model_name)
    if isinstance(labels, str):
        labels = [i.strip() for i in labels.split(",")]
    results = []
    for sentence in sentences:
        # Fix: nested_ner was previously accepted but ignored; GLiNER exposes
        # it as flat_ner (flat_ner=True forbids overlapping spans).
        entities = model.predict_entities(
            sentence, labels, flat_ner=not nested_ner, threshold=threshold
        )
        results.append({"sentence": sentence, "entities": entities})
    return results


def present(email_content, labels):
    """Pipeline entry point for the Gradio UI.

    Parses the email, cleans its body, extracts entities, and returns the
    values in the order of the UI's output components:
    subject, from, to, date, cleaned body, extracted entities.
    """
    email = accept_mail(email_content)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    result = parse_query(
        sentence_list,
        labels,
        threshold=0.3,
        nested_ner=False,
        model_name="urchade/gliner_base",
    )
    email_info = {
        "Subject": email.subject,
        "From": email.from_,
        "To": email.to,
        "Date": email.date,
        "Cleaned Body": further_cleaned_text,
        "Extracted Entities": result,
    }
    # Dict insertion order matches the Gradio outputs list below.
    return [email_info[key] for key in email_info]


labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD",
          "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION",
          "UNCATEGORIZED", "DATE"]

demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.Textbox(label="Email Content"),
        # Fix: Gradio 3+ uses `value=` for the initial selection;
        # `default=` is the removed Gradio 2.x keyword and raises TypeError.
        gr.components.CheckboxGroup(label="Labels to Detect",
                                    choices=labels,
                                    value=labels),
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Textbox(label="Cleaned Body"),
        gr.components.Textbox(label="Extracted Entities"),
    ],
    title="Email Info",
    description="Enter the email content below to view its details and detected entities.",
)

if __name__ == "__main__":
    demo.launch()