File size: 5,167 Bytes
e51f125
 
 
5bd749f
 
e51f125
79e12fd
 
 
 
 
 
 
 
 
 
 
3c51821
c564c57
3c51821
79e12fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0acdafb
79e12fd
 
0acdafb
79e12fd
 
 
 
 
 
 
 
0acdafb
79e12fd
 
 
 
 
 
 
 
c564c57
79e12fd
 
 
 
 
 
3c51821
 
 
 
79e12fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba8d0da
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
# Pin the gradio version before gradio is imported further down this file.
# NOTE(review): presumably the hosting environment pre-installs a different
# gradio release than the one this app was written against - confirm.
import subprocess
import sys

for _pip_args in (
    ["uninstall", "-y", "gradio"],
    ["install", "gradio==2.7.5.2"],
    ["install", "typing-extensions", "--upgrade"],
):
    # Invoke pip through the running interpreter so packages are installed
    # into the environment that performs the imports below, rather than into
    # whatever "pip" executable happens to be first on PATH.
    _pip_status = subprocess.run([sys.executable, "-m", "pip", *_pip_args]).returncode
    if _pip_status != 0:
        # Still best effort (start-up continues), but no longer silent.
        print(
            f"pip {' '.join(_pip_args)} exited with status {_pip_status}",
            file=sys.stderr,
        )


import logging, regex
import gradio
from email_parser import utils, nlp
from email_parser.doc_email import Email

def print_highlighted_text(text, df_result, offset=0):
    """Split *text* on whitespace and pair every word with an entity label.

    Despite its name, this function prints nothing; it returns the pairs.

    Args:
        text: String to split. Every whitespace character is a separator, so
            consecutive whitespace produces empty-string words.
        df_result: DataFrame with ``start``, ``end`` and ``entity`` columns.
            A row is considered to cover a word when
            ``start <= word_position <= end`` (both bounds inclusive).
        offset: Added to each word's start index before the lookup. Used when
            *text* is a slice of a longer string and the positions in
            *df_result* refer to that longer string.

    Returns:
        A list of ``(word, entity)`` tuples in text order. ``entity`` is the
        ``entity`` value of the first covering row, or ``None`` when no row
        covers the word's start position. The separators themselves are not
        part of the output.
    """
    # Hoisted out of the loop: the columns do not change between words.
    starts = df_result["start"]
    ends = df_result["end"]

    list_values = []
    start_pos = 0
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # The "$" alternative yields one final match so the last word is emitted.
    for match in regex.finditer(r"\s|$", text):
        word = text[start_pos:match.start()]
        # %-style arguments keep the (potentially large) DataFrame repr from
        # being built for every word when DEBUG logging is disabled.
        logging.debug("word '%s' was found between %s and %s", word, start_pos, match.start())

        position = start_pos + offset
        df_entity = df_result[(starts <= position) & (ends >= position)].head(1)
        logging.debug("Found entites are: %s", df_entity)

        entity = df_entity["entity"].values[0] if len(df_entity) == 1 else None
        list_values.append((word, entity))
        # Skip past the separator; the next word starts right after it.
        start_pos = match.end()
    return list_values


def display_email(text, part=1):
    """Extract one email from a thread and tag its header, body and signature.

    Args:
        text: Raw text of the email thread, parsed with ``Email``.
        part: 1-based number of the email to display. It may arrive as a
            float (e.g. from a numeric UI field) and is truncated to an int
            once, so the bounds check and the indexing agree.

    Returns:
        A 5-tuple ``(error, lang, header_words, body_words, signature_words)``.

        * On success ``error`` is ``None``, ``lang`` is the value returned by
          ``nlp.f_detect_language`` for the body, and the three ``*_words``
          entries are lists of ``(word, entity)`` tuples. ``header_words`` is
          ``None`` when the header is empty and ``signature_words`` is
          ``None`` when no signature was detected.
        * When ``part`` is not a usable number or is outside
          ``1..len(emails)``, ``error`` is a message and the other four
          entries are ``None``.
    """
    no_result = (None, None, None, None)

    try:
        part_number = int(part)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinity or non-numeric input.
        return (f"Email number must be a whole number, got {part!r}",) + no_result

    list_emails = Email(text).list_emails

    if part_number < 1:
        # Without this guard, 0 would silently select the last email through
        # negative indexing and lower values would raise IndexError.
        return (f"Email number {part_number} was requested but email numbers start at 1",) + no_result
    if part_number > len(list_emails):
        return (f"Email number {part_number} was requested but only {len(list_emails)} emails was found in this thread",) + no_result

    email = list_emails[part_number - 1]
    text = email["body"]
    header = email["header"]
    lang = nlp.f_detect_language(text)

    if header is not None and len(header) > 0:
        df_results_header = nlp.f_ner(header, lang=lang)
        df_results_header = Email.f_find_person_in_header(header, df_result=df_results_header)
        list_words_headers = print_highlighted_text(header, df_results_header)
    else:
        list_words_headers = None

    df_result = nlp.f_ner(text, lang=lang)
    logging.debug("NER results for text '%s' are: %s", text, df_result)

    df_signature = nlp.f_detect_email_signature(text, df_ner=df_result)
    if df_signature is not None and len(df_signature) > 0:
        # The first row's "start" is used as the body/signature split point.
        start_signature_position = df_signature["start"].values[0]
        text_body = text[:start_signature_position]
        text_signature = text[start_signature_position:]
        # Positions in df_result are relative to the full body text, so the
        # signature slice needs its starting index as offset.
        list_words_signature = print_highlighted_text(
            text_signature, df_result, offset=start_signature_position
        )
    else:
        text_body = text
        list_words_signature = None
    list_words_body = print_highlighted_text(text_body, df_result)

    return None, lang, list_words_headers, list_words_body, list_words_signature


# Logging is configured through the project helper; both level arguments are
# INFO and the folder path is "logs".
_LOG_LEVEL = logging.INFO
utils.f_setup_logger(level_sysout=_LOG_LEVEL, level_file=_LOG_LEVEL, folder_path="logs")


# --- Example inputs offered in the UI -------------------------------------

# Single short email in French.
_EXAMPLE_SINGLE_EMAIL = """Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste"""

# Two-message thread; used twice below, once per email number.
_EXAMPLE_THREAD = """Hello Jack,

I hope you had nice holiday as well.
Please find attached the requested documents,

Best Regards,
George
Vice president of Something
email: george@google.com
tel: 512-222-5555

On Mon, Jan 7, 2022 at 12:39 PM, Jack <jack@google.com> wrote:

Hello George,

I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?

Thanks,
Jack


"""

_EXAMPLES = [
    [_EXAMPLE_SINGLE_EMAIL, 1],
    [_EXAMPLE_THREAD, 1],
    [_EXAMPLE_THREAD, 2],
]


# --- Static page text ------------------------------------------------------

_DESCRIPTION = (
    "Small application that can extract a specific email in a thread of email,"
    " highlights the entities found in the text (person, organization, date,...)"
    " and extract email signature if any."
)

_ARTICLE_URL = "https://medium.com/@jean-baptiste.polle/lstm-model-for-email-signature-detection-8e990384fefa"
_ARTICLE = (
    "*The model used to detect signature is described in detail here: "
    f'<a href="{_ARTICLE_URL}">{_ARTICLE_URL}</a>'
)


# --- Interface wiring ------------------------------------------------------

# Inputs map positionally onto display_email(text, part).
_INPUTS = [
    "textbox",
    gradio.inputs.Number(default=1, label="Email number in thread"),
]

# Outputs map positionally onto the 5-tuple returned by display_email.
_OUTPUTS = [
    gradio.outputs.Textbox(type="str", label="Error"),
    gradio.outputs.Textbox(type="str", label="Language"),
    gradio.outputs.HighlightedText(label="Header"),
    gradio.outputs.HighlightedText(label="Body"),
    gradio.outputs.HighlightedText(label="Signature"),
]

iface = gradio.Interface(
    fn=display_email,
    inputs=_INPUTS,
    outputs=_OUTPUTS,
    examples=_EXAMPLES,
    title="Parser of email",
    description=_DESCRIPTION,
    article=_ARTICLE,
)


iface.launch()