jeanpoll committed on
Commit
79e12fd
·
1 Parent(s): ba8d0da

first working version of app

Browse files
.gitignore ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Pycharm
117
+ .idea/
118
+
119
+ # Spyder project settings
120
+ .spyderproject
121
+ .spyproject
122
+
123
+ # Rope project settings
124
+ .ropeproject
125
+
126
+ # mkdocs documentation
127
+ /site
128
+
129
+ # mypy
130
+ .mypy_cache/
131
+ .dmypy.json
132
+ dmypy.json
133
+
134
+ # Pyre type checker
135
+ .pyre/
136
+
137
+ # pytype static type analyzer
138
+ .pytype/
139
+
140
+ # Cython debug symbols
141
+ cython_debug/
142
+
143
+ # additional stuff
144
+ logs/
Untitled.ipynb ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "spiritual-swift",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%config Completer.use_jedi = False\n",
11
+ "%load_ext autoreload\n",
12
+ "%autoreload 2"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 1,
18
+ "id": "stopped-single",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "import tensorflow\n",
23
+ "import regex"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 2,
29
+ "id": "numeric-handle",
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "from transformers import pipeline"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "id": "numerous-overall",
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "from email_parser import nlp"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 4,
49
+ "id": "studied-oracle",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "text = \"\"\"tel: 512 222 5555\"\"\""
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 5,
59
+ "id": "pacific-walter",
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "data": {
64
+ "text/plain": [
65
+ "'en'"
66
+ ]
67
+ },
68
+ "execution_count": 5,
69
+ "metadata": {},
70
+ "output_type": "execute_result"
71
+ }
72
+ ],
73
+ "source": [
74
+ "lang = nlp.f_detect_language(text)\n",
75
+ "lang"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 6,
81
+ "id": "every-gardening",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "text/html": [
87
+ "<div>\n",
88
+ "<style scoped>\n",
89
+ " .dataframe tbody tr th:only-of-type {\n",
90
+ " vertical-align: middle;\n",
91
+ " }\n",
92
+ "\n",
93
+ " .dataframe tbody tr th {\n",
94
+ " vertical-align: top;\n",
95
+ " }\n",
96
+ "\n",
97
+ " .dataframe thead th {\n",
98
+ " text-align: right;\n",
99
+ " }\n",
100
+ "</style>\n",
101
+ "<table border=\"1\" class=\"dataframe\">\n",
102
+ " <thead>\n",
103
+ " <tr style=\"text-align: right;\">\n",
104
+ " <th></th>\n",
105
+ " <th>entity</th>\n",
106
+ " <th>value</th>\n",
107
+ " <th>start</th>\n",
108
+ " <th>end</th>\n",
109
+ " <th>score</th>\n",
110
+ " </tr>\n",
111
+ " </thead>\n",
112
+ " <tbody>\n",
113
+ " <tr>\n",
114
+ " <th>0</th>\n",
115
+ " <td>TEL</td>\n",
116
+ " <td>512 222 5555</td>\n",
117
+ " <td>5</td>\n",
118
+ " <td>17</td>\n",
119
+ " <td>1</td>\n",
120
+ " </tr>\n",
121
+ " </tbody>\n",
122
+ "</table>\n",
123
+ "</div>"
124
+ ],
125
+ "text/plain": [
126
+ " entity value start end score\n",
127
+ "0 TEL 512 222 5555 5 17 1"
128
+ ]
129
+ },
130
+ "execution_count": 6,
131
+ "metadata": {},
132
+ "output_type": "execute_result"
133
+ }
134
+ ],
135
+ "source": [
136
+ "df_result = nlp.f_ner(text, lang=lang)\n",
137
+ "df_result"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "operating-recorder",
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": []
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 16,
151
+ "id": "delayed-overhead",
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "text/html": [
157
+ "<div>\n",
158
+ "<style scoped>\n",
159
+ " .dataframe tbody tr th:only-of-type {\n",
160
+ " vertical-align: middle;\n",
161
+ " }\n",
162
+ "\n",
163
+ " .dataframe tbody tr th {\n",
164
+ " vertical-align: top;\n",
165
+ " }\n",
166
+ "\n",
167
+ " .dataframe thead th {\n",
168
+ " text-align: right;\n",
169
+ " }\n",
170
+ "</style>\n",
171
+ "<table border=\"1\" class=\"dataframe\">\n",
172
+ " <thead>\n",
173
+ " <tr style=\"text-align: right;\">\n",
174
+ " <th></th>\n",
175
+ " <th>entity</th>\n",
176
+ " <th>value</th>\n",
177
+ " <th>start</th>\n",
178
+ " <th>end</th>\n",
179
+ " <th>score</th>\n",
180
+ " </tr>\n",
181
+ " </thead>\n",
182
+ " <tbody>\n",
183
+ " <tr>\n",
184
+ " <th>0</th>\n",
185
+ " <td>SIGNATURE</td>\n",
186
+ " <td>JB</td>\n",
187
+ " <td>119</td>\n",
188
+ " <td>122</td>\n",
189
+ " <td>0.955208</td>\n",
190
+ " </tr>\n",
191
+ " </tbody>\n",
192
+ "</table>\n",
193
+ "</div>"
194
+ ],
195
+ "text/plain": [
196
+ " entity value start end score\n",
197
+ "0 SIGNATURE JB 119 122 0.955208"
198
+ ]
199
+ },
200
+ "execution_count": 16,
201
+ "metadata": {},
202
+ "output_type": "execute_result"
203
+ }
204
+ ],
205
+ "source": [
206
+ "nlp.f_detect_email_signature(text, lang=\"fr\")"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": 33,
212
+ "id": "frozen-jones",
213
+ "metadata": {},
214
+ "outputs": [
215
+ {
216
+ "data": {
217
+ "text/plain": [
218
+ "[('je', None), (\"m'appelle\", None), ('Jean-Baptiste', 'PER')]"
219
+ ]
220
+ },
221
+ "execution_count": 33,
222
+ "metadata": {},
223
+ "output_type": "execute_result"
224
+ }
225
+ ],
226
+ "source": [
227
+ "iter_match = regex.finditer(\"\\s|$\", text)\n",
228
+ "list_values = []\n",
229
+ "start_pos = 0\n",
230
+ "for match in iter_match:\n",
231
+ " word = match.string[start_pos:match.start()]\n",
232
+ " \n",
233
+ " df_entity = df_result.query(f\"start>={start_pos} & end<={match.start()}\").head(1)\n",
234
+ " if len(df_entity)==1:\n",
235
+ " entity = df_entity[\"entity\"].values[0]\n",
236
+ " else:\n",
237
+ " entity = None\n",
238
+ "# list_values\n",
239
+ " list_values.append((word, entity))\n",
240
+ " start_pos = match.end()\n",
241
+ "list_values\n",
242
+ " "
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": null,
248
+ "id": "solid-speaker",
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": []
252
+ }
253
+ ],
254
+ "metadata": {
255
+ "kernelspec": {
256
+ "display_name": "Python 3",
257
+ "language": "python",
258
+ "name": "python3"
259
+ },
260
+ "language_info": {
261
+ "codemirror_mode": {
262
+ "name": "ipython",
263
+ "version": 3
264
+ },
265
+ "file_extension": ".py",
266
+ "mimetype": "text/x-python",
267
+ "name": "python",
268
+ "nbconvert_exporter": "python",
269
+ "pygments_lexer": "ipython3",
270
+ "version": "3.7.10"
271
+ }
272
+ },
273
+ "nbformat": 4,
274
+ "nbformat_minor": 5
275
+ }
app.py CHANGED
@@ -1,7 +1,126 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
  iface.launch()
 
1
+ import logging, regex
2
+ import gradio
3
+ from email_parser import utils, nlp
4
+ from email_parser.doc_email import Email
5
+
6
def print_highlighted_text(text, df_result, offset=0):
    """Split ``text`` into whitespace-delimited words and tag each with its entity.

    Args:
        text: Text to split. Words are the spans between whitespace
            characters, so consecutive whitespace yields empty words.
        df_result: DataFrame of detected entities with at least the columns
            ``entity``, ``start`` and ``end``. ``None`` or an empty frame
            means that no word gets tagged.
        offset: Position of ``text[0]`` in the coordinates used by
            ``df_result`` (non-zero when ``text`` is a slice of a longer text).

    Returns:
        List of ``(word, entity)`` tuples in reading order; ``entity`` is
        ``None`` for words that are not covered by any entity.
    """
    has_entities = df_result is not None and len(df_result) > 0
    list_values = []
    start_pos = 0
    # The "$" alternative makes sure the last word of the text is emitted too.
    for match in regex.finditer(r"\s|$", text):
        word_start = start_pos + offset
        word_end = match.start() + offset
        entity = None
        if has_entities:
            # Keep the entities whose span fully covers the current word and
            # use the first one, if any.
            covering = df_result[(df_result["start"] <= word_start)
                                 & (df_result["end"] >= word_end)]
            if len(covering) > 0:
                entity = covering["entity"].values[0]
        list_values.append((text[start_pos:match.start()], entity))
        start_pos = match.end()
    return list_values
22
+
23
+
24
def display_email(text, part=1):
    """Parse one e-mail of a thread and prepare it for highlighted display.

    Args:
        text: Raw text of an e-mail, possibly a thread of several e-mails.
        part: 1-based number of the e-mail to display within the thread.
            It is truncated to an integer (presumably the gradio Number
            input delivers a float -- verify against the gradio version).

    Returns:
        A 5-tuple ``(error, lang, header_words, body_words, signature_words)``.
        On success ``error`` is ``None`` and the three word lists are the
        ``(word, entity)`` lists built by ``print_highlighted_text``. When
        ``part`` is outside ``1..number of e-mails found``, ``error`` holds
        a message and the four other items are ``None``.
    """
    list_emails = Email(text).list_emails
    part = int(part)
    # Reject values below 1 as well: index -1 and lower would otherwise
    # silently select an e-mail counted from the end of the thread.
    if not 1 <= part <= len(list_emails):
        return (f"Email number {part} was requested but only {len(list_emails)} emails was found in this thread",
                None, None, None, None)

    selected_email = list_emails[part - 1]
    text = selected_email["body"]
    header = selected_email["header"]
    lang = nlp.f_detect_language(text)

    if len(header) > 0:
        df_results_header = nlp.f_ner(header, lang=lang)
        df_results_header = Email.f_find_person_in_header(header, df_result=df_results_header)
        list_words_headers = print_highlighted_text(header, df_results_header)
    else:
        list_words_headers = []

    df_result = nlp.f_ner(text, lang=lang)
    df_signature = nlp.f_detect_email_signature(text, df_ner=df_result)
    if df_signature is not None and len(df_signature) > 0:
        # Everything from the first signature row onwards is shown as signature.
        start_signature_position = df_signature["start"].values[0]
        text_body = text[:start_signature_position]
        text_signature = text[start_signature_position:]
        # Entity positions refer to the full body, hence the offset.
        list_words_signature = print_highlighted_text(text_signature, df_result,
                                                      offset=start_signature_position)
    else:
        text_body = text
        list_words_signature = []
    list_words_body = print_highlighted_text(text_body, df_result)

    return None, lang, list_words_headers, list_words_body, list_words_signature
55
+
56
+
57
# Console shows errors only; the log file written under "logs" is more verbose.
utils.f_setup_logger(level_sysout=logging.ERROR, level_file=logging.INFO, folder_path="logs")


# Example inputs offered in the UI: one single French e-mail and one English
# thread of two e-mails (the thread is listed twice, once per e-mail number).
_EXAMPLE_EMAIL_FR = """Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste"""

_EXAMPLE_THREAD_EN = """Hello Jack,

I hope you had nice holiday as well.
Please find attached the requested documents,

Best Regards,
George
Vice president of Something
email: george@google.com
tel: 512-222-5555

On Mon, Jan 7, 2022 at 12:39 PM, Jack <jack@google.com> wrote:

Hello George,

I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?

Thanks,
Jack


"""

iface = gradio.Interface(title="Parser of email",
                         description="Small application that can extract a specific email in a thread of email,"
                                     " highlights the entities found in the text (person, organization, date,...)"
                                     " and extract email signature if any.",
                         fn=display_email,
                         inputs=["textbox",
                                 gradio.inputs.Number(default=1, label="Email number in thread")],
                         # Same order as the tuple returned by display_email.
                         outputs=[
                             gradio.outputs.Textbox(type="str", label="Error"),
                             gradio.outputs.Textbox(type="str", label="Language"),
                             gradio.outputs.HighlightedText(label="Header"),
                             gradio.outputs.HighlightedText(label="Body"),
                             gradio.outputs.HighlightedText(label="Signature")],
                         examples=[[_EXAMPLE_EMAIL_FR, 1],
                                   [_EXAMPLE_THREAD_EN, 1],
                                   [_EXAMPLE_THREAD_EN, 2]])

iface.launch()
email_parser/__init__.py ADDED
File without changes
email_parser/_models_signatures.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pandas as pd
3
+ import numpy as np
4
+ import regex
5
+ import os
6
+ import configparser
7
+ from sentence_transformers import SentenceTransformer
8
+ from scipy.spatial import distance
9
+ from keras.preprocessing.sequence import pad_sequences
10
+ from sklearn.preprocessing import StandardScaler
11
+ from sklearn.preprocessing import MinMaxScaler
12
+
13
+ from tensorflow import keras
14
+ import pickle
15
+
16
+ from . import nlp, utils
17
+
18
# Package configuration, read from the config.ini located next to this module.
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'))

model_name = config["DEFAULT"]["name_model_signature"]

# Signature model and the two scalers stored in the model folder.
model = keras.models.load_model(filepath=utils.get_model_full_path(model_name))
# NOTE(review): pickle must only be used on trusted files; these paths are
# resolved by utils.get_model_full_path -- confirm they always point to files
# shipped with the package.
with open(utils.get_model_full_path(model_name + "/minmax_scaler.p"), "rb") as _scaler_file:
    minmax_scaler = pickle.load(_scaler_file)
with open(utils.get_model_full_path(model_name + "/standard_scaler.p"), "rb") as _scaler_file:
    standard_scaler = pickle.load(_scaler_file)
28
+
29
+
30
# Names of the values produced by f_create_line_features, in the same order.
list_name_columns_features = [
    # line identification
    "line_number",
    "text",
    "start",
    "end",
    # number of entities of each type found on the line
    "PER",
    "ORG",
    "LOC",
    "DATE",
    "TEL",
    "EMAIL",
    "WEB",
    "SIGNATURE",
    # text statistics and inverse embedding distances to typical closing words
    "word_count",
    "inv_distance_to_merci",
    "inv_distance_to_cordlt",
    "inv_distance_to_regards",
    "inv_distance_to_sincerely",
    "inv_distance_to_sent_from",
    "start_with_ps",
    "position_line",
    "special_characters_count",
    "empty_chars_with_prev_line",
]

# Default feature columns selected by generate_x_y.
# "WEB" and "inv_distance_to_regards" are computed but deliberately left out.
list_columns_used_in_model = [
    "PER",
    "ORG",
    "LOC",
    "DATE",
    "TEL",
    "EMAIL",
    "word_count",
    "inv_distance_to_merci",
    "inv_distance_to_cordlt",
    "inv_distance_to_sincerely",
    "inv_distance_to_sent_from",
    "start_with_ps",
    "position_line",
    "special_characters_count",
    "empty_chars_with_prev_line",
]

# Columns passed through the min-max scaler in f_scale_parameters.
columns_to_scale_minmax = [
    "PER",
    "ORG",
    "LOC",
    "DATE",
    "TEL",
    "EMAIL",
    "WEB",
    "position_line",
    "empty_chars_with_prev_line",
    "inv_distance_to_merci",
    "inv_distance_to_cordlt",
    "inv_distance_to_regards",
    "inv_distance_to_sincerely",
    "inv_distance_to_sent_from",
    "start_with_ps",
]

# Columns passed through the standard scaler in f_scale_parameters.
columns_to_scale_standard = [
    "word_count",
    "special_characters_count",
]
69
+
70
def f_retrieve_entities_for_line(df_ner, start=0, end=1e12):
    """Select the entities of ``df_ner`` that belong to one line of the text.

    An entity is kept when it lies entirely within ``[start, end]`` or when
    it spans the whole line.

    Args:
        df_ner: dataframe of found entities, with ``start`` and ``end`` columns
        start: start position of the line in the original text
        end: end position of the line in the original text

    Returns:
        The matching rows of ``df_ner``, or ``None`` when ``df_ner`` is empty.
    """
    if len(df_ner) == 0:
        return None
    inside_line = (df_ner["start"] >= start) & (df_ner["end"] <= end)
    spans_line = (df_ner["start"] <= start) & (df_ner["end"] >= end)
    return df_ner[inside_line | spans_line]
83
+
84
# Sentence embedding model shared by all the distance features below.
embedder_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")


def f_create_embedding_inv_dist_feature(text1, text2):
    """Return the inverse cosine distance between the embeddings of two texts.

    Both texts are encoded with ``embedder_model``. The constant 0.01 added
    to the distance keeps the result finite when the distance is 0.
    """
    vectors = [embedder_model.encode(text1), embedder_model.encode(text2)]
    cosine_dist = distance.cosine(vectors[0], vectors[1])
    return 1 / (cosine_dist + 0.01)
93
+
94
+
95
def f_create_email_lines_features(text, df_ner=None, position_offset=0):
    """Build the per-line feature table of an e-mail.

    Args:
        text: text of the e-mail
        df_ner: dataframe of entities already found in ``text``; computed
            with ``nlp.f_ner`` when not provided
        position_offset: forwarded to ``nlp.f_split_text_by_lines``

    Returns:
        DataFrame with one row per line and the columns listed in
        ``list_name_columns_features``.
    """
    list_lines = nlp.f_split_text_by_lines(text, position_offset)
    if df_ner is None:
        df_ner = nlp.f_ner(text)

    rows = [f_create_line_features(list_lines, idx, df_ner)
            for idx in range(len(list_lines))]
    return pd.DataFrame(rows, columns=list_name_columns_features)
107
+
108
+
109
+
110
def f_create_line_features(list_lines, line_number, df_ner):
    """Build the feature vector of one line of an e-mail.

    Args:
        list_lines: lines of the e-mail; each item is read as
            ``(start position, end position, text)``
        line_number: index in ``list_lines`` of the line to describe
        df_ner: dataframe of the entities found in the e-mail

    Returns:
        List of values ordered like ``list_name_columns_features``.
    """
    current_line = list_lines[line_number]
    line_start, line_end, line_text = current_line[0], current_line[1], current_line[2]
    logging.debug(f"Creating line features for {current_line}")
    df_ner_line = f_retrieve_entities_for_line(df_ner=df_ner, start=line_start, end=line_end)

    features_vector = [line_number, line_text, line_start, line_end]

    # Number of entities of each type on the line (0 when no entity table is available)
    for entity in ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "SIGNATURE"]:
        if df_ner_line is None:
            features_vector.append(0)
        else:
            features_vector.append(len(df_ner_line.query(f"entity=='{entity}'")))

    # Word count
    features_vector.append(len(line_text.split()))

    # Inverse embedding distance between the lower-cased line and each of the
    # typical closing / sign-off expressions
    lowered_text = line_text.lower()
    for anchor in ["merci", "cordialement", "regards", "sincerely", "sent from"]:
        features_vector.append(f_create_embedding_inv_dist_feature(anchor, lowered_text))

    # Does the line start with "ps:"?
    features_vector.append(regex.match(r"\s*ps *:", line_text, flags=regex.IGNORECASE) is not None)

    # Relative position of the line in the e-mail, in ]0, 1]
    features_vector.append((line_number + 1) / len(list_lines))

    # Number of characters that are not a letter, digit, space, dot, comma or newline
    features_vector.append(len(regex.findall(r"[^\p{L}0-9 .,\n]", line_text)))

    # Number of characters between the end of the previous line and this line
    if line_number == 0:
        gap_with_prev_line = 0
    else:
        gap_with_prev_line = line_start - list_lines[line_number - 1][1]
    features_vector.append(gap_with_prev_line)
    return features_vector
151
+
152
+
153
def generate_x_y(df, minmax_scaler=None, standard_scaler=None, n_last_lines_to_keep=30,
                 list_columns=list_columns_used_in_model):
    """Scale the features of ``df`` and return the model input and target arrays.

    Args:
        df: feature table of one e-mail; must also hold an ``is_signature`` column
        minmax_scaler: fitted min-max scaler, or None to fit a new one
        standard_scaler: fitted standard scaler, or None to fit a new one
        n_last_lines_to_keep: only the last ``n`` lines of the e-mail are kept
        list_columns: feature columns to put in ``x``

    Returns:
        ``(x, y, minmax_scaler, standard_scaler)`` where ``x`` and ``y`` get a
        leading axis of length 1.
    """
    df, minmax_scaler, standard_scaler = f_scale_parameters(df, minmax_scaler, standard_scaler)
    last_lines = slice(-n_last_lines_to_keep, None)
    features = df[list_columns].to_numpy()[last_lines, :]
    target = df["is_signature"].to_numpy()[last_lines]
    x = features[np.newaxis, ...]
    y = target[np.newaxis, ...]
    return x, y, minmax_scaler, standard_scaler
161
+
162
+
163
def f_scale_parameters(df_tagged_data, minmax_scaler=None, standard_scaler=None):
    """Scale the feature columns of ``df_tagged_data`` in place.

    The columns of ``columns_to_scale_minmax`` go through the min-max scaler
    and those of ``columns_to_scale_standard`` through the standard scaler.
    A scaler passed as None is created and fitted on the data; otherwise the
    given, already fitted scaler is only applied.

    Returns:
        ``(df_tagged_data, minmax_scaler, standard_scaler)``; the dataframe is
        the input object itself, modified.
    """
    if minmax_scaler is not None:
        logging.debug("using already fitted minmax scaler")
        minmax_values = minmax_scaler.transform(df_tagged_data[columns_to_scale_minmax])
    else:
        logging.debug("fitting new min max scaller")
        minmax_scaler = MinMaxScaler()
        minmax_values = minmax_scaler.fit_transform(df_tagged_data[columns_to_scale_minmax])
    df_tagged_data.loc[:, columns_to_scale_minmax] = minmax_values

    if standard_scaler is not None:
        logging.debug("using already fitted scaler")
        standard_values = standard_scaler.transform(df_tagged_data[columns_to_scale_standard])
    else:
        logging.debug("fitting new standard scaler")
        standard_scaler = StandardScaler()
        standard_values = standard_scaler.fit_transform(df_tagged_data[columns_to_scale_standard])
    df_tagged_data.loc[:, columns_to_scale_standard] = standard_values

    return df_tagged_data, minmax_scaler, standard_scaler
email_parser/config.ini ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [DEFAULT]
2
+ ner_model_fr = Jean-Baptiste/camembert-ner-with-dates
3
+ ner_model_en = Jean-Baptiste/roberta-large-ner-english
4
+ device = -1
5
+ default_lang = en
6
+ name_model_signature = model_signature_lstm_v10
7
+ path_models = models
email_parser/doc_email.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import regex
2
+ import pandas as pd
3
+
4
+ from . import nlp
5
+
6
+
7
+
8
class Email:
    """An e-mail, or thread of e-mails, split into its individual messages."""

    def __init__(self,
                 raw_text):
        """ Constructor for email
        :param raw_text: raw text of email
        """
        self.raw_text = raw_text
        # One dict per e-mail found in the thread, see f_split_email_thread
        self.list_emails = self.f_split_email_thread()

    def f_split_email_thread(self):
        """ Function to split a thread of email into a list of individual email.

        Two main formats of header are recognized:

        1) Multi-lines header similar to
            De : sads Cadsfdsf [mailto:sdadsad@google.ca]
            Envoyé : 30 mars 2015 08:33
            À : asdsad, sadsadasd (CA - asdasd)
            Objet : Re: TR: sadasdasdsad sa dsa
        2) Le 2015-03-30 à 08:25, Luc, Archambault (CA - Drummondville) <larchambault@google.ca> a écrit :

        Returns:
            list of dict, one per email, in order of appearance. Each dict
            holds the keys: body, header, start (position of the body in the
            raw text), start_header, lang and part (1-based rank of the email).
            An email without any recognized header has an empty header.

        """

        pattern = r"(((\n{1}\t*|\n(-{4,}.*-{4,}\s*)|^)(([> *]*(de|from|Exp.diteur|Subject)[\s]*:).*(\n[^A-Z].*)?[\r\n\t\s,]{1,}){1,})(([> *\t]*[\p{L}\p{M}' -]*[\s]*:).*((\n[ ]{3,7}?.*|(\n<.*))*)[\r\n\t\s,]{1,3}?){2,}" \
                  r"|(\s*((((de|from|Exp.diteur|Subject)[\s]*:).{0,200}?[\r\n\t\s,]{1,}){1})(?!de)(((envoy.|.|to|date).?[\s]*:).*?){1,}(((objet|subject)[\s]*:).*?[!?.><,]){1})" \
                  r"|((?<=\n)(([ >\t]*)(le|on|el).{0,30}\d{4,}.{0,100}\n*.{0,100}(wrote|.crit|escribió)\s*:))" \
                  r"|(\b(le|on)\s*((\d{2,4}[- ]){3}|(\d{1,2}.{1,8}\d{4}))[^\n]*?(wrote|.crit)\s*:)" \
                  r"|$)"

        list_email = []
        start_of_current_header = 0
        end_of_current_header = 0
        part_email = 1

        # Each match is the header of the NEXT email; the "|$" alternative
        # matches at the end of the text, which flushes the last email.
        for result in regex.finditer(pattern, self.raw_text, flags=regex.IGNORECASE):
            start_of_next_header = result.start()

            # The slice is empty when no header precedes this body (first
            # email of the text, or header not recognized by the pattern).
            header = self.raw_text[start_of_current_header:end_of_current_header]
            body = self.raw_text[end_of_current_header:start_of_next_header]

            if body.strip() != "" or header != "":
                list_email.append({"body": body,
                                   "header": header,
                                   "start": end_of_current_header,
                                   "start_header": start_of_current_header,
                                   # Language is detected on the first 150 characters only
                                   "lang": nlp.f_detect_language(body[:150]),
                                   "part": part_email
                                   })
                part_email += 1
            start_of_current_header = result.start()
            end_of_current_header = result.end()

        return list_email

    @staticmethod
    def f_find_person_in_header(header, df_result=None):
        """ Add the persons found in the to/cc/from lines of a header to df_result

        Args:
            header: text of the email header
            df_result: entities already found in the header; a new empty
                DataFrame is used when omitted

        Returns:
            result of nlp.f_concat_results applied to df_result and the new
            rows [entity, value, start, end, score], where entity is "PER",
            score is 1 and positions are relative to the start of the header
        """
        if df_result is None:
            df_result = pd.DataFrame()
        pattern_person = r"(?<=\s|'|^)[\p{L}\p{M}\s,-]{2,}(?=[\s;']|$)"
        results = []
        dict_header = Email.f_split_email_headers(header)
        for key in ["to", "cc", "from"]:
            if key not in dict_header:
                continue
            line_header = dict_header[key][0]
            start_posit = dict_header[key][1]
            for match in regex.finditer(pattern_person, line_header, flags=regex.IGNORECASE):
                value = match.group()
                if value.strip() != "":
                    results.append(["PER",
                                    value,
                                    start_posit + match.start(),
                                    start_posit + match.end(),
                                    1
                                    ])
        df_result = nlp.f_concat_results(df_result, results)
        return df_result

    @staticmethod
    def f_split_email_headers(header):
        """ Split a header into its from/to/date/... fields, in a dictionary

        Args:
            header: text of the email header

        Returns:
            dict mapping a field name to [value, start of value, end of value],
            positions being relative to the header. Field names are lower-cased
            and French names are mapped to their English equivalent when known
            (e.g. "de" -> "from"); other names are kept as found.

        """
        import unicodedata

        # Keys must be lower case since the matched keyword is lower-cased
        # before the lookup.
        matching_header_keywords = {"à": "to",
                                    "destinataire": "to",
                                    "de": "from",
                                    "envoyé": "date",
                                    "sent": "date",
                                    "objet": "subject"}
        dict_results = {}
        pattern = r"((?<=\s|^)(à|À|a\p{M}|Cc|To|De|From|Envoy.|Date|Sent|Objet|Subject|Destinataire)\s?:)[ ]*((.*?)[ ]*((\n[ ]{3,7}?.*)*))(?=[\p{L}\p{M}]*\s{1,}:| > |\n|$)"
        list_results = regex.finditer(pattern, header, flags=regex.IGNORECASE)
        for match in list_results:
            # NFC turns "a" + combining accent into the single character used
            # in the mapping keys.
            key_word = unicodedata.normalize("NFC", match.group(2).strip().lower())
            dict_results[matching_header_keywords.get(key_word, key_word)] = [match.group(3),
                                                                              match.span(3)[0],
                                                                              match.span(3)[1]]
        return dict_results
email_parser/models/model_signature_lstm_v10/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df1ebcda9b9f2ca0855f67117d5c8b7db0d89c46c346273a536f2eec13c5665
3
+ size 22060
email_parser/models/model_signature_lstm_v10/minmax_scaler.p ADDED
Binary file (1.16 kB). View file
 
email_parser/models/model_signature_lstm_v10/saved_model.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a28bac82659a6bc1cf949dc04d01a09db681cab64c9388ff1267d53fa3d11fb2
3
+ size 5272723
email_parser/models/model_signature_lstm_v10/standard_scaler.p ADDED
Binary file (584 Bytes). View file
 
email_parser/models/model_signature_lstm_v10/variables/variables.data-00000-of-00001 ADDED
Binary file (116 kB). View file
 
email_parser/models/model_signature_lstm_v10/variables/variables.index ADDED
Binary file (3.48 kB). View file
 
email_parser/nlp.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import regex
4
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ from . import utils, _models_signatures
9
+ from .utils import timing
10
+ from langid.langid import LanguageIdentifier
11
+ from langid.langid import model as model_langid
12
+
13
# Language identifier used by f_detect_language, restricted to English and French.
language_identifier = LanguageIdentifier.from_modelstring(model_langid, norm_probs=True)
language_identifier.set_languages(['en', 'fr'])

# config.ini is read from the directory that contains this module.
_package_dir = os.path.dirname(__file__)
logging.info(f"Reading config file from folder:{_package_dir}")
config = utils.f_read_config(os.path.join(_package_dir, 'config.ini'))

_default_section = config["DEFAULT"]
device = int(_default_section["device"])  # forwarded to transformers.pipeline(device=...)
default_lang = _default_section["default_lang"]

# Per-model caches filled by f_load_tokenizer_and_model_for_nlp, keyed by model name.
tokenizer_dict = {}
models_dict = {}
nlp_dict = {}
28
+
29
+
30
# Regex patterns (syntax of the third-party `regex` module) run by f_ner_regex,
# keyed by the entity label that is reported for each match.
dict_regex_pattern = dict(
    EMAIL=r'[\p{L}\p{M}\-\d._]{1,}@[\p{L}\p{M}\d\-_]{1,}(\.[\p{L}\p{M}]{1,}){1,}',
    TEL=r'(?<!\d)(\+?\d{1,2}[ -]?)?\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}(?!\d|\p{P}\d)',
    # [A-Za-z] rather than [A-z]: the latter also matches [ \ ] ^ _ and `.
    POST=r'\b([A-Za-z][0-9][A-Za-z][ -]?[0-9][A-Za-z][0-9]|[A-Za-z][0-9][A-Za-z])\b',
    # Currency marker before the amount, or amount followed by a currency/unit marker.
    PRICE=r"(([\s:,]|^){1}\$*(CA|CAD|USD|EUR|GBP|\$|\€|\£|\¢){1}\$*[\d., ]*[\d]{1,}\b)"
          r"|([\d]{1,}[\d., ]*(CA|CAD|USD|EUR|GBP|\$|\€|\£|k|m|\¢){1,}\$*(?=\s|\p{P}|$))",
    # "www." hosts, http(s) URLs, or bare hosts ending in com/ca/org/fr.
    WEB=r"((www(\.[\p{L}\p{M}\-0-9]{1,}){2,})"
        r"|(https?:[^ ]*)"
        r"|((?<=[\s:]|^)([\p{L}\p{M}\-0-9]{1,}\.){1,}(com|ca|org|fr){1,}\b))")
40
+
41
def f_load_tokenizer_and_model_for_nlp(model_name, pipeline_type='ner'):
    """Load the tokenizer, model and pipeline for ``model_name`` once and cache them.

    Loading a model and its tokenizer takes a long time, so the objects are
    stored in the module-level dicts ``tokenizer_dict``, ``models_dict`` and
    ``nlp_dict`` (keyed by model name) and reused on later calls.

    Args:
        model_name: name of the model that should be loaded and stored
        pipeline_type: type of pipeline that should be initialized; only
            ``'ner'`` is supported

    Returns:
        None. Results are available through the module-level dicts.

    Raises:
        ValueError: if ``pipeline_type`` is not supported.
    """
    if pipeline_type != "ner":
        # No model class is defined for other pipeline types; fail with a clear
        # message instead of an AttributeError on None further down.
        raise ValueError(f"Unsupported pipeline type: {pipeline_type}")

    already_loaded = (model_name in tokenizer_dict
                      and model_name in models_dict
                      and model_name in nlp_dict)
    if already_loaded:
        return

    logging.info(
        f"Loading tokenizer and model: {model_name}")
    tokenizer_dict[model_name] = AutoTokenizer.from_pretrained(model_name)
    models_dict[model_name] = AutoModelForTokenClassification.from_pretrained(model_name)
    nlp_dict[model_name] = pipeline(pipeline_type,
                                    model=models_dict[model_name],
                                    tokenizer=tokenizer_dict[model_name],
                                    aggregation_strategy="simple",
                                    device=device)
66
+
67
+
68
def f_ner(text, lang=default_lang):
    """Detect entities in ``text`` with the regex patterns first, then the NER model.

    The regex results are handed to f_ner_model as ``df_result``, so they are
    the existing entities that the model results get merged into.

    Args:
        text: text to analyse
        lang: language used to select the NER model

    Returns:
        Dataframe of entities as produced by f_ner_model.
    """
    regex_entities = f_ner_regex(text)
    return f_ner_model(text, lang=lang, df_result=regex_entities)
72
+
73
+
74
+ @timing
75
def f_ner_model(text, lang=default_lang, df_result=pd.DataFrame()):
    """Run the NER model on every chunk of ``text`` and merge the entities found.

    Args:
        text: text to analyse
        lang: language used to select the NER model
        df_result: existing entities that the new ones are merged into

    Returns:
        Result of f_concat_results(df_result, <entities found by the model>).
    """
    entities = []
    # The text is split into chunks and the model is run on each one; the chunk
    # start is passed along so that positions refer to the full text.
    for chunk_start, _chunk_end, chunk in f_split_text_by_lines(text):
        if chunk == "":
            continue
        entities.extend(f_ner_model_by_sentence(chunk, lang=lang, pos_offset=chunk_start))
    return f_concat_results(df_result, entities)
85
+
86
+
87
+ @timing
88
def f_ner_model_by_sentence(sentence, lang=default_lang, df_result=pd.DataFrame(), pos_offset=0):
    """Run the NER model configured for ``lang`` on a single sentence.

    Args:
        sentence: sentence on which to run the model
        lang: language; the model name is read from config option
            ``ner_model_<lang>`` of the DEFAULT section
        df_result: not used by this function (kept for interface compatibility)
        pos_offset: value added to the start/end positions of every entity

    Returns:
        List of ``[entity_group, word, start, end, score]`` for entities of
        type PER, LOC, ORG or DATE.

    Raises:
        ValueError: if no model is configured for ``lang``.
    """
    if not config.has_option('DEFAULT', 'ner_model_' + lang):
        raise ValueError(f"No model was defined for ner in {lang}")

    model_name = config['DEFAULT']['ner_model_' + lang]
    f_load_tokenizer_and_model_for_nlp(model_name)
    logging.debug(f"starting {model_name} on sentence:'{sentence}'")

    kept_entity_groups = ("PER", "LOC", "ORG", "DATE")
    list_result = []
    for result in nlp_dict[model_name](sentence):
        if result["word"] == "" or result["entity_group"] not in kept_entity_groups:
            continue

        # Required because sometimes spaces are included in result["word"] value,
        # but not in start/end position
        value = sentence[result["start"]:result["end"]]

        # We remove any special character at the beginning
        result_regex = regex.search(r"[^.,'’` \":()\n].*", value)
        if result_regex is None:
            continue

        word = result_regex.group()
        real_word_start = result["start"] + result_regex.start()
        real_word_end = real_word_start + len(word)

        # We check if entity might be inside a longer word, if this is the case we ignore.
        # [A-Za-z] rather than [A-z]: the latter also matches [ \ ] ^ _ and `,
        # which wrongly discarded entities such as "[Jean]".
        # NOTE(review): accented letters are not treated as word characters here — confirm.
        letter_before = sentence[max(0, real_word_start - 1): real_word_start]
        letter_after = sentence[real_word_end: real_word_end + 1]
        if regex.match(r"[A-Za-z]", letter_before) or regex.match(r"[A-Za-z]", letter_after):
            logging.debug(f"Ignoring entity {value} because letter before is"
                          f" '{letter_before}' or letter after is '{letter_after}'")
            continue

        list_result.append(
            [result["entity_group"],
             word,
             real_word_start + pos_offset,
             real_word_end + pos_offset,
             result["score"]])

    return list_result
142
+
143
+
144
+ @timing
145
def f_concat_results(df_result, list_result_new):
    """Merge a list of new entities into an existing dataframe of entities.

    Args:
        df_result: dataframe of entities (columns entity, value, start, end,
            score); may be None or empty
        list_result_new: list of ``[entity, value, start, end, score]`` rows to
            be added to df_result; may be None or empty

    Returns:
        Dataframe with all entities, sorted by ``start`` when rows were merged.
        Entities in list_result_new whose position overlaps an entity already
        in df_result are ignored. If there is nothing new, df_result is
        returned unchanged (or an empty dataframe if it is None/empty too).
    """
    list_columns_names = ["entity", "value", "start", "end", "score"]
    has_existing = df_result is not None and len(df_result) > 0
    has_new = list_result_new is not None and len(list_result_new) > 0

    if not has_new:
        return df_result if has_existing else pd.DataFrame()
    if not has_existing:
        return pd.DataFrame(list_result_new, columns=list_columns_names)

    starts = df_result["start"]
    ends = df_result["end"]
    # A new row [.., start, end, ..] overlaps an existing entity when
    # new_end >= existing_start and new_start <= existing_end.
    list_row = [row for row in list_result_new
                if not ((starts <= row[3]) & (ends >= row[2])).any()]

    if not list_row:
        # Nothing survived the overlap filter: skip the concat with an empty frame.
        return df_result.reset_index(drop=True).sort_values(by="start")

    df_new = pd.DataFrame(list_row, columns=list_columns_names)
    return pd.concat([df_result, df_new], ignore_index=True).sort_values(by="start")
179
+
180
+
181
+ @timing
182
def f_detect_language(text, default=default_lang):
    """Detect the language of ``text``.

    Args:
        text: text on which language should be detected
        default: value returned if the text is blank, if an error occurs or if
            the score of the predicted language is too low (default nlp.default_lang)

    Returns:
        Language code predicted by ``language_identifier`` ("fr" or "en"), or ``default``.
    """
    lang = default
    try:
        stripped = text.strip()
        if stripped != "":
            lang, score = language_identifier.classify(stripped.replace("\n", " ").lower())
            # If score is not high enough we take the caller's default instead
            if score < 0.8:
                lang = default
    except Exception as e:
        # Best effort: detection problems are logged and the current value of
        # `lang` is returned. No `return` inside `finally`, so KeyboardInterrupt
        # and SystemExit still propagate.
        logging.error("following error occurs when trying to detect language: {}".format(e))
    return lang
204
+
205
+ @timing
206
def f_find_regex_pattern(text, type_, pattern):
    """Collect every case-insensitive match of ``pattern`` in ``text``.

    Args:
        text: the text to be analyzed
        type_: the entity type (copied into each result)
        pattern: regex pattern to be found

    Returns:
        A list with one ``[type_, value, start, end, 1]`` entry per match.
        ``value`` is the matched text with newlines replaced by spaces and
        surrounding whitespace stripped; ``start``/``end`` are the positions of
        the raw match in ``text``; the trailing 1 is the score.
    """
    return [
        [type_, found.group().replace("\n", " ").strip(), found.start(), found.end(), 1]
        for found in regex.finditer(pattern, text, flags=regex.IGNORECASE)
    ]
227
+
228
+
229
+ @timing
230
def f_ner_regex(text, dict_pattern=dict_regex_pattern,
                df_result=pd.DataFrame()):
    """Run each regex of ``dict_pattern`` on the full text and merge the matches.

    Args:
        text: the text to be analyzed
        dict_pattern: mapping of entity type to regex pattern, run one after the
            other (default nlp.dict_regex_pattern)
        df_result: existing entities the matches are merged into. A match that
            overlaps an entity already in df_result is dropped.

    Returns:
        Dataframe containing the matches merged with df_result (if any)
    """
    logging.debug("Starting regex")
    matches = []
    for entity_type, entity_pattern in dict_pattern.items():
        matches.extend(f_find_regex_pattern(text, entity_type, entity_pattern))
    return f_concat_results(df_result, matches)
254
+
255
+ @timing
256
def f_split_text_by_lines(text, position_offset=0):
    """Split ``text`` into chunks and report where each chunk is located.

    A chunk never starts with ">" or a newline. It ends either after one or
    more ``!``, ``?``, ``.`` or ``>`` each followed by a space, or at the end of
    the line. Chunks of at most one character once stripped are dropped.

    :param text: text that should be split
    :param position_offset: value added to every start/end position
    :return: list containing for each chunk: [position start, position end, chunk]
    """
    chunks = []
    for found in regex.finditer("[^>\n]((.*?([!?.>] ){1,})|.*(?=\n|$))", text):
        chunk = found.group()
        if len(chunk.strip()) <= 1:
            continue
        chunks.append([found.start() + position_offset,
                       found.end() + position_offset,
                       chunk])
    return chunks
271
+
272
+
273
def f_detect_email_signature(text, df_ner=None, cut_off_score=0.6, lang=default_lang):
    """Locate the signature at the end of an email with the signature model.

    Args:
        text: email text
        df_ner: entities already detected in ``text``; computed with f_ner when None
        cut_off_score: a line is predicted as signature when its score is above this value
        lang: language passed to f_ner when df_ner is not provided

    Returns:
        Dataframe with a single "SIGNATURE" entity (value, start, end, mean score
        of the lines predicted as signature), or None when the text is blank,
        no features could be built or no signature text was found.
    """
    if text.strip() == "":
        return None
    if df_ner is None:
        df_ner = f_ner(text, lang=lang)

    df_features = _models_signatures.f_create_email_lines_features(text, df_ner=df_ner)
    if len(df_features) == 0:
        return None
    nb_lines = len(df_features)

    # We add dummy value for signature in order to use same function than for training of the model
    df_features["is_signature"] = -2

    x, _y_out, _, _ = _models_signatures.generate_x_y(df_features,
                                                      _models_signatures.minmax_scaler,
                                                      _models_signatures.standard_scaler)
    y_predict = _models_signatures.model.predict(x)

    # Align predictions with the feature rows: left-pad when there are fewer
    # predictions than rows, keep the last ones when there are more.
    df_features["prediction"] = _f_left_pad_to_length(
        (y_predict > cut_off_score).reshape([-1]), nb_lines, 0)
    df_features["score"] = _f_left_pad_to_length(
        y_predict.reshape([-1]), nb_lines, 1)

    body_line_ends = df_features.query("prediction==0")['end']
    # If nothing is predicted as body, everything was detected as a signature
    body_end_pos = max(body_line_ends) if len(body_line_ends) > 0 else 0

    score = df_features.query("prediction==1")["score"].mean()
    signature_text = text[body_end_pos:].strip().replace("\n", " ")
    if signature_text == "":
        return None

    return f_concat_results(pd.DataFrame(),
                            [["SIGNATURE", signature_text, body_end_pos, len(text), score]])


def _f_left_pad_to_length(values, length, fill):
    """Return the last ``length`` items of ``values``, left-padded with ``fill`` if too short."""
    # np.pad rejects negative widths, so clamp at 0 when values is already long enough.
    missing = max(length - len(values), 0)
    return np.pad(values, (missing, 0), constant_values=fill)[-length:]
321
+
322
+
email_parser/utils.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import wraps
2
+ import logging
3
+ import os
4
+ from time import time
5
+ import configparser
6
+
7
# Cumulated elapsed time (seconds) per function name, updated by the `timing` decorator.
timer_functions = {}

# Loading configuration from the config file located next to this module.
# Read as UTF-8, consistently with f_read_config.
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'), encoding='utf-8')
12
+
13
+
14
def timing(f):
    """Decorator measuring the wall-clock time spent in ``f``.

    Each call adds its duration to ``timer_functions[f.__name__]`` and logs it
    at DEBUG level. Nothing is recorded when ``f`` raises.
    """
    @wraps(f)
    def wrap(*args, **kw):
        started_at = time()
        result = f(*args, **kw)
        elapsed = time() - started_at
        timer_functions[f.__name__] = timer_functions.get(f.__name__, 0) + elapsed
        logging.debug('func:%r took: %2.4f sec' % (f.__name__, elapsed))
        return result
    return wrap
29
+
30
+
31
def f_read_config(path=None):
    """ read config file from specified file path

    A warning is logged when the file cannot be read; the returned parser is
    then empty.

    :param path: file path (default: ``config.ini`` next to this module)
    :return: configparser object
    """
    # Named `parser` to avoid shadowing the module-level `config`.
    parser = configparser.ConfigParser()
    if path is None:
        path = os.path.join(os.path.dirname(__file__), 'config.ini')
    # ConfigParser.read silently skips unreadable files and returns the list of
    # files it managed to parse.
    if not parser.read(path, encoding='utf-8'):
        logging.warning("Config file could not be read: %s", path)
    return parser
43
+
44
def f_setup_logger(level_sysout=logging.INFO, level_file=logging.DEBUG, folder_path="logs"):
    """Setup logger

    By default we display only INFO in console, and write everything in file.
    All handlers already attached to the root logger are removed first.

    Args:
        level_sysout: Level that is displayed in console (default INFO)
        level_file: Level that is written in file (default DEBUG)
        folder_path: Folder in which the log file is created; created (with
            any missing parent folder) if it does not exist (default "logs")

    Returns:
        Nothing

    """
    # makedirs handles nested paths and an already existing folder.
    os.makedirs(folder_path, exist_ok=True)

    # basicConfig is a no-op when the root logger already has handlers.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    file_handler = logging.FileHandler(filename=os.path.join(folder_path, "amf_uce_nlp_{}.log".format(time())),
                                       encoding='utf-8')
    sysout_handler = logging.StreamHandler()
    file_handler.setLevel(level_file)
    sysout_handler.setLevel(level_sysout)
    logging.basicConfig(handlers=[file_handler, sysout_handler], level=logging.DEBUG,
                        format='%(asctime)s (%(levelname)s) %(message)s', datefmt='%m/%d/%y %I:%M:%S %p')
70
+
71
+
72
def get_model_full_path(model_name):
    """Return the path of ``model_name`` inside the configured models folder.

    The folder is the ``path_models`` option of the DEFAULT config section,
    joined onto the directory that contains this module.
    """
    package_dir = os.path.dirname(__file__)
    return os.path.join(package_dir, config["DEFAULT"]["path_models"], model_name)
setup.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import find_packages, setup
2
+ from glob import glob
3
+ import os
4
+
5
+
6
setup(name='email_parser',
      packages=find_packages(include=['email_parser']),
      # config.ini is read from the package directory at import time, and the
      # signature model files live under email_parser/models/; without this they
      # are left out of built distributions.
      # NOTE(review): model folder layout taken from the committed files — confirm
      # it matches the `path_models` option of config.ini.
      package_data={'email_parser': ['config.ini',
                                     'models/*/*',
                                     'models/*/variables/*']},
      version='0.0.1',
      description='Email parser',
      author='JB Polle',
      license='MIT',
      install_requires=['langid==1.1.6',
                        'numpy>=1.19.5',
                        'pandas>=1.2.3',
                        'regex',
                        'scikit-learn==0.24.1',
                        'sentence-transformers==1.0.4',
                        'tensorflow==2.6.0',
                        'tensorflow-hub>=0.12.0',
                        'tensorflow-text==2.6.0',
                        'tokenizers==0.10.1',
                        'torch>=1.8.0',
                        'umap-learn==0.5.1',
                        'dateparser==1.0.0',
                        'transformers>=4.3',
                        'gradio>=2.7'])