Spaces:
Running
Running
File size: 6,261 Bytes
79e12fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import regex
import pandas as pd
from . import nlp
class Email:
    """Raw email text (possibly a whole thread) split into individual messages.

    Attributes set by the constructor:
        raw_text: the text exactly as it was passed in.
        list_emails: the value returned by ``f_split_email_thread()``.
    """

    def __init__(self, raw_text):
        """Store the raw text and split it into individual messages.

        Args:
            raw_text: raw text of the email (or of a whole email thread).
        """
        self.raw_text = raw_text
        self.list_emails = self.f_split_email_thread()

    def f_split_email_thread(self):
        """Split a thread of emails into a list of individual emails.

        Two main header formats are recognized:
        1) Multi-line headers similar to
            De : sads Cadsfdsf [mailto:sdadsad@google.ca]
            Envoyé : 30 mars 2015 08:33
            À : asdsad, sadsadasd (CA - asdasd)
            Objet : Re: TR: sadasdasdsad sa dsa
        2) Single-line headers similar to
            Le 2015-03-30 à 08:25, Luc, Archambault (CA - Drummondville) <larchambault@google.ca> a écrit :

        Returns:
            list of dict, one per message, in order of appearance. Each dict
            has the keys:
                body: text between the end of this header and the next header.
                header: matched header text ("" when none precedes the body).
                start: offset of the body in ``raw_text``.
                start_header: offset of the header in ``raw_text``.
                lang: result of ``nlp.f_detect_language`` on the body.
                part: 1-based index of the message among the returned ones.
            Segments whose body is blank and whose header is empty are skipped.
        """
        pattern = (
            r"(((\n{1}\t*|\n(-{4,}.*-{4,}\s*)|^)(([> *]*(de|from|Exp.diteur|Subject)[\s]*:).*(\n[^A-Z].*)?[\r\n\t\s,]{1,}){1,})(([> *\t]*[\p{L}\p{M}' -]*[\s]*:).*((\n[ ]{3,7}?.*|(\n<.*))*)[\r\n\t\s,]{1,3}?){2,}"
            r"|(\s*((((de|from|Exp.diteur|Subject)[\s]*:).{0,200}?[\r\n\t\s,]{1,}){1})(?!de)(((envoy.|.|to|date).?[\s]*:).*?){1,}(((objet|subject)[\s]*:).*?[!?.><,]){1})"
            r"|((?<=\n)(([ >\t]*)(le|on|el).{0,30}\d{4,}.{0,100}\n*.{0,100}(wrote|.crit|escribió)\s*:))"
            r"|(\b(le|on)\s*((\d{2,4}[- ]){3}|(\d{1,2}.{1,8}\d{4}))[^\n]*?(wrote|.crit)\s*:)"
            r"|$)"
        )
        list_email = []
        start_of_current_header = 0
        end_of_current_header = 0
        part_email = 1
        # finditer always returns an iterator (never None), so no "not a
        # thread" fallback is needed: the trailing "|$" alternative matches at
        # the end of the text, which emits the last (or only) message.
        for result in regex.finditer(pattern, self.raw_text, flags=regex.IGNORECASE):
            # When no header precedes this body (text before the first header,
            # or after a zero-width match) both offsets are equal and the
            # slice is simply "".
            header = self.raw_text[start_of_current_header:end_of_current_header]
            body = self.raw_text[end_of_current_header:result.start()]
            if body.strip() != "" or header != "":
                list_email.append({
                    "body": body,
                    "header": header,
                    "start": end_of_current_header,
                    "start_header": start_of_current_header,
                    # Language is detected per message, on the first 150
                    # characters only, and only for messages that are kept.
                    "lang": nlp.f_detect_language(body[:150]),
                    "part": part_email,
                })
                part_email += 1
            # The header just matched belongs to the next message.
            start_of_current_header, end_of_current_header = result.span()
        return list_email

    @staticmethod
    def f_find_person_in_header(header, df_result=None):
        """Find person names in the to/cc/from fields of an email header.

        Args:
            header: header text of a single email.
            df_result: previous results passed to ``nlp.f_concat_results``
                together with the new rows. A new empty DataFrame is created
                on every call when omitted.

        Returns:
            The value returned by ``nlp.f_concat_results(df_result, rows)``.
            Each row is ``["PER", text, start, end, 1]`` where start/end are
            offsets in ``header``.
            NOTE(review): the trailing 1 is presumably a score - confirm
            against nlp.f_concat_results.
        """
        # A DataFrame default in the signature would be built once and shared
        # by every call; build it per call instead.
        if df_result is None:
            df_result = pd.DataFrame()
        pattern_person = r"(?<=\s|'|^)[\p{L}\p{M}\s,-]{2,}(?=[\s;']|$)"
        dict_header = Email.f_split_email_headers(header)
        results = []
        for key in ("to", "cc", "from"):
            if key not in dict_header:
                continue
            # Field entries are [value, start offset, end offset].
            line_header, start_posit = dict_header[key][:2]
            for match in regex.finditer(pattern_person, line_header, flags=regex.IGNORECASE):
                value = match.group()
                if value.strip() != "":
                    # Shift match offsets from the field value to the header.
                    results.append(["PER",
                                    value,
                                    start_posit + match.start(),
                                    start_posit + match.end(),
                                    1])
        return nlp.f_concat_results(df_result, results)

    @staticmethod
    def f_split_email_headers(header):
        """Split a header into its from/to/date/... fields.

        Args:
            header: header text of a single email.

        Returns:
            dict mapping a field name to ``[value, start, end]`` where
            start/end are the offsets of the value in ``header``. French and
            alternative keywords are mapped to "to", "from", "date" and
            "subject"; any other matched keyword is used lower-cased as is.
            When a field occurs several times the last occurrence wins.
        """
        import unicodedata

        # Keys must be lower-case: the matched keyword is lower-cased before
        # the lookup.
        matching_header_keywords = {"à": "to",
                                    "destinataire": "to",
                                    "de": "from",
                                    "envoyé": "date",
                                    "sent": "date",
                                    "objet": "subject"}
        dict_results = {}
        pattern = r"((?<=\s|^)(à|À|a\p{M}|Cc|To|De|From|Envoy.|Date|Sent|Objet|Subject|Destinataire)\s?:)[ ]*((.*?)[ ]*((\n[ ]{3,7}?.*)*))(?=[\p{L}\p{M}]*\s{1,}:| > |\n|$)"
        for match in regex.finditer(pattern, header, flags=regex.IGNORECASE):
            # NFC so that a decomposed "a" + combining accent (accepted by the
            # pattern through a\p{M}) hits the same entry as a composed "à".
            key_word = unicodedata.normalize("NFC", match.group(2).strip().lower())
            field = matching_header_keywords.get(key_word, key_word)
            dict_results[field] = [match.group(3), match.start(3), match.end(3)]
        return dict_results
|