File size: 6,261 Bytes
79e12fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import regex
import pandas as pd

from . import nlp



class Email:
    """Email thread parsed into its individual messages.

    The constructor stores the raw text in ``raw_text`` and the result of
    ``f_split_email_thread()`` in ``list_emails``.
    """

    # Header keywords mapped to the canonical key stored by
    # ``f_split_email_headers``. Keys MUST be lower case: the matched keyword
    # is lower-cased before the lookup, so a capitalised key can never match.
    _HEADER_KEYWORD_ALIASES = {
        "à": "to",
        "a\u0300": "to",  # decomposed spelling: "a" + combining grave accent
        "destinataire": "to",
        "de": "from",
        "envoyé": "date",
        "sent": "date",
        "objet": "subject",
    }

    def __init__(self,
                 raw_text):
        """ Constructor for email
        :param raw_text: raw text of email
        """
        self.raw_text = raw_text
        self.list_emails = self.f_split_email_thread()

    def f_split_email_thread(self):
        """ Function to split a thread of email into a list of individual email.

        Two main formats of header are recognized:

        1) Multi-lines header similar to
                De : sads Cadsfdsf [mailto:sdadsad@google.ca]
                Envoyé : 30 mars 2015 08:33
                À : asdsad, sadsadasd (CA - asdasd)
                Objet : Re: TR: sadasdasdsad sa dsa
        2) Le 2015-03-30 à 08:25, Luc, Archambault (CA - Drummondville) <larchambault@google.ca> a écrit :

        The pattern also matches the end of the text, so the last email of the
        thread is always flushed. A text in which no header is recognized is
        returned as a single part with an empty header; an empty or
        whitespace-only text yields an empty list.

        Returns:
            list of dict, one per email, in order of appearance. Keys:
                body: text between the end of this email's header and the
                    start of the next header
                header: matched header text ("" when none was found)
                start: offset of the body in ``raw_text``
                start_header: offset of the header in ``raw_text``
                lang: result of ``nlp.f_detect_language`` on the first
                    150 characters of the body
                part: 1-based index of the email in the returned list

        """

        pattern = (
            r"(((\n{1}\t*|\n(-{4,}.*-{4,}\s*)|^)(([> *]*(de|from|Exp.diteur|Subject)[\s]*:).*(\n[^A-Z].*)?[\r\n\t\s,]{1,}){1,})(([> *\t]*[\p{L}\p{M}' -]*[\s]*:).*((\n[ ]{3,7}?.*|(\n<.*))*)[\r\n\t\s,]{1,3}?){2,}"
            r"|(\s*((((de|from|Exp.diteur|Subject)[\s]*:).{0,200}?[\r\n\t\s,]{1,}){1})(?!de)(((envoy.|.|to|date).?[\s]*:).*?){1,}(((objet|subject)[\s]*:).*?[!?.><,]){1})"
            r"|((?<=\n)(([ >\t]*)(le|on|el).{0,30}\d{4,}.{0,100}\n*.{0,100}(wrote|.crit|escribió)\s*:))"
            r"|(\b(le|on)\s*((\d{2,4}[- ]){3}|(\d{1,2}.{1,8}\d{4}))[^\n]*?(wrote|.crit)\s*:)"
            r"|$)"
        )

        list_email = []
        # Span of the header that opens the email currently being collected.
        # (0, 0) means "no header": the text before the first match.
        start_of_current_header = 0
        end_of_current_header = 0
        part_email = 1

        # regex.finditer always returns an iterator (never None), so no
        # separate "not a thread" fallback is needed.
        for result in regex.finditer(pattern, self.raw_text, flags=regex.IGNORECASE):
            start_of_next_header = result.start()

            # When no header was found the span is empty and this slice is "".
            header = self.raw_text[start_of_current_header:end_of_current_header]
            body = self.raw_text[end_of_current_header:start_of_next_header]

            # Drop parts that have neither a header nor any visible body text.
            if body.strip() or header:
                list_email.append({"body": body,
                                   "header": header,
                                   "start": end_of_current_header,
                                   "start_header": start_of_current_header,
                                   # Language is detected per email, on the
                                   # first 150 characters of the body only.
                                   "lang": nlp.f_detect_language(body[:150]),
                                   "part": part_email
                                   })
                part_email += 1

            start_of_current_header, end_of_current_header = result.span()

        return list_email

    @staticmethod
    def f_find_person_in_header(header, df_result=None):
        """ Find person names in the to/cc/from lines of an email header.

        Args:
            header: header text of a single email
            df_result: results to which the new rows are concatenated through
                ``nlp.f_concat_results``. Defaults to a new empty DataFrame
                created on each call.

        Returns:
            whatever ``nlp.f_concat_results`` returns for ``df_result`` and the
            rows found. Each row is ["PER", matched text, start, end, 1] where
            start/end are offsets within ``header``.
            NOTE(review): the trailing 1 is presumably a score — confirm
            against ``nlp.f_concat_results``.

        """
        # A DataFrame default in the signature would be built once and shared
        # by every call; build a fresh one per call instead.
        if df_result is None:
            df_result = pd.DataFrame()

        pattern_person = r"(?<=\s|'|^)[\p{L}\p{M}\s,-]{2,}(?=[\s;']|$)"
        results = []
        dict_header = Email.f_split_email_headers(header)
        for key in ("to", "cc", "from"):
            if key not in dict_header:
                continue
            line_header = dict_header[key][0]
            # Offset of the header value inside ``header``; match positions
            # below are relative to ``line_header`` and must be shifted by it.
            start_posit = dict_header[key][1]
            for match in regex.finditer(pattern_person, line_header, flags=regex.IGNORECASE):
                value = match.group()
                if value.strip():
                    results.append(["PER",
                                    value,
                                    start_posit + match.start(),
                                    start_posit + match.end(),
                                    1
                                    ])
        return nlp.f_concat_results(df_result, results)

    @staticmethod
    def f_split_email_headers(header):
        """ Split headers in from/to/date,... in a dictionary

        Recognized keywords are lower-cased and, when they have an alias in
        ``_HEADER_KEYWORD_ALIASES`` (e.g. "de" -> "from", "à" -> "to"),
        replaced by it; other keywords (e.g. "cc", "subject") are used as is.
        If a keyword occurs several times, the last occurrence wins.

        Args:
            header: header text of a single email

        Returns:
            dict mapping each keyword found to a list
            [value text, start offset of value in header, end offset of value in header]

        """
        dict_results = {}
        pattern = r"((?<=\s|^)(à|À|a\p{M}|Cc|To|De|From|Envoy.|Date|Sent|Objet|Subject|Destinataire)\s?:)[ ]*((.*?)[ ]*((\n[ ]{3,7}?.*)*))(?=[\p{L}\p{M}]*\s{1,}:| > |\n|$)"
        for match in regex.finditer(pattern, header, flags=regex.IGNORECASE):
            key_word = match.group(2).strip().lower()
            canonical_key = Email._HEADER_KEYWORD_ALIASES.get(key_word, key_word)
            value_start, value_end = match.span(3)
            dict_results[canonical_key] = [match.group(3), value_start, value_end]
        return dict_results