Projeto committed on
Commit
1f38760
1 Parent(s): c5cb5c8

Create clean_functions.py

Browse files
Files changed (1) hide show
  1. legalnlp/clean_functions.py +94 -0
legalnlp/clean_functions.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import re
3
+ import ftfy
4
+ from legalnlp.mask_functions import *
5
+
6
+
7
def clean_bert(text):
    """
    Repair broken Unicode in a text and flatten it onto a single line.

    Parameters
    -----------
    text: str
        A piece of text

    Returns
    -----------
    str
        The text after ``ftfy.fix_text``, with every newline turned into a
        space and each run of spaces collapsed into a single space
    """
    repaired = ftfy.fix_text(text)
    single_line = repaired.replace("\n", " ")
    # Collapse the runs of spaces left behind by the newline replacement.
    return re.sub(' +', ' ', single_line)
24
+
25
+
26
def clean(text, lower=True, return_masked=False):
    """
    Clean a text by normalising general patterns (URLs, e-mails, acronyms,
    symbols, optional plural/gender markers) and masking the entities
    recognised by the ``mask_*`` helpers.

    Parameters
    -----------
    text: str
        A piece of text
    lower: bool
        Whether to lowercase text (Default: True)
    return_masked: bool
        If False, the function outputs only the clean text. Otherwise, it
        returns a dictionary containing the clean text and the information
        extracted by each masking helper (Default: False)

    Returns
    -----------
    dict or str
        The clean text, or (when ``return_masked`` is True) a dictionary
        with the keys 'txt', 'url', 'email', 'oab', 'data', 'processo',
        'valor' and 'numero'. 'txt' holds the clean text; the other keys
        hold whatever the corresponding ``mask_*`` helper returned as its
        second value.
    """

    dic = {}

    # General cleaning. 'txt' is inserted first so the key order of the
    # returned dictionary stays txt, url, email, ...
    dic['txt'], dic['url'] = mask_url(text)  # Mask URLs
    dic['txt'], dic['email'] = mask_email(dic['txt'])  # Mask e-mails
    txt = dic['txt']

    # Acronyms (e.g., C.P.F => CPF)
    txt = re.sub(r"([A-Z])\.", r"\1", txt)
    if lower:
        txt = txt.lower()
    # s.a or s/a => sa (case-insensitive, so it also applies when
    # lower is False)
    txt = re.sub(r"s[\/\.]a", " sa ", txt, flags=re.I)
    # Normalise free-standing dashes to a doubled form; presumably so they
    # are not glued back together by the hyphen-joining step below.
    txt = txt.replace(" - - ", " - ")
    txt = txt.replace(" - ", " - - ")
    # Put a space on each side of every non-word character
    txt = re.sub(r"(\W)", r" \1 ", txt)
    txt = txt.replace("\n", " ")
    txt = txt.replace("\t", " ")

    # Optional plural and gender markers, re-joined after the spacing step
    txt = txt.replace("( s )", "(s)")
    txt = txt.replace("( a )", "(a)")
    txt = txt.replace("( as )", "(as)")
    txt = txt.replace("( o )", "(o)")
    txt = txt.replace("( os )", "(os)")

    # Joining some strings back together
    # Digits separated by '-' or '.' are merged (separator dropped)
    txt = re.sub(r"(?<=\d) [-\.] (?=\d)", '', txt)
    # Digits separated by ',' keep the comma, without the spaces
    txt = re.sub(r"(?<=\d) , (?=\d)", ',', txt)
    txt = txt.replace("[ email ]", "[email]")
    txt = txt.replace("[ url ]", "[url]")
    # (e.g., arquivem - se => arquivem-se)
    txt = re.sub(r"(\w) - (\w)", r"\1-\2", txt)
    txt = re.sub(' +', ' ', txt)

    # Masking
    txt, dic['oab'] = mask_oab(txt)
    txt, dic['data'] = mask_data(txt)
    txt, dic['processo'] = mask_processo(txt)
    # Decimal places are assumed to be marked by a comma
    txt, dic['valor'] = mask_valor(txt)
    txt, dic['numero'] = mask_numero(txt)

    # Extra spaces
    txt = re.sub(' +', ' ', txt)
    dic['txt'] = txt.strip()

    # Output
    if return_masked:
        return dic
    return dic['txt']