dexay committed on
Commit
a985cef
·
1 Parent(s): a7c2516

Create new file

Browse files
Files changed (1) hide show
  1. postt.py +66 -0
postt.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def postcor(blist):
    """Clean up a list of rows in place and return it.

    Each row is read as ``row[0]`` and ``row[1]`` (strings) and ``row[2]``
    (compared against ``"other"``) -- presumably two entity names and a
    relation label; confirm against the caller.  Rows must be mutable
    (e.g. lists) because columns 0 and 1 are reassigned.

    Walking the rows in order:

    * A row is flagged for removal when column 0 or column 1 has at most
      one character or contains no ASCII letter, or when
      ``row[2] == "other"``.  Every row is checked, including the last.
    * Each unflagged row is compared with every later row.  For column 0
      and column 1 independently, spellings that differ only by case,
      ``-``, ``'``, space or ``_`` are unified to the shorter spelling,
      and a spelling that is exactly the other one plus a trailing ``"s"``
      is replaced by the form without the ``"s"``.
    * A flagged row is never used as the left-hand row of a comparison,
      but it can still be the later partner of an earlier row.

    Args:
        blist: list of mutable rows; modified in place.

    Returns:
        The same list object as *blist*, with flagged rows removed.
    """
    crlist = blist
    flagged = []
    for i, row in enumerate(crlist):
        if _postcor_is_unusable(row):
            flagged.append(row)
            continue
        for j in range(i + 1, len(crlist)):
            partner = crlist[j]
            # Each column is only ever rewritten from the same column of the
            # partner row, so the two columns can be handled independently.
            for col in (0, 1):
                _postcor_unify(row, partner, col)

    # Filter by identity so that exactly the flagged row objects are dropped,
    # even when another row happens to compare equal to one of them.
    flagged_ids = {id(row) for row in flagged}
    crlist[:] = [row for row in crlist if id(row) not in flagged_ids]
    return crlist


def _postcor_is_unusable(row):
    """Return True when *row* meets one of the removal criteria."""
    if len(row[0]) <= 1 or len(row[1]) <= 1 or row[2] == "other":
        return True
    return any(re.search(r'[a-zA-Z]', row[col]) is None for col in (0, 1))


def _postcor_squash(value):
    """Return *value* lower-cased with ``-``, ``'``, space and ``_`` removed."""
    return re.sub(r"(-|'| |_)", "", value).lower()


def _postcor_unify(row, partner, col):
    """Unify near-duplicate spellings of column *col* between two rows."""
    left, right = row[col], partner[col]
    if _postcor_squash(left) == _postcor_squash(right):
        # Same text up to case/separators: keep the shorter spelling.
        if len(left) < len(right):
            partner[col] = left
        elif len(left) > len(right):
            row[col] = right
        # The values are now identical or of equal length, so the plural
        # rule below could not apply.
        return
    # Plural rule: one value is exactly the other plus a trailing "s".
    if left == right + "s":
        row[col] = right
    elif right == left + "s":
        partner[col] = left
46
+
47
+
48
def precor(text):
    """Return *text* with citation noise stripped and punctuation padded.

    Steps, applied in order:

    1. Expand three hard-coded coordinated phrases (e.g. "breast and
       prostate cancer" becomes "breast cancer and prostate cancer").
    2. Delete bracketed numeric notes such as ``[12]``, parenthesised
       figure/table references, URLs, e-mail addresses, empty ``()`` and
       bracketed or parenthesised numbers and number ranges.
    3. Surround each of ``; . , ? ( [ ) ]`` with one space on each side.

    Only the scheme and host part of a URL is matched; any ``/path`` that
    follows is left in the text.

    Args:
        text: the input string.

    Returns:
        The cleaned string.
    """
    cleaned = text

    expansions = (
        ("breast and prostate cancer", "breast cancer and prostate cancer"),
        ("prostate and breast cancer", "prostate cancer and breast cancer"),
        ("breast, prostate and ovarian cancer",
         "breast cancer, prostate cancer and ovarian cancer"),
    )
    for phrase, expanded in expansions:
        cleaned = cleaned.replace(phrase, expanded)

    removals = (
        r"\[\d*\]",  # notes
        r'\(\s?(figure|Figure|table|Table|fig\.|Fig\.|tab\.|Tab\.)(\s?\w)*\s?\)',  # (figure)|(table)
        # The scheme-prefixed pattern must run before the bare "www." one;
        # otherwise "http://www.ex.com" is cut down to a dangling "http://".
        r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',  # http urls
        r'www\.(?:[-\w.]|(?:%[\da-fA-F]{2}))+',  # www.ex.com
        r'[a-zA-Z0-9]+[_\-.]*[a-zA-Z0-9]*@[a-zA-Z0-9]+\.\w+',  # emails
        r'\(\)',
        r'\[\s?[0-9]+(\–|,|\-)\s?[0-9]+\s?\]',
        r'\(\s?[0-9]+(\–|,|\-)\s?[0-9]+\s?\)',
        r'\[\s?[0-9]+(\–|,|\-)?\s?[0-9]*\s?\]',
        r'\(\s?[0-9]+(\–|,|\-)?\s?[0-9]*\s?\)',
    )
    for pattern in removals:
        cleaned = re.sub(pattern, "", cleaned)

    # One pass instead of one replace() per character; the padding is only
    # spaces, so the result matches the sequential replacements.
    padding = str.maketrans({mark: " " + mark + " " for mark in ";.,?([)]"})
    return cleaned.translate(padding)