TymaaHammouda commited on
Commit
6d2b5d1
·
verified ·
1 Parent(s): 2551344

Upload 3 files

Browse files
Files changed (3) hide show
  1. IBO_to_XML.py +135 -0
  2. NER_Distiller.py +138 -0
  3. XML_to_HTML.py +32 -0
IBO_to_XML.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # By Wasim Khatib
2
+ # Version 2.0
3
+ # This function takes a list of annotated entities in this format: [["صرح","O"],
4
+ # ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
5
+ # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
6
+ # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
7
+ # It then returns the XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
8
+ # جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
9
+ # This function assumes the input is correct: every tag must start with B- or I- and must not be empty.
10
+ # I- tags that have no preceding B- tag are ignored.
11
+ import numpy as np
12
+
13
+
14
def IBO_to_XML(temp):
    """Convert IOB-annotated words into an inline XML string.

    Args:
        temp: sequence of ``[word, tags]`` pairs where ``tags`` is a
            space-separated string of ``O``, ``B-<TYPE>`` or ``I-<TYPE>``
            tags, one per nesting level (outermost first).

    Returns:
        The words separated by single spaces, with ``<TYPE>`` / ``</TYPE>``
        markers inserted around every entity, stripped of surrounding
        whitespace.

    Example:
        ``[["a", "B-OCC"], ["b", "I-OCC B-ORG"], ["c", "O"]]`` yields
        ``"<OCC> a <ORG> b </ORG> </OCC> c"``.

    Tags that are not ``O`` and do not split into exactly two parts on
    ``-`` are ignored, as is an ``I-`` tag on the very first word.
    """
    entities = sortTags(temp)

    # open_tags[level] holds the entity type currently open at that nesting
    # level, or "" when nothing is open there.
    open_tags = [""]
    # Output fragments; joined with single spaces at the end instead of
    # growing one string with repeated concatenation.
    pieces = []

    for word_position, entity in enumerate(entities):
        for level, tag in enumerate(str(entity[1]).split()):
            # Grow the stack when this word is nested deeper than any before.
            if level >= len(open_tags):
                open_tags.append("")

            parts = tag.split("-")
            well_formed = len(parts) == 2

            if tag == "O":
                # An "O" word ends every open entity; close innermost first.
                if word_position != 0:
                    for j in range(len(open_tags) - 1, -1, -1):
                        if open_tags[j] != "":
                            pieces.append("</" + str(open_tags[j]) + ">")
                            open_tags[j] = ""

            elif well_formed and parts[0] == "B":
                # A new entity starts at this level: close whatever was open
                # here, then open the new type.
                if open_tags[level] != "":
                    pieces.append("</" + str(open_tags[level]) + ">")
                open_tags[level] = parts[1]
                pieces.append("<" + str(open_tags[level]) + ">")

            elif well_formed and parts[0] == "I" and word_position != 0:
                # Continuation: scan from this level for the matching open
                # entity, closing every non-matching one met on the way.
                for j in range(level, len(open_tags)):
                    if open_tags[j] == parts[1]:
                        break
                    if open_tags[j] != "":
                        pieces.append("</" + str(open_tags[j]) + ">")
                        open_tags[j] = ""

        pieces.append(str(entity[0]))

    # Close whatever is still open at the end of the input. Walk the stack
    # from the innermost level outwards so the closing tags nest correctly
    # (the same order the "O" branch uses).
    for j in range(len(open_tags) - 1, -1, -1):
        if open_tags[j] != "":
            pieces.append("</" + str(open_tags[j]) + ">")

    return " ".join(pieces).strip()
76
+
77
+
78
def sortTags(entities):
    """Normalise the tag string of every entity so nesting levels line up.

    For each ``[word, tags]`` pair (``tags`` is a space-separated string),
    the tags are reordered so that ``I-`` tags continuing an entity of the
    previous word come first, in the order of the previous word's tags,
    followed by the remaining tags. If the previous word carries more tags
    of a type than the current word, extra ``I-<TYPE>`` tags are added.

    Args:
        entities: sequence of ``[word, tags]`` pairs.

    Returns:
        A new list of ``[word, tags]`` lists with normalised tag strings.
        The input is not modified.
    """
    # Copy each pair so the caller's data is not rewritten in place.
    temp_entities = [list(entity) for entity in entities]

    for position, entity in enumerate(temp_entities):
        tags = entity[1].split()
        previous_tags = []

        if position != 0:
            # The previous entity has already been normalised at this point.
            previous_tags = temp_entities[position - 1][1].split()

            # Index loop on purpose: tags appended below are visited too, so
            # missing I- tags keep being added until the counts match.
            i = 0
            while i < len(tags):
                tag = tags[i]
                if tag[0:2] == "I-":
                    label = tag.split("-")[1]
                    # Substring match, as in the original counting rule.
                    count_here = sum(1 for t in tags if label in t)
                    count_before = sum(1 for t in previous_tags if label in t)
                    if count_before > count_here:
                        tags.append("I-" + label)
                i += 1

        # Reverse alphabetical order puts I- tags before B- tags.
        tags.sort(reverse=True)

        if position != 0 and "O" not in tags and "O" not in previous_tags:
            # Pull out the I- tags that continue an entity of the previous
            # word, following the order of the previous word's tags.
            ordered = []
            for previous in previous_tags:
                for j, candidate in enumerate(tags):
                    if candidate[0:2] == "I-" and candidate[2:] == previous[2:]:
                        ordered.append(tags.pop(j))
                        break
                    if candidate[0:2] == "B-":
                        # I- tags sort first; nothing left to match.
                        break
            tags = ordered + tags

        entity[1] = " ".join(tags).strip()

    return temp_entities
NER_Distiller.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # By Wasim Khatib
2
+ # Version 2.0
3
+ # This function takes a list of annotated entities in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
4
+ # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
5
+ # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
6
+ # It then returns a list of the distilled entities with their tags and positions (start, end), such as
7
+ # [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
8
+ # [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
9
+ def distill_entities(entities):
10
+ # This is list that we put the output what we need
11
+ list_output = list()
12
+
13
+ # This line go to sort function and save the output to temp_entities
14
+ temp_entities = sortTags(entities)
15
+
16
+ # This list help us to make the output,
17
+ temp_list = list()
18
+
19
+ # initlize the temp_list
20
+ temp_list.append(["", "", 0, 0])
21
+ word_position = 0
22
+
23
+ # For each entity, convert ibo to distllir list.
24
+ for entity in temp_entities:
25
+ # This is counter tag of this entity
26
+ counter_tag = 0
27
+ # For each tag
28
+ for tag in str(entity[1]).split():
29
+ # If the counter tag greater than or equal to lenght of templist, if yes then we will append the empty value in templist
30
+ if counter_tag >= len(temp_list):
31
+ temp_list.append(["", "", 0, 0])
32
+
33
+ # If tag equal O and word postion of this tag is not equal zero then it will add all
34
+ # not empty eliment of temp list in output list
35
+ if "O" == tag and word_position != 0:
36
+ for j in range(0, len(temp_list)):
37
+ if temp_list[j][1] != "":
38
+ list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
39
+ temp_list[j][0] = ""
40
+ temp_list[j][1] = ""
41
+ temp_list[j][2] = word_position
42
+ temp_list[j][3] = word_position
43
+ # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
44
+ # of the split its B
45
+ elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
46
+ # if the temp_list of counter is not empty then it will append in output list and hten it will
47
+ # initilize by new string and tag in templist of counter
48
+ if temp_list[counter_tag][1] != "":
49
+ list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
50
+ temp_list[counter_tag][0] = str(entity[0]) + " "
51
+ temp_list[counter_tag][1] = str(tag).split("-")[1]
52
+ temp_list[counter_tag][2] = word_position
53
+ temp_list[counter_tag][3] = word_position
54
+
55
+ # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
56
+ # of the split its O
57
+ elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
58
+ # For each of temp_list, check if in this counter tag of templist is same tag with this.tag
59
+ # then will complete if not it will save in output list and cheak another
60
+ for j in range(counter_tag,len(temp_list)):
61
+ if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
62
+ temp_list[j][0] += str(entity[0]) + " "
63
+ temp_list[j][3] += 1
64
+ break
65
+ else:
66
+ if temp_list[j][1] != "":
67
+ list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
68
+ temp_list[j][0] = ""
69
+ temp_list[j][1] = ""
70
+ temp_list[j][2] = word_position
71
+ temp_list[j][3] = word_position
72
+ counter_tag += 1
73
+ word_position += 1
74
+ # For each temp_list, at the end of the previous loop, there will be some
75
+ # values in this list, we should save it to the output list
76
+ for j in range(0, len(temp_list)):
77
+ if temp_list[j][1] != "":
78
+ list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
79
+ return sorted(list_output, key=lambda x: (x[2]))
80
+
81
+ def sortTags(entities):
82
+ temp_entities = entities
83
+ temp_counter = 0
84
+ # For each entity, this loop will sort each tag of entitiy, first it will check if the
85
+ # previous tags has same count of this tag, second will sort the tags and check if this tags is correct
86
+ for entity in temp_entities:
87
+ tags = entity[1].split()
88
+ for tag in tags:
89
+ # if the counter is not 0 then, will complete
90
+ if temp_counter != 0:
91
+ # Check if this tag is equal I-, if yes then it will count how many tag in this tags and
92
+ # count how many tag in previous tags
93
+ if "I-" == tag[0:2]:
94
+ counter_of_this_tag = 0
95
+ counter_of_previous_tag = 0
96
+ for word in tags:
97
+ if tag.split("-")[1] in word:
98
+ counter_of_this_tag+=1
99
+ for word in temp_entities[temp_counter-1][1].split():
100
+ if tag.split("-")[1] in word:
101
+ counter_of_previous_tag+=1
102
+ # if the counter of previous tag is bigger than counter of this tag, then we
103
+ # need to add I-tag in this tags
104
+ if counter_of_previous_tag > counter_of_this_tag:
105
+ tags.append("I-"+tag.split("-")[1])
106
+ # Sort the tags
107
+ tags.sort()
108
+ # Need to revers the tags because it should begins with I
109
+ tags.reverse()
110
+ # If the counter is not 0 then we can complete
111
+ if temp_counter != 0:
112
+ this_tags = tags
113
+ previous_tags = temp_entities[temp_counter - 1][1].split()
114
+ sorted_tags = list()
115
+
116
+ # Check if the this tag is not O and previous tags is not O, then will complete,
117
+ # if not then it will ignor this tag
118
+ if "O" not in this_tags and "O" not in previous_tags:
119
+ index = 0
120
+ #For each previous tags, need sort this tag by previous tags if its I, B we can ignor
121
+ for i in previous_tags:
122
+ j = 0
123
+ while this_tags and j < len(this_tags):
124
+ if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
125
+ sorted_tags.insert(index, this_tags.pop(j))
126
+ break
127
+ elif this_tags[j][0:2] == "B-":
128
+ break
129
+ j += 1
130
+ index += 1
131
+ sorted_tags += this_tags
132
+ tags = sorted_tags
133
+ str_tag = " "
134
+ str_tag = str_tag.join(tags)
135
+ str_tag = str_tag.strip()
136
+ temp_entities[temp_counter][1] = str_tag
137
+ temp_counter += 1
138
+ return temp_entities
XML_to_HTML.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
# Start tag name -> (CSS class, value of the data-entity attribute).
# WORK_OF_ART is looked up as WORKOFART because underscores are removed from
# that tag name before matching.
_NER_SPAN_ATTRS = {
    "PERS": ("ner_pers", "PERS"),
    "GROUP": ("ner_group", "NORP"),
    "OCC": ("ner_occ", "OCC"),
    "ORG": ("ner_org", "ORG"),
    "LOC": ("ner_loc", "LOC"),
    "GPE": ("ner_gpe", "GPE"),
    "FAC": ("ner_fac", "FAC"),
    "EVENT": ("ner_event", "EVENT"),
    "DATE": ("ner_date", "DATE"),
    "TIME": ("ner_time", "TIME"),
    "CARDINAL": ("ner_cardinal", "CARDINAL"),
    "ORDINAL": ("ner_ordinal", "ORDINAL"),
    "PERCENT": ("ner_percent", "PERCENT"),
    "QUANTITY": ("ner_quantity", "QUANTITY"),
    "UNIT": ("ner_unit", "UNIT"),
    "MONEY": ("ner_money", "MONEY"),
    "CURR": ("ner_currency", "CURRENCY"),
    "LANGUAGE": ("ner_language", "LANGUAGE"),
    "PRODUCT": ("ner_product", "PRODUCT"),
    "WORKOFART": ("ner_work_of_art", "WORK_OF_ART"),
    "LAW": ("ner_law", "LAW"),
}


def NER_XML_to_HTML(xml):
    """Convert NER XML markup into HTML ``<span>`` elements.

    Every ``</TAG>`` becomes ``</span>`` and every ``<TAG>`` becomes
    ``<span class="..." data-entity="...">``. Known tags use the class and
    entity names from ``_NER_SPAN_ATTRS``. Any other upper-case tag falls
    back to ``class="ner_<tag lower-cased>"`` and ``data-entity="<TAG>"``,
    so its start tag is no longer left raw next to a converted ``</span>``.

    Args:
        xml: string containing ``<TAG> ... </TAG>`` entity markup.

    Returns:
        The string with entity tags replaced by span elements. The text
        between tags is not HTML-escaped.
    """
    import re  # local import keeps this block self-contained

    # The tag patterns below only accept A-Z, so drop the underscores first.
    html = re.sub(r'WORK_OF_ART', 'WORKOFART', xml)

    # Replace every end tag with "</span>".
    html = re.sub(r'</[A-Z]+>', '</span>', html)

    def open_span(match):
        name = match.group(1)
        css_class, entity = _NER_SPAN_ATTRS.get(
            name, ("ner_" + name.lower(), name)
        )
        return '<span class="' + css_class + '" data-entity="' + entity + '">'

    # Replace every start tag with a span carrying the matching CSS class.
    return re.sub(r'<([A-Z]+)>', open_span, html)