Spaces:

SinaLab
/

relation-extraction-api

Runtime error

App Files Files Community

TymaaHammouda commited on 5 days ago

Commit

6d2b5d1

verified ·

1 Parent(s): 2551344

Upload 3 files

Browse files

Files changed (3) hide show

IBO_to_XML.py +135 -0
NER_Distiller.py +138 -0
XML_to_HTML.py +32 -0

IBO_to_XML.py ADDED Viewed

	@@ -0,0 +1,135 @@

+# By Wasim Khatib
+# Version 2.0
+# This function takes a list of annotated tokens in this format: [["صرح","O"],
+# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
+# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
+# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
+# and returns an XML string in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
+# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
+# This function assumes the input is well formed: every non-O tag must start with B- or I- and must not be empty.
+# I- tags that have no matching B- tag are ignored.
+import numpy as np
def IBO_to_XML(temp):
    """Convert IOB-tagged tokens into a string with inline XML entity tags.

    Args:
        temp: list of ``[word, tags]`` pairs, where ``tags`` is a
            space-separated string of ``O`` / ``B-LABEL`` / ``I-LABEL`` tags,
            one tag per nesting level. The list is first normalised with
            ``sortTags``.

    Returns:
        A single string in which every word is separated by one space and each
        entity is wrapped in ``<LABEL> ... </LABEL>``. Leading and trailing
        whitespace is stripped.

    Closing tags are always emitted innermost-first so that nested entities
    produce properly nested elements. When a ``B-`` tag starts a new entity at
    some level, every entity still open at a deeper level is closed first,
    because it cannot outlive the entity that contained it.
    Tags that are neither ``O`` nor of the form ``B-X`` / ``I-X`` are ignored.
    """
    temp_entities = sortTags(temp)
    pieces = []
    # open_labels[level] holds the label currently open at that nesting level,
    # or "" when nothing is open there.
    open_labels = [""]

    def close_levels(levels):
        # Close the given levels deepest-first to keep the XML well nested.
        for level in sorted(levels, reverse=True):
            if open_labels[level] != "":
                pieces.append("</" + str(open_labels[level]) + ">")
                open_labels[level] = ""

    for word_position, entity in enumerate(temp_entities):
        for level, tag in enumerate(str(entity[1]).split()):
            # Grow the level table on demand.
            if level >= len(open_labels):
                open_labels.append("")
            if tag == "O":
                # Outside any entity: everything still open ends here.
                if word_position != 0:
                    close_levels(range(len(open_labels)))
                continue
            parts = tag.split("-")
            if len(parts) != 2:
                continue
            prefix, label = parts
            if prefix == "B":
                # A new entity starts at this level: close whatever is open
                # here and deeper, then open the new element.
                close_levels(range(level, len(open_labels)))
                open_labels[level] = label
                pieces.append("<" + label + ">")
            elif prefix == "I" and word_position != 0:
                # Look for the entity this tag continues, starting at its own
                # level; anything open before the match has ended.
                ended = []
                for candidate in range(level, len(open_labels)):
                    if open_labels[candidate] == label:
                        break
                    ended.append(candidate)
                close_levels(ended)
        pieces.append(str(entity[0]))

    # Close whatever is still open at the end of the sentence.
    close_levels(range(len(open_labels)))
    return " ".join(pieces).strip()
def sortTags(entities):
    """Normalise the per-word tag strings so nesting levels line up.

    Args:
        entities: list of ``[word, tags]`` items, where ``tags`` is a
            space-separated string of ``O`` / ``B-LABEL`` / ``I-LABEL`` tags.

    Returns:
        A new list of ``[word, tags]`` lists (the input is not modified).
        For every word after the first:

        * for each label that has an ``I-`` tag on this word, ``I-LABEL`` tags
          are added until the word carries as many tags with that label as the
          previous (already normalised) word does;
        * tags are sorted in descending order, which puts ``I-`` before ``B-``;
        * unless either word contains ``O``, the ``I-`` tags are reordered to
          follow the order of the matching labels on the previous word, and
          the remaining tags are appended after them.

    Labels are compared exactly (``LOC`` does not match ``LOCATION``).
    """
    sorted_entities = []
    for entity in entities:
        tags = entity[1].split()
        previous_tags = None
        if sorted_entities:
            previous_tags = sorted_entities[-1][1].split()
            # Distinct labels that are continued (I-) on this word.
            continued_labels = []
            for tag in tags:
                if tag[0:2] == "I-" and tag[2:] not in continued_labels:
                    continued_labels.append(tag[2:])
            # Pad with I- tags so each continued label appears at least as
            # often as on the previous word (one tag per nesting level).
            for label in continued_labels:
                wanted = sum(
                    1 for t in previous_tags
                    if t[0:2] in ("B-", "I-") and t[2:] == label
                )
                present = sum(
                    1 for t in tags
                    if t[0:2] in ("B-", "I-") and t[2:] == label
                )
                tags.extend(["I-" + label] * (wanted - present))

        # Descending order places I- tags ahead of B- tags.
        tags.sort(reverse=True)

        if previous_tags is not None:
            aligned = []
            if "O" not in tags and "O" not in previous_tags:
                # Pull out, in the previous word's level order, the I- tag
                # that continues each of its labels.
                for previous in previous_tags:
                    for index, tag in enumerate(tags):
                        if tag[0:2] == "B-":
                            # Only B- tags remain from here on.
                            break
                        if tag[0:2] == "I-" and tag[2:] == previous[2:]:
                            aligned.append(tags.pop(index))
                            break
            tags = aligned + tags

        # Copy the item so the caller's data is left untouched.
        updated = list(entity)
        updated[1] = " ".join(tags).strip()
        sorted_entities.append(updated)
    return sorted_entities

NER_Distiller.py ADDED Viewed

	@@ -0,0 +1,138 @@

+# By Wasim Khatib
+# Version 2.0
+# This function takes a list of annotated tokens in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
+# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
+# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
+# after that it will return array of the set of distilled entities and their positions (start, end) and tags, such as
+# [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
+# [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
def distill_entities(entities):
    """Collect the entities described by IOB-tagged tokens.

    Args:
        entities: list of ``[word, tags]`` items, where ``tags`` is a
            space-separated string of ``O`` / ``B-LABEL`` / ``I-LABEL`` tags.
            The list is first normalised with ``sortTags``.

    Returns:
        A list of ``[text, label, start, end]`` lists sorted by ``start``.
        ``text`` is the entity's words joined by single spaces; ``start`` is
        the index of the word carrying the ``B-`` tag and ``end`` is advanced
        by one for every word that continues the entity.
    """
    ordered = sortTags(entities)
    finished = []
    # One slot per nesting level: [text so far, label, start, end].
    slots = [["", "", 0, 0]]

    def emit(slot):
        # Record a completed entity.
        finished.append([slot[0].strip(), slot[1], slot[2], slot[3]])

    def emit_and_clear(slot, position):
        # Record a completed entity and free its slot.
        emit(slot)
        slot[0] = ""
        slot[1] = ""
        slot[2] = position
        slot[3] = position

    for position, entity in enumerate(ordered):
        tags = str(entity[1]).split()
        for depth, tag in enumerate(tags):
            # Grow the slot table on demand.
            if depth >= len(slots):
                slots.append(["", "", 0, 0])

            if tag == "O":
                # Outside any entity: flush every open slot (nothing can be
                # open on the very first word).
                if position != 0:
                    for slot in slots:
                        if slot[1] != "":
                            emit_and_clear(slot, position)
                continue

            pieces = tag.split("-")
            if len(pieces) != 2:
                # Not of the form PREFIX-LABEL: ignored.
                continue
            prefix, label = pieces

            if prefix == "B":
                # A new entity starts at this level; flush the one it replaces.
                current = slots[depth]
                if current[1] != "":
                    emit(current)
                current[0] = str(entity[0]) + " "
                current[1] = label
                current[2] = position
                current[3] = position
            elif prefix == "I" and position != 0:
                # Extend the first slot at or below this level that holds the
                # same label and was not already extended for this word;
                # every other open slot met on the way is flushed.
                for slot in slots[depth:]:
                    if slot[1] == label and slot[3] != position:
                        slot[0] += str(entity[0]) + " "
                        slot[3] += 1
                        break
                    if slot[1] != "":
                        emit_and_clear(slot, position)

    # Entities still open at the end of the input.
    for slot in slots:
        if slot[1] != "":
            emit(slot)
    return sorted(finished, key=lambda item: item[2])
def sortTags(entities):
    """Normalise the per-word tag strings so nesting levels line up.

    Args:
        entities: list of ``[word, tags]`` items, where ``tags`` is a
            space-separated string of ``O`` / ``B-LABEL`` / ``I-LABEL`` tags.

    Returns:
        A new list of ``[word, tags]`` lists (the input is not modified).
        For every word after the first:

        * for each label that has an ``I-`` tag on this word, ``I-LABEL`` tags
          are added until the word carries as many tags with that label as the
          previous (already normalised) word does;
        * tags are sorted in descending order, which puts ``I-`` before ``B-``;
        * unless either word contains ``O``, the ``I-`` tags are reordered to
          follow the order of the matching labels on the previous word, and
          the remaining tags are appended after them.

    Labels are compared exactly (``LOC`` does not match ``LOCATION``).
    """
    sorted_entities = []
    for entity in entities:
        tags = entity[1].split()
        previous_tags = None
        if sorted_entities:
            previous_tags = sorted_entities[-1][1].split()
            # Distinct labels that are continued (I-) on this word.
            continued_labels = []
            for tag in tags:
                if tag[0:2] == "I-" and tag[2:] not in continued_labels:
                    continued_labels.append(tag[2:])
            # Pad with I- tags so each continued label appears at least as
            # often as on the previous word (one tag per nesting level).
            for label in continued_labels:
                wanted = sum(
                    1 for t in previous_tags
                    if t[0:2] in ("B-", "I-") and t[2:] == label
                )
                present = sum(
                    1 for t in tags
                    if t[0:2] in ("B-", "I-") and t[2:] == label
                )
                tags.extend(["I-" + label] * (wanted - present))

        # Descending order places I- tags ahead of B- tags.
        tags.sort(reverse=True)

        if previous_tags is not None:
            aligned = []
            if "O" not in tags and "O" not in previous_tags:
                # Pull out, in the previous word's level order, the I- tag
                # that continues each of its labels.
                for previous in previous_tags:
                    for index, tag in enumerate(tags):
                        if tag[0:2] == "B-":
                            # Only B- tags remain from here on.
                            break
                        if tag[0:2] == "I-" and tag[2:] == previous[2:]:
                            aligned.append(tags.pop(index))
                            break
            tags = aligned + tags

        # Copy the item so the caller's data is left untouched.
        updated = list(entity)
        updated[1] = " ".join(tags).strip()
        sorted_entities.append(updated)
    return sorted_entities

XML_to_HTML.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import re
def NER_XML_to_HTML(xml):
    """Rewrite NER XML entity tags as HTML ``<span>`` elements.

    Args:
        xml: string containing ``<LABEL> ... </LABEL>`` entity markup, where
            ``LABEL`` consists of upper-case letters (``WORK_OF_ART`` is also
            accepted and is normalised to ``WORKOFART`` first).

    Returns:
        The string with every closing tag replaced by ``</span>`` and every
        opening tag replaced by ``<span class="..." data-entity="...">``.
        Labels listed in the table below keep their dedicated class and entity
        name. Any other upper-case label gets ``ner_<label lower-cased>`` and
        its own name, so that its ``</span>`` is never left without a matching
        opening ``<span>``.
    """
    # label -> (CSS class, data-entity value)
    span_attributes = {
        'PERS': ('ner_pers', 'PERS'),
        'GROUP': ('ner_group', 'NORP'),
        'OCC': ('ner_occ', 'OCC'),
        'ORG': ('ner_org', 'ORG'),
        'LOC': ('ner_loc', 'LOC'),
        'GPE': ('ner_gpe', 'GPE'),
        'FAC': ('ner_fac', 'FAC'),
        'EVENT': ('ner_event', 'EVENT'),
        'DATE': ('ner_date', 'DATE'),
        'TIME': ('ner_time', 'TIME'),
        'CARDINAL': ('ner_cardinal', 'CARDINAL'),
        'ORDINAL': ('ner_ordinal', 'ORDINAL'),
        'PERCENT': ('ner_percent', 'PERCENT'),
        'QUANTITY': ('ner_quantity', 'QUANTITY'),
        'UNIT': ('ner_unit', 'UNIT'),
        'MONEY': ('ner_money', 'MONEY'),
        'CURR': ('ner_currency', 'CURRENCY'),
        'LANGUAGE': ('ner_language', 'LANGUAGE'),
        'PRODUCT': ('ner_product', 'PRODUCT'),
        'WORKOFART': ('ner_work_of_art', 'WORK_OF_ART'),
        'LAW': ('ner_law', 'LAW'),
    }

    # The tag patterns below only accept letters, so drop the underscores.
    html = re.sub(r'WORK_OF_ART', 'WORKOFART', xml)
    # replace every end tag with end span tag "</span>"
    html = re.sub(r'</[A-Z]+>', '</span>', html)

    def open_span(match):
        # Build the opening span for one start tag.
        label = match.group(1)
        # NOTE(review): the fallback class name for unlisted labels is an
        # assumption; confirm it against the stylesheet.
        css_class, entity_name = span_attributes.get(
            label, ('ner_' + label.lower(), label)
        )
        return '<span class="' + css_class + '" data-entity="' + entity_name + '">'

    # replace every start tag with the appropriate css class
    return re.sub(r'<([A-Z]+)>', open_span, html)