from parsinorm import General_normalization
import re
def get_ne_from_iob_output(sentences, tags_conf):
    """Extract named-entity spans from IOB-tagged tokens and return them sorted by confidence."""
    sentences = sentences[0]
    tags = tags_conf[0][0]
    confs = tags_conf[1][0]
    seen_b = False
    keywords = {}
    new_token = []
    begin_index = 0
    for index, (tok, tag) in enumerate(zip(sentences, tags)):
        # Inside an entity: extend the current span.
        if tag[0] == 'I' and seen_b:
            new_token.append(tok)
        # Start of a new entity: flush the previous span, then open a new one.
        if tag[0] == 'B':
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            new_token.append(tok)
            begin_index = index
            seen_b = True
        # Outside any entity: flush the span collected so far.
        if tag[0] == 'O':
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            seen_b = False
    # Flush an entity that runs up to the end of the sentence.
    if new_token:
        keywords[' '.join(new_token)] = confs[begin_index]
    # print('keywords before sort: ', list(keywords.keys()))
    # Sort entities by the confidence of their first token, highest first.
    sorted_keywords = sorted(keywords.keys(), key=lambda kw: keywords[kw], reverse=True)
    print('keywords after sort: ', sorted_keywords)
    return sorted_keywords
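
# A minimal usage sketch (the batched shapes below are assumptions about the tagger's
# output, not taken from this repository): `sentences` is a batch of token lists and
# `tags_conf` pairs a batch of IOB tag lists with per-token confidence lists.
#
#   tokens = [['تهران', 'پایتخت', 'ایران', 'است']]
#   tags_conf = ([['B-LOC', 'O', 'B-LOC', 'O']], [[0.98, 0.1, 0.95, 0.1]])
#   get_ne_from_iob_output(tokens, tags_conf)  # -> ['تهران', 'ایران']
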
def fuzzy_subword_match(key, words):
    """Return the index of the first phrase in `words` that strictly contains `key`, or -1 if none does."""
    for index, w in enumerate(words):
        # A match requires `key` to be shorter (in words) than `w` and to appear inside it.
        if (len(key.split()) < len(w.split())) and key in w:
            return index
    return -1
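
# Hypothetical examples (the inputs are illustrative only):
#
#   fuzzy_subword_match('دانشگاه تهران', ['شهر مشهد', 'دانشگاه تهران مرکز'])  # -> 1
#   fuzzy_subword_match('مشهد', ['مشهد'])  # -> -1, equal length is not a sub-phrase match
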
# Normalize raw text with parsinorm's General_normalization chain.
def normalize(txt):
    general_normalization = General_normalization()
    txt = general_normalization.alphabet_correction(txt)
    txt = general_normalization.semi_space_correction(txt)
    txt = general_normalization.english_correction(txt)
    txt = general_normalization.html_correction(txt)
    txt = general_normalization.arabic_correction(txt)
    txt = general_normalization.punctuation_correction(txt)
    txt = general_normalization.specials_chars(txt)
    txt = general_normalization.remove_emojis(txt)
    txt = general_normalization.number_correction(txt)
    txt = general_normalization.remove_not_desired_chars(txt)
    txt = general_normalization.remove_repeated_punctuation(txt)
    # Collapse newlines, tabs, carriage returns and repeated spaces into single spaces.
    return ' '.join(txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').split())
def remove_puncs(txt):
    return re.sub(r'[!?،().]', '', txt)
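
# A hedged end-to-end sketch (the sample strings are illustrative, not from the project):
# clean raw text before it is fed to the tagger, then strip punctuation from an extracted phrase.
#
#   clean = normalize('این   متن\nتستی است!!')
#   remove_puncs('سلام، دنیا!')  # -> 'سلام دنیا'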