# Spaces:
# Runtime error
# Runtime error
from parsinorm import General_normalization
import re
def get_ne_from_iob_output(sentences, tags_conf):
    """Extract named entities from IOB-tagged model output, sorted by confidence.

    Parameters
    ----------
    sentences : nested sequence; ``sentences[0]`` is the token list of the
        sentence being processed (only the first sentence is used).
    tags_conf : pair; ``tags_conf[0][0]`` is the IOB tag list and
        ``tags_conf[1][0]`` the per-token confidence list, both aligned with
        the tokens.

    Returns
    -------
    list[str]
        Entity strings (tokens joined with spaces), ordered by the confidence
        of each entity's B-token, highest first.

    NOTE(review): duplicate entity texts overwrite each other's confidence
    (dict key collision) — presumably acceptable; confirm with callers.
    """
    tokens = sentences[0]
    tags = tags_conf[0][0]
    confs = tags_conf[1][0]
    keywords = {}       # entity text -> confidence of its B-token
    current = []        # tokens of the entity currently being assembled
    begin_index = 0     # index of the current entity's B-token
    seen_b = False      # guards against an I-tag with no preceding B-tag
    for index, (tok, tag) in enumerate(zip(tokens, tags)):
        if tag[0] == 'B':
            # A new entity starts: flush the previous one, if any.
            if current:
                keywords[' '.join(current)] = confs[begin_index]
            current = [tok]
            begin_index = index
            seen_b = True
        elif tag[0] == 'I' and seen_b:
            current.append(tok)
        elif tag[0] == 'O':
            if current:
                keywords[' '.join(current)] = confs[begin_index]
                current = []
            seen_b = False
    # BUG FIX: flush an entity that runs to the end of the sentence; the
    # original only flushed on a B/O transition, dropping trailing entities.
    if current:
        keywords[' '.join(current)] = confs[begin_index]
    return sorted(keywords, key=keywords.get, reverse=True)
def fuzzy_subword_match(key, words):
    """Return the index of the first entry of *words* that contains *key* as a
    substring while having strictly more space-separated words; -1 if none."""
    key_word_count = len(key.split())
    hits = (
        position
        for position, candidate in enumerate(words)
        if key_word_count < len(candidate.split()) and key in candidate
    )
    return next(hits, -1)
# --- text normalization helpers ---
def normalize(txt):
    """Run *txt* through the full parsinorm cleanup pipeline and collapse all
    whitespace runs (including newlines/tabs) into single spaces."""
    normalizer = General_normalization()
    # Apply each correction pass in the same fixed order as before.
    pipeline = (
        normalizer.alphabet_correction,
        normalizer.semi_space_correction,
        normalizer.english_correction,
        normalizer.html_correction,
        normalizer.arabic_correction,
        normalizer.punctuation_correction,
        normalizer.specials_chars,
        normalizer.remove_emojis,
        normalizer.number_correction,
        normalizer.remove_not_desired_chars,
        normalizer.remove_repeated_punctuation,
    )
    for step in pipeline:
        txt = step(txt)
    # str.split() with no argument splits on any whitespace (\n, \t, \r, ...),
    # so joining with single spaces collapses all whitespace runs.
    return ' '.join(txt.split())
def remove_puncs(txt):
    """Strip the characters '!', '?', 'ุ', '(', ')' and '.' from *txt*.

    NOTE(review): 'ุ' is a Thai combining vowel sign — possibly mojibake of an
    intended Persian punctuation mark ('؟'); confirm the intended character.
    """
    # FIX: use a raw string — the original '\(' '\)' '\.' were invalid string
    # escapes (DeprecationWarning, a SyntaxError in future Python). Inside a
    # character class, '(', ')' and '.' need no escaping at all.
    return re.sub(r'[!?ุ().]', '', txt)