from parsinorm import General_normalization
import re


def get_ne_from_iob_output(sentences, tags_conf):
    """Extract named entities from IOB-tagged model output, sorted by confidence."""
    sentences = sentences[0]
    tags = tags_conf[0][0]
    confs = tags_conf[1][0]
    seen_b = False
    keywords = {}
    new_token = []
    begin_index = 0
    for index, (tok, tag) in enumerate(zip(sentences, tags)):
        if tag[0] == 'I' and seen_b:
            # continuation of the current entity
            new_token.append(tok)
        if tag[0] == 'B':
            # a new entity starts; flush the previous one first
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            new_token.append(tok)
            begin_index = index
            seen_b = True
        if tag[0] == 'O':
            # outside any entity; flush the one in progress, if any
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            seen_b = False
    # flush a trailing entity that runs to the end of the sentence
    if new_token:
        keywords[' '.join(new_token)] = confs[begin_index]
    # print('keywords before sort: ', list(keywords.keys()))
    # sort entities by the confidence of their first token, descending
    sorted_keywords = sorted(keywords, key=lambda kw: keywords[kw], reverse=True)
    print('keywords after sort: ', sorted_keywords)
    return sorted_keywords


def fuzzy_subword_match(key, words):
    """Return the index of the first entry in `words` that has more words than
    `key` and contains `key` as a substring; -1 if there is no such entry."""
    for index, w in enumerate(words):
        if len(key.split()) < len(w.split()) and key in w:
            return index
    return -1


def normalize(txt):
    """Normalize Persian text with parsinorm, then collapse all whitespace."""
    general_normalization = General_normalization()
    txt = general_normalization.alphabet_correction(txt)
    txt = general_normalization.semi_space_correction(txt)
    txt = general_normalization.english_correction(txt)
    txt = general_normalization.html_correction(txt)
    txt = general_normalization.arabic_correction(txt)
    txt = general_normalization.punctuation_correction(txt)
    txt = general_normalization.specials_chars(txt)
    txt = general_normalization.remove_emojis(txt)
    txt = general_normalization.number_correction(txt)
    txt = general_normalization.remove_not_desired_chars(txt)
    txt = general_normalization.remove_repeated_punctuation(txt)
    return ' '.join(txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').split())


def remove_puncs(txt):
    """Strip common Persian/Latin punctuation marks."""
    return re.sub(r'[!?،().]', '', txt)
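

# --- Minimal usage sketch (illustrative only) ---
# Assumes model output shaped the way the functions above expect:
# `sentences` is a batch of token lists, and `tags_conf` is a pair of
# (batch of IOB tag lists, batch of per-token confidence lists).
# The sample tokens, tags, and confidence values below are hypothetical.
if __name__ == '__main__':
    sample_sentences = [['تهران', 'پایتخت', 'ایران', 'است']]
    sample_tags_conf = (
        [['B-LOC', 'O', 'B-LOC', 'O']],  # IOB tags per token
        [[0.95, 0.10, 0.88, 0.05]],      # confidence per token
    )
    entities = get_ne_from_iob_output(sample_sentences, sample_tags_conf)
    # -> ['تهران', 'ایران'], sorted by confidence, highest first

    # fuzzy_subword_match: index of the first longer phrase containing the key
    idx = fuzzy_subword_match('تهران', ['دانشگاه تهران', 'ایران'])
    # -> 0, since 'تهران' occurs inside the two-word 'دانشگاه تهران'

    # normalize + remove_puncs on a raw string (requires parsinorm installed)
    print(remove_puncs(normalize('سلام! این یک متن (تست) است.')))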