from parsinorm import General_normalization
import re
def get_ne_from_iob_output(sentences, tags_conf):
    """Group IOB-tagged tokens into entities and return them sorted by confidence (descending)."""
    sentences = sentences[0]
    tags = tags_conf[0][0]
    confs = tags_conf[1][0]
    seen_b = False
    keywords = {}
    new_token = []
    begin_index = 0
    for index, (tok, tag) in enumerate(zip(sentences, tags)):
        if tag[0] == 'I' and seen_b:
            new_token.append(tok)
        if tag[0] == 'B':
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            new_token.append(tok)
            begin_index = index
            seen_b = True
        if tag[0] == 'O':
            if new_token:
                keywords[' '.join(new_token)] = confs[begin_index]
                new_token = []
            seen_b = False
    # flush an entity that runs to the end of the sentence
    if new_token:
        keywords[' '.join(new_token)] = confs[begin_index]
    # print('keywords before sort: ', [k for k in keywords.keys()])
    # sort by the confidence of each entity's B token, highest first
    sorted_keywords = sorted(keywords.keys(), key=lambda kw: keywords[kw], reverse=True)
    print('keywords after sort: ', sorted_keywords)
    return sorted_keywords
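# Illustrative usage sketch: the tokens, tags, and confidences below are made up for
# demonstration and are not output from this Space's model. It shows how B-/I- tagged
# tokens are merged into multi-word entities and ranked by the confidence of the B token.
if __name__ == '__main__':
    demo_sentences = [['Barack', 'Obama', 'visited', 'Paris']]
    demo_tags_conf = (
        [['B-PER', 'I-PER', 'O', 'B-LOC']],  # IOB tags for the single sentence
        [[0.99, 0.98, 0.95, 0.90]],          # per-token confidences
    )
    entities = get_ne_from_iob_output(demo_sentences, demo_tags_conf)
    # entities == ['Barack Obama', 'Paris']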
def fuzzy_subword_match(key, words):
    """Return the index of the first phrase in `words` that is longer than `key` and contains it, else -1."""
    for index, w in enumerate(words):
        if (len(key.split()) < len(w.split())) and key in w:
            return index
    return -1
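# Illustrative usage sketch with hypothetical keywords: a short key matches the first
# longer phrase that contains it; when no such phrase exists, -1 is returned.
if __name__ == '__main__':
    print(fuzzy_subword_match('deep', ['deep learning', 'graph']))    # 0
    print(fuzzy_subword_match('graph', ['deep learning', 'vision']))  # -1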
# normalize
def normalize(txt):
    general_normalization = General_normalization()
    txt = general_normalization.alphabet_correction(txt)
    txt = general_normalization.semi_space_correction(txt)
    txt = general_normalization.english_correction(txt)
    txt = general_normalization.html_correction(txt)
    txt = general_normalization.arabic_correction(txt)
    txt = general_normalization.punctuation_correction(txt)
    txt = general_normalization.specials_chars(txt)
    txt = general_normalization.remove_emojis(txt)
    txt = general_normalization.number_correction(txt)
    txt = general_normalization.remove_not_desired_chars(txt)
    txt = general_normalization.remove_repeated_punctuation(txt)
    # collapse newlines, tabs, and repeated spaces into single spaces
    return ' '.join(txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').split())
def remove_puncs(txt):
    # strip sentence punctuation (Latin and Persian) and parentheses
    return re.sub(r'[!?،\(\)\.]', '', txt)
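# Illustrative usage sketch with an assumed input string; the exact normalized form depends
# on parsinorm's General_normalization rules, so no specific output is asserted here. The
# text is normalized first, then stripped of the punctuation handled by remove_puncs.
if __name__ == '__main__':
    raw = 'این   یک    متن\tتستی است!!! (با پرانتز)'
    cleaned = normalize(raw)
    print(cleaned)
    print(remove_puncs(cleaned))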