File size: 2,127 Bytes
4da642e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from parsinorm import General_normalization
import re


def get_ne_from_iob_output(sentences, tags_conf):
    """Extract named entities from IOB tagger output, sorted by confidence.

    Only the first sentence of the batch is processed.

    Args:
        sentences: batch of tokenized sentences; ``sentences[0]`` is the
            token list that is actually used.
        tags_conf: pair ``(tags_batch, confs_batch)`` where
            ``tags_conf[0][0]`` is the IOB tag per token (strings starting
            with 'B', 'I' or 'O') and ``tags_conf[1][0]`` is the confidence
            score per token, aligned with the tokens.

    Returns:
        List of entity strings (space-joined tokens of each B-I* span),
        sorted by descending confidence of the span's 'B' token.
        NOTE: duplicate entity texts collapse to one dict key, keeping the
        confidence of the last occurrence.
    """
    tokens = sentences[0]
    tags = tags_conf[0][0]
    confs = tags_conf[1][0]

    keywords = {}
    current = []        # tokens of the entity span being built
    begin_index = 0     # index of the span's 'B' token (its confidence is used)
    seen_b = False      # an 'I' tag only extends a span opened by a 'B'

    for index, (tok, tag) in enumerate(zip(tokens, tags)):
        if tag[0] == 'B':
            # Flush the previous span before starting a new one.
            if current:
                keywords[' '.join(current)] = confs[begin_index]
            current = [tok]
            begin_index = index
            seen_b = True
        elif tag[0] == 'I' and seen_b:
            current.append(tok)
        elif tag[0] == 'O':
            if current:
                keywords[' '.join(current)] = confs[begin_index]
                current = []
                seen_b = False

    # Bug fix: flush a span that runs to the end of the sentence —
    # previously an entity ending on the last token was dropped.
    if current:
        keywords[' '.join(current)] = confs[begin_index]

    return sorted(keywords, key=keywords.get, reverse=True)


def fuzzy_subword_match(key, words):
    """Find the first entry of *words* that strictly contains *key*.

    A match requires that *key* has fewer whitespace-split tokens than the
    candidate AND appears in it as a substring.

    Args:
        key: phrase to look for.
        words: iterable of candidate phrases.

    Returns:
        Index of the first matching candidate, or -1 if none matches.
    """
    key_token_count = len(key.split())
    for position, candidate in enumerate(words):
        is_shorter = key_token_count < len(candidate.split())
        if is_shorter and key in candidate:
            return position
    return -1


#normalize
def normalize(txt):
  general_normalization = General_normalization()
  txt = general_normalization.alphabet_correction(txt)
  txt = general_normalization.semi_space_correction(txt)
  txt = general_normalization.english_correction(txt)
  txt = general_normalization.html_correction(txt)
  txt = general_normalization.arabic_correction(txt)
  txt = general_normalization.punctuation_correction(txt)
  txt = general_normalization.specials_chars(txt)
  txt = general_normalization.remove_emojis(txt)
  txt = general_normalization.number_correction(txt)
  txt = general_normalization.remove_not_desired_chars(txt)
  txt = general_normalization.remove_repeated_punctuation(txt)
  return ' '.join(txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').split())



def remove_puncs(txt):
    """Remove sentence punctuation from *txt*.

    Strips '!', '?', the Persian comma '،', parentheses, and the period.
    Uses a raw string for the pattern — the original non-raw '\\(' / '\\.'
    escapes are invalid escape sequences (SyntaxWarning on Python 3.12+),
    and escaping is unnecessary inside a character class anyway.

    Args:
        txt: input string.

    Returns:
        *txt* with the listed punctuation characters removed.
    """
    return re.sub(r'[!?،().]', '', txt)