File size: 2,558 Bytes
4da642e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import logging

import flair
from flair.data import Sentence
from flair.models import SequenceTagger
from hazm import word_tokenize
from NERDA.models import NERDA

import utils

_LOGGER = logging.getLogger(__name__)


class KPE:
    """Keyphrase extraction pipeline.

    Combines keywords predicted by a fine-tuned NERDA token classifier
    (B-KEYWORD / I-KEYWORD scheme) with named entities found by a flair
    ``SequenceTagger``, preferring the entity surface form when the two
    overlap.
    """

    def __init__(self, trained_kpe_model, flair_ner_model, device='cpu') -> None:
        """Load the keyword-extraction network and the flair NER tagger.

        Args:
            trained_kpe_model: Path to a saved NERDA network checkpoint.
            flair_ner_model: Model name or path accepted by
                ``SequenceTagger.load``.
            device: Torch device string (e.g. ``'cpu'``, ``'cuda:0'``);
                applied to both the NERDA model and flair.
        """
        self.extractor_model = NERDA(
            tag_scheme=['B-KEYWORD', 'I-KEYWORD'],
            tag_outside='O',
            transformer='xlm-roberta-large',
            max_len=512,
            device=device)
        flair.device = device

        self.extractor_model.load_network_from_file(trained_kpe_model)
        self.ner_tagger = SequenceTagger.load(flair_ner_model)
        # Entity types that rarely make useful keyphrases.
        self.IGNORE_TAGS = {'ORDINAL', 'DATE', 'CARDINAL'}

    @staticmethod
    def combine_keywords_nes(init_keywords, nes):
        """Merge model keywords with named entities.

        Each initial keyword that fuzzily matches one of ``nes`` is replaced
        by the matching entity text (usually the cleaner surface form);
        entities left unmatched are appended at the end. Order of
        ``init_keywords`` is preserved.

        Args:
            init_keywords: Keywords from the extractor model, in order.
            nes: Named-entity strings; duplicates are ignored.

        Returns:
            List of merged keyphrases.
        """
        remaining_nes = list(set(nes))  # de-duplicate; matching order is irrelevant
        _LOGGER.debug('nes before combining: %s', remaining_nes)
        combined_keywords = []
        for kw in init_keywords:
            matched_index = utils.fuzzy_subword_match(kw, remaining_nes)
            if matched_index != -1:
                # Prefer the entity surface form; consume it so it cannot
                # be matched (or appended) twice.
                combined_keywords.append(remaining_nes.pop(matched_index))
            else:
                combined_keywords.append(kw)
        _LOGGER.debug('nes after combining: %s', remaining_nes)
        combined_keywords.extend(
            n for n in remaining_nes if n not in combined_keywords)
        return combined_keywords

    def extract(self, txt, using_ner=True):
        """Extract keyphrases from ``txt``.

        Args:
            txt: Input text, treated as a single sentence and tokenized by
                whitespace for the extractor model.
            using_ner: When True, merge flair named entities (minus
                ``IGNORE_TAGS``) into the result.

        Returns:
            List of unique keyphrases, first-occurrence order preserved.
        """
        if using_ner:
            # Only pay for flair tagging when NER merging is requested.
            sentence = Sentence(txt)
            self.ner_tagger.predict(sentence)
            nes = [entity.text
                   for entity in sentence.get_spans('ner')
                   if entity.tag not in self.IGNORE_TAGS]
        else:
            nes = []

        # Strip punctuation so entities and keywords compare cleanly.
        nes = [utils.remove_puncs(n) for n in nes]
        _LOGGER.debug('nes: %s', nes)

        # Whole text as one "sentence"; tokens are whitespace-split.
        sentences, tags_conf = self.extractor_model.predict_text(
            txt,
            sent_tokenize=lambda t: [t],
            word_tokenize=lambda t: t.split(),
            return_confidence=True)
        init_keywords = utils.get_ne_from_iob_output(sentences, tags_conf)
        init_keywords = [utils.remove_puncs(k) for k in init_keywords]
        _LOGGER.debug('init keywords: %s', init_keywords)

        # Combine NER response and initial keywords.
        merged_keywords = self.combine_keywords_nes(init_keywords, nes)

        # De-duplicate while keeping first-occurrence order.
        return list(dict.fromkeys(merged_keywords))