import MeCab


def extract_keyphrase_candidates(text, tokenizer):
    """Extract keyphrase candidates from Japanese text.

    A candidate is a run of nouns (名詞) optionally preceded by adjectives
    (形容詞), at least two tokens long. Returns (tokens, candidates), where
    tokens is the list of MeCab surface forms in document order and each
    candidate is ('tok1_tok2_...', (start, end)) with inclusive token
    indices. `tokenizer` is accepted but currently unused.
    """
    tagger = MeCab.Tagger()
    tagger.parse("")  # workaround for a known bug in older mecab-python bindings

    # Split MeCab output into (surface, coarse POS) pairs, e.g. ('猫', '名詞').
    parsed = [line.split('\t') for line in tagger.parse(text).split('\n') if line]
    t = [(fields[0], fields[1].split(',')[0]) for fields in parsed if len(fields) > 1]

    keyphrase_candidates = []
    phrase = []
    tokens = []
    end_pos = -1
    idx = len(t) - 1  # scan right to left so an adjective is seen after its noun

    while idx >= 0:
        # Copy tokens into `tokens` until the next noun is found.
        while idx >= 0 and t[idx][1] != '名詞':
            tokens.append(t[idx][0])
            idx -= 1

        # Consume the rightmost noun of a potential phrase and mark its end.
        if idx >= 0 and t[idx][1] == '名詞':
            tokens.append(t[idx][0])
            end_pos = len(tokens)
            phrase.append(t[idx][0])
            idx -= 1

        # Consume any further nouns immediately to the left.
        while idx >= 0 and t[idx][1] == '名詞':
            tokens.append(t[idx][0])
            phrase.append(t[idx][0])
            idx -= 1

        # Consume adjectives modifying the noun run.
        while idx >= 0 and t[idx][1] == '形容詞':
            tokens.append(t[idx][0])
            phrase.append(t[idx][0])
            idx -= 1

        # Keep phrases of two or more tokens. `phrase` was collected right to
        # left, so reverse it; positions are converted back to left-to-right
        # token indices, inclusive on both ends.
        if len(phrase) > 1:
            start_pos = len(tokens)
            keyphrase_candidates.append(
                ('_'.join(phrase[::-1]), (len(t) - start_pos, len(t) - end_pos)))

        phrase = []

    # Deduplicate candidates while restoring document order.
    phrase_set = set()
    outputs = []
    for keyphrase in keyphrase_candidates[::-1]:
        if keyphrase[0] not in phrase_set:
            outputs.append(keyphrase)
            phrase_set.add(keyphrase[0])
    return tokens[::-1], outputs
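

# A minimal usage sketch, assuming mecab-python3 and a Japanese dictionary
# (e.g. ipadic) are installed; the sample sentence is illustrative only, and
# `tokenizer` is passed as None since the function above does not use it.
if __name__ == '__main__':
    sample_text = '美しい花の写真を撮った。'
    tokens, candidates = extract_keyphrase_candidates(sample_text, tokenizer=None)
    print(tokens)
    for phrase, (start, end) in candidates:
        print(phrase, start, end)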