import MeCab


def extract_keyphrase_candidates(text, tokenizer):
    """Extract keyphrase candidates from Japanese text.

    A candidate is a run of nouns (名詞) optionally preceded by adjectives
    (形容詞), at least two tokens long. Returns (tokens, candidates), where
    tokens is the list of MeCab surface forms in document order and each
    candidate is ('tok1_tok2_...', (start, end)) with inclusive token
    indices. `tokenizer` is accepted but currently unused.
    """
    tagger = MeCab.Tagger()
    tagger.parse("")  # workaround for a known bug in older mecab-python bindings

    # Split MeCab output into (surface, coarse POS) pairs, e.g. ('猫', '名詞').
    parsed = [line.split('\t') for line in tagger.parse(text).split('\n') if line]
    t = [(fields[0], fields[1].split(',')[0]) for fields in parsed if len(fields) > 1]

    keyphrase_candidates = []
    phrase = []
    tokens = []
    end_pos = -1
    idx = len(t) - 1  # scan right to left so an adjective is seen after its noun

    while idx >= 0:
        # Copy tokens into `tokens` until the next noun is found.
        while idx >= 0 and t[idx][1] != '名詞':
            tokens.append(t[idx][0])
            idx -= 1

        # Consume the rightmost noun of a potential phrase and mark its end.
        if idx >= 0 and t[idx][1] == '名詞':
            tokens.append(t[idx][0])
            end_pos = len(tokens)
            phrase.append(t[idx][0])
            idx -= 1

        # Consume any further nouns immediately to the left.
        while idx >= 0 and t[idx][1] == '名詞':
            tokens.append(t[idx][0])
            phrase.append(t[idx][0])
            idx -= 1

        # Consume adjectives modifying the noun run.
        while idx >= 0 and t[idx][1] == '形容詞':
            tokens.append(t[idx][0])
            phrase.append(t[idx][0])
            idx -= 1

        # Keep phrases of two or more tokens. `phrase` was collected right to
        # left, so reverse it; positions are converted back to left-to-right
        # token indices, inclusive on both ends.
        if len(phrase) > 1:
            start_pos = len(tokens)
            keyphrase_candidates.append(
                ('_'.join(phrase[::-1]), (len(t) - start_pos, len(t) - end_pos)))

        phrase = []

    # Deduplicate candidates while restoring document order.
    phrase_set = set()
    outputs = []
    for keyphrase in keyphrase_candidates[::-1]:
        if keyphrase[0] not in phrase_set:
            outputs.append(keyphrase)
            phrase_set.add(keyphrase[0])
    return tokens[::-1], outputs
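

# A minimal usage sketch, assuming mecab-python3 and a Japanese dictionary
# (e.g. ipadic) are installed; the sample sentence is illustrative only, and
# `tokenizer` is passed as None since the function above does not use it.
if __name__ == '__main__':
    sample_text = '美しい花の写真を撮った。'
    tokens, candidates = extract_keyphrase_candidates(sample_text, tokenizer=None)
    print(tokens)
    for phrase, (start, end) in candidates:
        print(phrase, start, end)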