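# Preprocessing and lemmatization helpers built on the natasha NLP library:
# strip markup from a DataFrame's 'text' column, segment Russian text into
# sentences, tag morphology, parse syntax, and collect lemmas.
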
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    Doc
)


# Shared natasha components; a single NewsEmbedding instance backs both the
# morphological tagger and the syntax parser.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)

def text_preproc(df):
    """Strip markup (<com .../> tags, other angle-bracket tags, *, #, [, ]) from df['text']."""
    patterns = [r'<com id=\d+"/>', r'<com id="\d+"/>', r'<\w+>', r'</\w+>',
                r'<\w+\s/>', r'\*', r'#', r'\[', r'\]']
    for pattern in patterns:
        df['text'] = df['text'].str.replace(pattern, '', regex=True)
    return df


def tokenizing(text):
    """Split text into sentences and return a list of lemmatized natasha
    token lists, one list per sentence."""
    doc = Doc(text)
    doc.segment(segmenter)
    tokens = []
    for sent in doc.sents:
        # Each sentence is re-wrapped in its own Doc and annotated independently.
        sent = Doc(sent.text)
        sent.segment(segmenter)
        sent.parse_syntax(syntax_parser)
        sent.tag_morph(morph_tagger)
        for token in sent.tokens:
            token.lemmatize(morph_vocab)
        tokens.append(sent.tokens)
    return tokens


def get_all_lemmas(tokens):
    """Flatten per-sentence token lists into a single list of lemmas."""
    return [token.lemma for sent in tokens for token in sent]


def get_set_all_lemmas(tokens):
    """Return the set of unique lemmas across all sentences."""
    return set(get_all_lemmas(tokens))


def get_set_sent_lemmas(sent_tokens):
    """Return the set of unique lemmas in a single sentence."""
    return {token.lemma for token in sent_tokens}


def get_sent_tokens(sent_tokens):
    """Return the sentence's tokens as a plain list."""
    return list(sent_tokens)
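

# Usage sketch (illustrative, not part of the original pipeline): chaining the
# helpers above on a pandas DataFrame with a 'text' column. The sample Russian
# sentence and variable names below are assumptions for demonstration only.
if __name__ == '__main__':
    import pandas as pd

    df = pd.DataFrame({'text': ['<p>Мама мыла раму. Папа читал газету.</p>']})
    df = text_preproc(df)                   # strip markup
    sents = tokenizing(df.loc[0, 'text'])   # per-sentence lemmatized tokens
    print(get_set_all_lemmas(sents))        # unique lemmas across the text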