from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    Doc,
)

# Shared Natasha components; the embedding is loaded once and reused
# by both the morphology tagger and the syntax parser.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)


def text_preproc(df):
    """Strip HTML-like tags and Markdown markup from the 'text' column."""
    patterns = [r'', r'', r'<\w+>', r'<\/\w+>', r'<\w+\s/>', r'\*', r'#', r'\[', r'\]']
    for pattern in patterns:
        df['text'] = df['text'].str.replace(pattern, '', regex=True)
    return df


def tokenizing(text):
    """Split text into sentences, then tag, parse and lemmatize each one.

    Returns a list of per-sentence token lists; every token carries
    morphology, syntax annotations and a lemma.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    tokens = []
    for sent in doc.sents:
        # Re-wrap each sentence in its own Doc so tagging and parsing
        # run one sentence at a time.
        sent_doc = Doc(sent.text)
        sent_doc.segment(segmenter)
        sent_doc.parse_syntax(syntax_parser)
        sent_doc.tag_morph(morph_tagger)
        for token in sent_doc.tokens:
            token.lemmatize(morph_vocab)
        tokens.append(sent_doc.tokens)
    return tokens


def get_all_lemmas(tokens):
    """Flatten the per-sentence token lists into one list of lemmas."""
    return [token.lemma for sent in tokens for token in sent]


def get_set_all_lemmas(tokens):
    """Unique lemmas across the whole document."""
    return set(get_all_lemmas(tokens))


def get_set_sent_lemmas(sent_tokens):
    """Unique lemmas within a single sentence."""
    return {token.lemma for token in sent_tokens}


def get_sent_tokens(sent_tokens):
    """Return a sentence's tokens as a plain list."""
    return list(sent_tokens)
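

# A minimal usage sketch: the sample DataFrame and the Russian sentence are
# illustrative assumptions, not part of the pipeline above.
import pandas as pd

df = pd.DataFrame({'text': ['<b>Пример</b> *текста* для обработки. Вторая фраза.']})
df = text_preproc(df)                         # markup stripped from the 'text' column

sent_tokens = tokenizing(df.loc[0, 'text'])   # list of per-sentence token lists
print(get_all_lemmas(sent_tokens))            # all lemmas, in document order
print(get_set_all_lemmas(sent_tokens))        # unique lemmas over the whole text
print(get_set_sent_lemmas(sent_tokens[0]))    # unique lemmas of the first sentence
print(get_sent_tokens(sent_tokens[0]))        # tokens of the first sentence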