File size: 777 Bytes
cb2adb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re
import string

import nltk
import pymorphy2
from nltk.tokenize import word_tokenize

# Fetch the "punkt" tokenizer data used by nltk's word_tokenize.
# This is a module-level call, so it runs every time the module is imported.
# NOTE(review): newer NLTK releases appear to look up "punkt_tab" instead of
# "punkt" — confirm against the NLTK version this project pins.
nltk.download("punkt")


def clean_text(text: str) -> str:
    """Normalize raw text before tokenization.

    The text is lower-cased, then three regex removals are applied in order:

    1. any word containing the same character three or more times in a row;
    2. every run of digits together with the word characters that follow it;
    3. every ``[...]`` span (shortest match).

    Finally all characters in ``string.punctuation`` (ASCII punctuation only)
    are stripped. Whitespace is left untouched, so removed words may leave
    extra spaces behind.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    removal_patterns = (
        r"\w*(\w)\1{2,}\w*",  # words with a character repeated 3+ times
        r"\d+\w*",  # digits plus any trailing word characters
        r"\[.*?\]",  # bracketed spans
    )
    result = text.lower()
    for pattern in removal_patterns:
        result = re.sub(pattern, "", result)
    punctuation_table = str.maketrans("", "", string.punctuation)
    return result.translate(punctuation_table)


# Shared analyzer instance, created on first use. Constructing a
# MorphAnalyzer loads its dictionaries, so building one per call is wasteful.
_morph_analyzer = None


def _get_morph_analyzer() -> "pymorphy2.MorphAnalyzer":
    """Return the module-wide MorphAnalyzer, creating it on first access."""
    global _morph_analyzer
    if _morph_analyzer is None:
        _morph_analyzer = pymorphy2.MorphAnalyzer()
    return _morph_analyzer


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize *text* and reduce every token to its normal form.

    Tokens come from nltk's ``word_tokenize`` (called with its default
    language setting). Each token is parsed with pymorphy2 and the
    ``normal_form`` of the first parse result is taken.

    Args:
        text: The text to tokenize; expected to be already cleaned.

    Returns:
        A list of lemmas, one per token, in token order. Empty when the
        tokenizer yields no tokens.
    """
    morph = _get_morph_analyzer()
    # parse() returns candidate analyses; the first one is used, as before.
    return [morph.parse(token)[0].normal_form for token in word_tokenize(text)]


def data_preprocessing(text: str) -> list[str]:
    """Run the full preprocessing pipeline on *text*.

    The text is first passed through ``clean_text`` and the result is then
    handed to ``lemmize_and_tokenize_text``.

    Args:
        text: The raw input string.

    Returns:
        The list of lemmas produced from the cleaned text.
    """
    return lemmize_and_tokenize_text(clean_text(text))