import json
import string

import nltk
import numpy as np
import regex as re
import torch
from nltk.corpus import stopwords

# Download the NLTK stopword list (a no-op if it is already cached locally).
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

# Word-to-index mapping built during training.
with open('models/datasets/vocab_to_int.json', 'r') as file:
    vocab_to_int = json.loads(file.read())

# Code points of the Latin alphabet, used below to strip English letters.
list_eng_ord = [ord(eng_letter.lower()) for eng_letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']


def clean(text):
    """Lowercase the text and strip URLs, mentions, hashtags, digits,
    punctuation, Latin letters and Russian stopwords."""
    text = text.lower()
    text = re.sub(r'http\S+', ' ', text)   # URLs
    text = re.sub(r'@\w+', ' ', text)      # @mentions
    text = re.sub(r'#\w+', ' ', text)      # #hashtags
    text = re.sub(r'\d+', ' ', text)       # digits
    text = ''.join(letter for letter in text if letter not in string.punctuation)
    text = ''.join(letter for letter in text if ord(letter) not in list_eng_ord)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text = ''.join(letter for letter in text if letter not in '…«»')
    text = ' '.join(text.split())           # collapse repeated whitespace
    return text.strip()


def preprocess_lstm(text, MAX_LEN):
    """Clean the text, map in-vocabulary words to indices and pad
    (or truncate) the sequence to exactly MAX_LEN with trailing zeros."""
    cleaned_text = clean(text)
    text_to_int = [vocab_to_int[word] for word in cleaned_text.split() if word in vocab_to_int]
    text_to_int = text_to_int[:MAX_LEN]  # truncate so the output never exceeds MAX_LEN
    padded_text = text_to_int + [0] * (MAX_LEN - len(text_to_int))
    return padded_text
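

# Minimal usage sketch (illustrative only): the MAX_LEN value and the sample tweet
# below are assumptions and must match whatever was used when the LSTM was trained;
# the model itself is loaded elsewhere.
if __name__ == '__main__':
    MAX_LEN = 64  # assumed sequence length; replace with the training-time value
    sample = 'Пример твита с http://example.com и @упоминанием #хэштег 123'  # Russian sample tweet
    padded = preprocess_lstm(sample, MAX_LEN)
    # Wrap the padded index sequence in a batch-of-one tensor for the LSTM.
    input_tensor = torch.tensor([padded], dtype=torch.long)  # shape: (1, MAX_LEN)
    print(input_tensor.shape)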