# NOTE(review): removed scrape artifacts ("Spaces:" / "Running" x2) that are
# not valid Python and appear to be residue from the hosting page, not code.
import json
import string

import nltk
import numpy as np
import regex as re
import torch
from nltk.corpus import stopwords

# One-time corpus fetch; a no-op when the stopword list is already cached.
nltk.download('stopwords')

# Russian stop words consumed by clean() below.
stop_words = set(stopwords.words('russian'))

# Word -> integer-id mapping produced at training time; must match the
# vocabulary the LSTM checkpoint was trained with.
with open('models/datasets/vocab_to_int.json', 'r') as file:
    vocab_to_int = json.load(file)

# Code points of the 26 lowercase ASCII latin letters; used by clean() to
# strip English characters out of Russian text.
list_eng_ord = [ord(letter) for letter in string.ascii_lowercase]
def clean(text):
    """Normalize a raw Russian text string for tokenization.

    Lowercases, strips URLs / @mentions / #hashtags / digit runs, removes
    ASCII punctuation, typographic quotes and latin letters, then drops
    Russian stop words.

    Args:
        text: arbitrary input string.

    Returns:
        A single-spaced, stripped string of the surviving words.
    """
    text = text.lower()
    text = re.sub(r'http\S+', ' ', text)  # URLs
    text = re.sub(r'@\w+', ' ', text)     # @mentions
    text = re.sub(r'#\w+', ' ', text)     # hashtags
    text = re.sub(r'\d+', ' ', text)      # digit runs
    # Remove punctuation, typographic leftovers and latin letters in one pass.
    # BUGFIX: '…«»' are now stripped *before* the stop-word filter, so a stop
    # word wrapped in typographic quotes (e.g. '«и»') no longer slips through.
    # (ord(letter.lower()) was redundant — text is already lowercased above.)
    text = ''.join(
        letter for letter in text
        if letter not in string.punctuation
        and letter not in '…«»'
        and ord(letter) not in list_eng_ord
    )
    # Drop stop words and collapse runs of whitespace. (The old trailing
    # `word not in ' '` pass was a no-op: str.split() never yields empty or
    # whitespace tokens.)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text.strip()
def preprocess_lstm(text, MAX_LEN):
    """Convert raw text into a fixed-length sequence of vocabulary ids.

    Args:
        text: raw input string; cleaned via clean() before lookup.
        MAX_LEN: target sequence length.

    Returns:
        A list of exactly MAX_LEN ints: in-vocabulary word ids in text order,
        right-padded with 0.

    BUGFIX: sequences longer than MAX_LEN are now truncated. The original
    computed `[0] * (MAX_LEN - len(...))`, which is empty for a negative
    count, so over-long inputs returned lists longer than MAX_LEN and broke
    fixed-size LSTM batching.
    """
    cleaned_text = clean(text)
    # Single dict lookup per word (the original looked each word up twice).
    # Words whose id is falsy (missing, or mapped to 0) are skipped, matching
    # the original `.get` guard.
    text_to_int = []
    for word in cleaned_text.split():
        word_id = vocab_to_int.get(word)
        if word_id:
            text_to_int.append(word_id)
    text_to_int = text_to_int[:MAX_LEN]  # truncate over-length sequences
    return text_to_int + [0] * (MAX_LEN - len(text_to_int))