# NOTE(review): the six lines below were scraper residue from the Hugging Face
# Spaces file viewer (page status, file size, commit hashes, line-number rail),
# not Python code. Preserved here as comments so the file parses.
#   Spaces: / Sleeping / Sleeping
#   File size: 1,303 Bytes
#   Commits: 4c883db 415c049 b5b6516 4c883db
import numpy as np
import torch
import json
import regex as re
import string
import nltk
# --- Module-level setup: stopword set, trained vocabulary, Latin codepoints ---

# Make sure the NLTK stopword corpus is present before building the set.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Russian stopwords, as a set for O(1) membership tests in clean().
stop_words = set(stopwords.words('russian'))

# Word -> integer-id mapping produced at training time.
# Encoding pinned to UTF-8 so the JSON parses identically on every platform
# (the original relied on the locale default encoding).
with open('models/datasets/vocab_to_int.json', 'r', encoding='utf-8') as file:
    # json.load reads and parses in one step (was: file.read() + json.loads()).
    vocab_to_int = json.load(file)

# Codepoints of the lowercase Latin alphabet a-z; clean() uses these to strip
# English letters from otherwise-Russian text.
list_eng_ord = [ord(eng_letter) for eng_letter in string.ascii_lowercase]
def clean(text):
    """Normalize raw (Russian) text for the LSTM tokenizer.

    Pipeline: lowercase -> strip URLs / @mentions / #hashtags / digit runs ->
    drop ASCII punctuation and Latin letters -> drop Russian stopwords ->
    drop leftover typographic characters -> strip outer whitespace.

    Args:
        text: arbitrary input string.

    Returns:
        The cleaned string (may be empty if nothing survives filtering).
    """
    text = text.lower()
    # Replace noisy token classes with spaces so neighbors stay separated.
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    # One character pass instead of the original's two join loops:
    # drop ASCII punctuation and any Latin letter. The set gives O(1)
    # membership instead of scanning the 26-element list per character.
    eng_ords = set(list_eng_ord)
    text = ''.join(
        ch for ch in text
        if ch not in string.punctuation and ord(ch.lower()) not in eng_ords
    )
    # Remove Russian stopwords (done before the typographic-char pass,
    # matching the original order).
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Typographic characters that string.punctuation does not cover.
    text = ''.join(ch for ch in text if ch not in '…«»')
    # NOTE: the original also filtered `word not in ' '`, which is a no-op
    # (str.split() never yields '' or ' '), so that pass is dropped here.
    return text.strip()
def preprocess_lstm(text, MAX_LEN):
    """Convert raw text into a fixed-length sequence of vocabulary ids.

    Args:
        text: raw input string; passed through clean() first.
        MAX_LEN: target sequence length expected by the LSTM.

    Returns:
        A list of exactly MAX_LEN ints: ids of in-vocabulary words,
        truncated to MAX_LEN and right-padded with 0 (the pad id).
    """
    cleaned_text = clean(text)
    # `word in vocab_to_int` is a single lookup and, unlike the original's
    # truthiness test on .get(), does not silently drop a word whose id is 0.
    text_to_int = [vocab_to_int[word] for word in cleaned_text.split()
                   if word in vocab_to_int]
    # Truncate first — the original returned a sequence LONGER than MAX_LEN
    # for long inputs, breaking the fixed-length contract — then right-pad.
    text_to_int = text_to_int[:MAX_LEN]
    return text_to_int + [0] * (MAX_LEN - len(text_to_int))