# nlp-project/models/preprocess_stage/preprocess_lstm.py
# Ruslan-DS's picture
# Update models/preprocess_stage/preprocess_lstm.py
# b5b6516
import numpy as np
import torch
import json
import regex as re
import string
import nltk
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# importing this module does not contact the network on every start-up.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords

# Russian stop words, held in a set so membership tests in clean() are O(1).
stop_words = set(stopwords.words('russian'))
# Word -> index vocabulary used by preprocess_lstm(). The path is relative to
# the current working directory. The encoding is given explicitly so decoding
# does not depend on the platform's locale default (the keys are presumably
# Russian words -- confirm against the dataset).
with open('models/datasets/vocab_to_int.json', 'r', encoding='utf-8') as file:
    loaded_json = file.read()
vocab_to_int = json.loads(loaded_json)

# Code points of the lowercase Latin letters a-z; clean() drops every
# character whose code point appears in this list.
list_eng_ord = [ord(eng_letter) for eng_letter in string.ascii_lowercase]
def clean(text):
    """Normalize raw text into a single-space-separated string of tokens.

    Steps, in order: lowercase the text; replace URLs (``http...``),
    ``@mentions``, ``#hashtags`` and digit runs with a space; delete ASCII
    punctuation and every character whose code point is in ``list_eng_ord``;
    drop words found in ``stop_words``; delete the characters ``…``, ``«``
    and ``»``; collapse whitespace.

    Args:
        text: The input string.

    Returns:
        The cleaned text, with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    lowered = text.lower()

    # Applied in this order: e.g. digits inside a URL disappear together with
    # the URL instead of splitting it into pieces first.
    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'\d+'):
        lowered = re.sub(pattern, ' ', lowered)

    # Single pass: a character survives only if it is neither ASCII
    # punctuation nor a letter whose code point is listed in list_eng_ord.
    # Characters are deleted, not replaced, so neighbours are joined.
    survivors = [
        char for char in lowered
        if char not in string.punctuation
        and ord(char.lower()) not in list_eng_ord
    ]

    # Stop words are filtered before '…«»' are stripped, so a word still
    # carrying one of those characters is compared as-is.
    kept_words = [
        token for token in ''.join(survivors).split()
        if token not in stop_words
    ]

    without_marks = ' '.join(kept_words)
    for mark in '…«»':
        without_marks = without_marks.replace(mark, '')

    # Removing marks can leave empty gaps; split/join squeezes them out and
    # never produces leading or trailing whitespace.
    return ' '.join(without_marks.split())
def preprocess_lstm(text, MAX_LEN):
    """Convert raw text into a fixed-length list of vocabulary indices.

    The text is passed through ``clean`` and each resulting word is looked up
    in ``vocab_to_int``. Words that are missing from the vocabulary, or whose
    stored index is falsy (such as 0, the padding value), are skipped. The
    sequence is then cut to at most ``MAX_LEN`` entries and right-padded with
    zeros up to ``MAX_LEN``.

    Args:
        text: Raw input string.
        MAX_LEN: Length of the returned sequence (expected to be >= 0).

    Returns:
        A list of exactly ``MAX_LEN`` items: vocabulary indices followed by
        zero padding.
    """
    words = clean(text).split()

    # dict.get yields None for unknown words; the truthiness filter drops
    # those as well as any entry mapped to 0.
    indices = [index for index in map(vocab_to_int.get, words) if index]

    # Keep only the first MAX_LEN tokens. Without this, longer inputs came
    # back unpadded and longer than MAX_LEN, since [0] * negative is [].
    indices = indices[:MAX_LEN]

    return indices + [0] * (MAX_LEN - len(indices))