import json
import string

import nltk
import regex as re
from nltk.corpus import stopwords

# Fetch the Russian stop-word list on first run; this is a no-op once cached.
nltk.download('stopwords', quiet=True)

stop_words = set(stopwords.words('russian'))

# Word-to-index vocabulary built at training time; index 0 is used for padding.
with open('models/datasets/vocab_to_int.json', 'r') as file:
    vocab_to_int = json.load(file)

# Lowercase Latin letters, used by clean() to strip English characters.
eng_letters = set(string.ascii_lowercase)


def clean(text):
    """Lowercase a raw Russian text and strip URLs, mentions, hashtags,
    digits, punctuation, Latin letters, stop words and typographic marks."""
    text = text.lower()
    text = re.sub(r'http\S+', ' ', text)  # URLs
    text = re.sub(r'@\w+', ' ', text)     # @mentions
    text = re.sub(r'#\w+', ' ', text)     # #hashtags
    text = re.sub(r'\d+', ' ', text)      # digits
    text = ''.join(letter for letter in text if letter not in string.punctuation)
    text = ''.join(letter for letter in text if letter not in eng_letters)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Typographic characters that string.punctuation does not cover.
    text = ''.join(letter for letter in text if letter not in '…«»')
    # Collapse any repeated whitespace left over from the removals above.
    return ' '.join(text.split())
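
# Example (hypothetical input): clean('Привет, мир! http://example.com #тест 123')
# returns 'привет мир' ("hello world"); the URL, hashtag, digits and
# punctuation are all stripped.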


def preprocess_lstm(text, MAX_LEN):
    """Clean a text and encode it as a fixed-length list of vocabulary
    indices, truncated and right-padded with zeros to exactly MAX_LEN."""
    cleaned_text = clean(text)

    # A membership test (rather than a truthiness check on .get()) also keeps
    # any word that happens to map to index 0.
    text_to_int = [vocab_to_int[word] for word in cleaned_text.split() if word in vocab_to_int]
    # Truncate first; otherwise a long input makes the padding count negative
    # and the returned sequence exceeds MAX_LEN.
    text_to_int = text_to_int[:MAX_LEN]

    return text_to_int + [0] * (MAX_LEN - len(text_to_int))
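

# Minimal usage sketch. MAX_LEN and the sample sentence below are illustrative
# assumptions; MAX_LEN must match the sequence length the model was trained with.
if __name__ == '__main__':
    import torch

    MAX_LEN = 32  # hypothetical sequence length
    # "I really liked this movie!"
    encoded = preprocess_lstm('Мне очень понравился этот фильм!', MAX_LEN)
    # Add a batch dimension for an LSTM that expects (batch, seq_len) input.
    batch = torch.tensor([encoded], dtype=torch.long)
    print(batch.shape)  # torch.Size([1, 32])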