Ruslan-DS committed on
Commit
4c883db
1 Parent(s): 70aee91

Update models/preprocess_stage/preprocess_lstm.py

Browse files
models/preprocess_stage/preprocess_lstm.py CHANGED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import json
4
+ import regex as re
5
+ import string
6
+ from nltk.corpus import stopwords
7
+
8
+
9
# Russian stop words, kept in a set so the per-word membership test in
# clean() is a constant-time lookup.
stop_words = set(stopwords.words('russian'))

# Vocabulary used by preprocess_lstm() to turn words into integer indices.
# The encoding is passed explicitly: without it the file is decoded with the
# platform default, which can corrupt non-ASCII (Cyrillic) keys on systems
# whose locale is not UTF-8.
with open('models/datasets/vocab_to_int.json', 'r', encoding='utf-8') as file:
    loaded_json = file.read()

vocab_to_int = json.loads(loaded_json)

# Code points of the lowercase Latin letters a-z; clean() drops every
# character whose code point appears in this list.
list_eng_ord = [ord(eng_letter) for eng_letter in string.ascii_lowercase]
17
+
18
+
19
def clean(text):
    """Normalize a raw text string before it is mapped to vocabulary indices.

    Steps, in order: lowercase the text; replace URLs, @mentions, #hashtags
    and digit runs with a space; delete ASCII punctuation and Latin letters;
    drop words found in ``stop_words``; delete the characters ``…``, ``«``
    and ``»``; collapse whitespace to single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    lowered = text.lower()

    # Order matters: URLs go first so that the later patterns do not split
    # them into fragments.
    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'\d+'):
        lowered = re.sub(pattern, ' ', lowered)

    # Single pass that discards ASCII punctuation and Latin letters. These
    # characters are deleted, not replaced by a space, so the text on both
    # sides of a removed character is joined together.
    kept_chars = []
    for char in lowered:
        if char in string.punctuation:
            continue
        if ord(char.lower()) in list_eng_ord:
            continue
        kept_chars.append(char)
    letters_only = ''.join(kept_chars)

    content_words = [
        token for token in letters_only.split() if token not in stop_words
    ]
    without_stop_words = ' '.join(content_words)

    # The typographic quotes and ellipsis are stripped only after the
    # stop-word filter has run, exactly as in the original pipeline.
    without_quotes = without_stop_words.translate(str.maketrans('', '', '…«»'))

    # Re-splitting removes the empty gaps left by words that consisted
    # solely of the characters deleted above.
    return ' '.join(without_quotes.split())
33
+
34
+
35
def preprocess_lstm(text, MAX_LEN):
    """Convert raw text into a list of vocabulary indices of length MAX_LEN.

    The text is cleaned with ``clean``, each remaining word is looked up in
    ``vocab_to_int``, and the resulting index list is truncated or
    right-padded with zeros so that it is exactly ``MAX_LEN`` items long.

    Args:
        text: The raw input string.
        MAX_LEN: Target length of the returned list.

    Returns:
        A list of ``MAX_LEN`` integers: the indices of the first ``MAX_LEN``
        known words, followed by as many zeros as needed. A non-positive
        ``MAX_LEN`` yields an empty list.
    """
    cleaned_text = clean(text)

    # Words missing from the vocabulary are skipped, as is any word mapped
    # to a falsy index such as 0 (the value also used for padding).
    text_to_int = []
    for word in cleaned_text.split():
        index = vocab_to_int.get(word)
        if index:
            text_to_int.append(index)

    # Previously a text with more than MAX_LEN known words was returned
    # unshortened, because the pad count went negative and added nothing.
    # Truncating first guarantees a fixed-length result.
    text_to_int = text_to_int[:max(MAX_LEN, 0)]
    padded_text = text_to_int + [0] * (MAX_LEN - len(text_to_int))

    return padded_text