Spaces:

Ruslan-DS
/

nlp-project

Sleeping

Ruslan-DS committed on Dec 16, 2023

Commit

4c883db

•

1 Parent(s): 70aee91

Update models/preprocess_stage/preprocess_lstm.py

Files changed (1) hide show

models/preprocess_stage/preprocess_lstm.py CHANGED Viewed

+import numpy as np
+import torch
+import json
+import regex as re
+import string
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+with open('models/datasets/vocab_to_int.json', 'r') as file:
+    loaded_json = file.read()
+vocab_to_int = json.loads(loaded_json)
+list_eng_ord = [ord(eng_letter.lower()) for eng_letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
+def clean(text):
+    text = text.lower()
+    text = re.sub(r'http\S+', " ", text)
+    text = re.sub(r'@\w+', ' ', text)
+    text = re.sub(r'#\w+', ' ', text)
+    text = re.sub(r'\d+', ' ', text)
+    text = ''.join([letter for letter in text if letter not in string.punctuation])
+    text = ''.join([letter for letter in text if ord(letter.lower()) not in list_eng_ord])
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = ''.join([letter for letter in text if letter not in '…«»'])
+    text = ' '.join([word for word in text.split() if word not in '    '])
+    return text.strip()
+def preprocess_lstm(text, MAX_LEN):
+    cleaned_text = clean(text)
+    text_to_int = [vocab_to_int[word] for word in cleaned_text.split() if vocab_to_int.get(word)]
+    padded_text = text_to_int + [0] * (MAX_LEN - len(text_to_int))
+    return padded_text