Spaces:

Ruslan-DS
/

nlp-project

Running

Update models/preprocess_stage/preprocess_lstm.py

b5b6516 6 months ago

No virus

1.3 kB

	import numpy as np
	import torch
	import json
	import regex as re
	import string
	import nltk
	nltk.download('stopwords')
	from nltk.corpus import stopwords

	# Russian stop words from the NLTK corpus; clean() drops tokens found in this set.
	stop_words = set(stopwords.words('russian'))

	# Decode the vocabulary explicitly as UTF-8: without `encoding`, open() falls
	# back to the platform locale, so non-ASCII tokens could be mis-decoded (or
	# raise UnicodeDecodeError) on systems whose default is not UTF-8.
	# NOTE(review): the path is relative, so it resolves against the process's
	# working directory — confirm the app is always started from the repo root.
	with open('models/datasets/vocab_to_int.json', 'r', encoding='utf-8') as file:
	    loaded_json = file.read()

	# Token -> integer id mapping used by preprocess_lstm().
	vocab_to_int = json.loads(loaded_json)

	# Code points of the lowercase English letters a-z; clean() deletes every
	# character whose code point appears in this list.
	list_eng_ord = [ord(eng_letter.lower()) for eng_letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']


	def clean(text):
	    """Normalize raw text into a single-space-separated string of tokens.

	    Processing order:
	      1. Lowercase the text.
	      2. Replace URLs (``http...``), ``@mentions``, ``#hashtags`` and digit
	         runs with a space.
	      3. Delete ASCII punctuation and every character whose code point is in
	         ``list_eng_ord`` (the English letters a-z).
	      4. Drop whitespace-separated tokens that are in ``stop_words``.
	      5. Delete the characters ``…``, ``«`` and ``»`` and collapse whitespace.

	    Returns the cleaned string with no leading or trailing whitespace.
	    """
	    normalized = text.lower()

	    # Each pattern is applied in turn; a match becomes one space so that the
	    # words on either side of it stay separated.
	    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'\d+'):
	        normalized = re.sub(pattern, ' ', normalized)

	    # Punctuation and English letters are deleted outright (no space is
	    # inserted), so the fragments around them fuse together.
	    kept_chars = []
	    for char in normalized:
	        if char in string.punctuation:
	            continue
	        if ord(char.lower()) in list_eng_ord:
	            continue
	        kept_chars.append(char)
	    normalized = ''.join(kept_chars)

	    tokens = []
	    for word in normalized.split():
	        if word in stop_words:
	            continue
	        # The ellipsis and guillemets are stripped only after the stop-word
	        # check, so a token such as '«не»' is not treated as a stop word.
	        stripped = ''.join(char for char in word if char not in '…«»')
	        if stripped:
	            tokens.append(stripped)

	    return ' '.join(tokens)


	def preprocess_lstm(text, MAX_LEN):
	    """Convert raw text into a list of exactly ``MAX_LEN`` vocabulary ids.

	    The text is cleaned with ``clean``, split on whitespace, and each token
	    is looked up in ``vocab_to_int``. Tokens that are missing from the
	    vocabulary, or whose id is falsy (e.g. 0), are skipped.

	    Args:
	        text: Raw input string.
	        MAX_LEN: Required length of the returned list.

	    Returns:
	        A list of ``MAX_LEN`` ids: the first ``MAX_LEN`` known tokens of the
	        text, right-padded with 0. An empty list if ``MAX_LEN`` is not
	        positive.
	    """
	    token_ids = []
	    for word in clean(text).split():
	        token_id = vocab_to_int.get(word)
	        # Truthiness (not `is not None`) is kept on purpose: it matches the
	        # original filter, which also skipped tokens mapped to id 0.
	        if token_id:
	            token_ids.append(token_id)

	    # Truncate before padding. Previously a text with more than MAX_LEN known
	    # tokens produced a negative pad count, `[0] * negative` was empty, and
	    # the result came back longer than MAX_LEN.
	    token_ids = token_ids[:max(MAX_LEN, 0)]

	    return token_ids + [0] * (MAX_LEN - len(token_ids))