Spaces:
Sleeping
Sleeping
import torch | |
import torchtext | |
import re | |
def clean_text(text):
    """Normalize raw text before tokenization.

    The text is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed,
    runs of whitespace are collapsed to a single space, and leading and
    trailing whitespace is trimmed.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation marks
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace only AFTER punctuation is gone: deleting a
    # standalone mark (e.g. "a , b") would otherwise leave a double space.
    text = re.sub(r'\s+', ' ', text)
    # Trim last, since removing leading/trailing punctuation can expose spaces
    return text.strip()
def get_preprocess(vocab_path):
    """Build a text preprocessing callable backed by a saved vocabulary.

    Args:
        vocab_path: Location of the vocabulary object, passed to
            ``torch.load``. NOTE(review): ``torch.load`` unpickles the
            file, so it should only point at a trusted source.

    Returns:
        A one-argument callable that cleans the text with ``clean_text``,
        tokenizes it with torchtext's ``'basic_english'`` tokenizer, and
        passes the tokens to the loaded vocabulary (presumably yielding
        token indices -- confirm against the saved vocab object).
    """
    vocabulary = torch.load(vocab_path)
    tokenize = torchtext.data.utils.get_tokenizer('basic_english')

    def preprocess(text):
        # clean -> tokenize -> vocabulary lookup
        cleaned = clean_text(text)
        tokens = tokenize(cleaned)
        return vocabulary(tokens)

    return preprocess