# Spaces: Running
import numpy as np
import torch
from transformers import BertModel, BertTokenizer

# Name of the pretrained checkpoint; the tokenizer and the model are both
# loaded from this same identifier so their vocabularies match.
CHECKPOINT = 'DeepPavlov/rubert-base-cased'

# Loaded once at import time and shared as module-level objects.
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
model = BertModel.from_pretrained(CHECKPOINT)
def preprocess_bert(text, MAX_LEN):
    """Tokenize *text* and right-pad it to a fixed length.

    Args:
        text: Input string, encoded with the module-level ``tokenizer``
            (special tokens are added).
        MAX_LEN: Target sequence length. The tokenizer is asked to truncate
            longer inputs to this length; shorter ones are padded on the
            right.

    Returns:
        A ``(padded_text, attention_mask)`` tuple of 1-D NumPy arrays: the
        token ids followed by padding ids, and a mask holding 1 at the
        positions of encoded tokens and 0 at the padded positions.
    """
    token_ids = tokenizer.encode(
        text=text,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_LEN,
    )
    n_tokens = len(token_ids)

    # Prefer the tokenizer's own padding id over a hard-coded 0; fall back to
    # 0 (the previous behaviour) if the tokenizer defines no padding token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0
    padded_text = np.array(token_ids + [pad_id] * (MAX_LEN - n_tokens))

    # Build the mask from the encoded length rather than from the id values,
    # so a real token whose id equals the padding id is not masked out.
    attention_mask = np.zeros(len(padded_text), dtype=np.int_)
    attention_mask[:n_tokens] = 1
    return padded_text, attention_mask