anthony.galtier
Added light code files
06a851e
raw history blame
No virus
878 Bytes
from transformers import CamembertTokenizer
def get_tokenizer(model_name='camembert-base'):
    """Load a CamemBERT tokenizer.

    Args:
        model_name: Identifier forwarded unchanged to
            ``CamembertTokenizer.from_pretrained``.

    Returns:
        Whatever ``CamembertTokenizer.from_pretrained(model_name)`` returns.
    """
    return CamembertTokenizer.from_pretrained(model_name)
def tokenize_encode_corpus(tokenizer, descriptions, max_len):
    """Encode ``descriptions`` by calling ``tokenizer`` with fixed options.

    Args:
        tokenizer: Callable invoked with keyword arguments only.
        descriptions: Text input, passed through as ``text``.
        max_len: Value passed through as ``max_length``.

    Returns:
        The object returned by the ``tokenizer`` call.
    """
    # Fixed encoding options: special tokens added, padding to
    # 'max_length', 'longest_first' truncation, attention mask requested.
    encoding_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'truncation': 'longest_first',
        'max_length': max_len,
        'return_attention_mask': True,
    }
    return tokenizer(text=descriptions, **encoding_options)
def extract_inputs_masks(encoded_corpus):
    """Pull the input ids and attention mask out of an encoded corpus.

    Args:
        encoded_corpus: Mapping-like object indexed by string keys and
            exposing ``keys()``.

    Returns:
        A tuple ``(input_ids, attention_mask)`` when both the
        ``'input_ids'`` and ``'attention_mask'`` keys are present.
        If either key is missing, the available keys are printed and
        ``None`` is returned.
    """
    try:
        input_ids = encoded_corpus['input_ids']
        attention_mask = encoded_corpus['attention_mask']
    except KeyError:
        # Only a missing key is an expected failure here. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit and unrelated
        # bugs, misreporting them as a key problem.
        print('Available keys are = ', encoded_corpus.keys())
        return None
    return input_ids, attention_mask