Spaces:

eubinecto
/

idiomify

Runtime error

idiomify / explore /explore_bart_tokenizer_add_special_tokens.py

a couple of explore scripts for: adding two special tokens (<idiom>, </idiom>)

927768a over 2 years ago

No virus

991 Bytes

	from transformers import BartTokenizer, BartForConditionalGeneration


	def main() -> None:
	    """Demonstrate adding special tokens to a BART tokenizer and resizing the model.

	    Loads the "facebook/bart-base" tokenizer and model, registers "<idiom>" and
	    "</idiom>" as additional special tokens, and prints:

	    1. the number of tokens that were added,
	    2. the tokenizer's list of additional special tokens,
	    3. the shape of the model's shared embedding weight before resizing,
	    4. the same shape after resizing the embeddings to ``len(tokenizer)``.
	    """
	    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
	    bart = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

	    num_added_tokens = tokenizer.add_special_tokens({
	        # Markers for the beginning and the end of an idiom.
	        "additional_special_tokens": ["<idiom>", "</idiom>"],
	    })
	    print(num_added_tokens)
	    print(tokenizer.additional_special_tokens)  # the new special tokens show up here

	    # The tokenizer's vocabulary has grown, so the model's embedding table must
	    # be resized to match; print its shape before and after to see the change.
	    print(bart.model.shared.weight.shape)  # before
	    bart.resize_token_embeddings(len(tokenizer))
	    print(bart.model.shared.weight.shape)  # after


	# Run the demonstration only when this file is executed as a script.
	if __name__ == '__main__':
	    main()

	"""
	2
	['<idiom>', '</idiom>']
	torch.Size([50265, 768])
	torch.Size([50267, 768]) # you can see that 2 more embedding vectors have been added here.
	Later, you may want to save the tokenizer after adding the idiom special tokens, so that they do not have to be added again.
	"""