tokenizer-arena/vocab/gpt2/test_fairseq_gpt2.py
eson's picture
update
751936e
raw
history blame
No virus
413 Bytes
from fairseq.data.encoders.gpt2_bpe import get_encoder
# Vocabulary files passed to fairseq's get_encoder (hard-coded for the local
# workspace this smoke script is run in).
ENCODER_JSON = '/workspace/fairseq-models/data/vocab/gpt2/encoder.json'
VOCAB_BPE = '/workspace/fairseq-models/data/vocab/gpt2/vocab.bpe'


def _print_round_trip(encoder, text):
    """Encode *text*, print the codes, then print the result of decoding them.

    Returns whatever ``encoder.encode`` produced so the caller can keep it.
    """
    encoded = encoder.encode(text)
    print(encoded)
    print(encoder.decode(encoded))
    return encoded


bpe = get_encoder(ENCODER_JSON, VOCAB_BPE)

# Short phrase first: show the codes and the decoded text.
codes = _print_round_trip(bpe, 'Hello world')

# Longer sentence: print what bpe.bpe() returns for the raw string, then run
# the same encode/decode round trip.
# NOTE(review): bpe.bpe() presumably applies the BPE merges to its argument;
# confirm against fairseq's encoder implementation.
test_str = 'Leonardo DiCaprio was born in Los Angeles'
print(bpe.bpe(test_str))
codes = _print_round_trip(bpe, test_str)