Spaces:
Build error
Build error
File size: 1,520 Bytes
3d5e231 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# ------------------------------------------------------------------------------------
# Minimal DALL-E
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------
import os
from functools import partial
from tokenizers import CharBPETokenizer
def build_tokenizer(path: str,
                    context_length: int = 64,
                    *args,
                    **kwargs):
    """Restore a CharBPE tokenizer from *path* and configure fixed-length output.

    The tokenizer is first loaded using the released checkpoint's filenames
    (``bpe-16k-vocab.json`` / ``bpe-16k-merges.txt``); if that fails, the
    generic HuggingFace filenames (``vocab.json`` / ``merges.txt``) are tried.
    A ``[PAD]`` special token is registered and padding/truncation are enabled
    so every encoding has exactly ``context_length`` tokens.

    Args:
        path: Directory containing the vocab and merges files.
        context_length: Target sequence length for padding and truncation.
        *args, **kwargs: Forwarded to ``CharBPETokenizer.from_file``.

    Returns:
        The configured ``CharBPETokenizer`` instance.
    """
    try:
        # Preferred filenames used by the released 16k-vocab BPE checkpoint.
        from_file = partial(CharBPETokenizer.from_file,
                            vocab_filename=os.path.join(path, 'bpe-16k-vocab.json'),
                            merges_filename=os.path.join(path, 'bpe-16k-merges.txt'),
                            unk_token='[UNK]')
        tokenizer = from_file(*args, **kwargs)
    except Exception:
        # Fall back to the generic HuggingFace filenames. NOTE: was a bare
        # `except:` before, which also swallowed KeyboardInterrupt/SystemExit.
        from_file = partial(CharBPETokenizer.from_file,
                            vocab_filename=os.path.join(path, 'vocab.json'),
                            merges_filename=os.path.join(path, 'merges.txt'),
                            unk_token='[UNK]')
        tokenizer = from_file(*args, **kwargs)

    # Pad every sequence out to context_length with a dedicated [PAD] token,
    # and truncate anything longer, so downstream code sees fixed-size input.
    tokenizer.add_special_tokens(['[PAD]'])
    tokenizer.enable_padding(length=context_length,
                             pad_id=tokenizer.token_to_id('[PAD]'))
    tokenizer.enable_truncation(max_length=context_length)
    print(f'{path} successfully restored..')
    return tokenizer
|