# ------------------------------------------------------------------------------------
# Minimal DALL-E
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------

import os
from functools import partial
from tokenizers import CharBPETokenizer


def build_tokenizer(path: str,
                    context_length: int = 64,
                    *args,
                    **kwargs):
    # Prefer the 16k BPE vocab/merges files; fall back to the generic
    # 'vocab.json'/'merges.txt' names if those files are not present.
    try:
        from_file = partial(CharBPETokenizer.from_file,
                            vocab_filename=os.path.join(path, 'bpe-16k-vocab.json'),
                            merges_filename=os.path.join(path, 'bpe-16k-merges.txt'),
                            unk_token='[UNK]')
        tokenizer = from_file(*args, **kwargs)
    except Exception:
        from_file = partial(CharBPETokenizer.from_file,
                            vocab_filename=os.path.join(path, 'vocab.json'),
                            merges_filename=os.path.join(path, 'merges.txt'),
                            unk_token='[UNK]')
        tokenizer = from_file(*args, **kwargs)

    # Pad and truncate every encoded sequence to a fixed context length.
    tokenizer.add_special_tokens(['[PAD]'])
    tokenizer.enable_padding(length=context_length,
                             pad_id=tokenizer.token_to_id('[PAD]'))
    tokenizer.enable_truncation(max_length=context_length)
    print(f'{path} successfully restored.')
    return tokenizer
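

# A minimal smoke test, assuming the `tokenizers` package is installed and
# the path points at a directory containing the vocab/merges files named
# above (the 'data/tokenizer' directory here is hypothetical):
if __name__ == '__main__':
    tokenizer = build_tokenizer('data/tokenizer', context_length=64)
    encoded = tokenizer.encode('a painting of a tree on the ocean')
    print(encoded.ids)      # exactly `context_length` ids, padded with [PAD]
    print(encoded.tokens)   # the corresponding subword tokens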