""" | |
special_symbols: https://github.com/google/sentencepiece/blob/master/doc/special_symbols.md | |
""" | |
import sys

# From https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L55
# What does this do?
def bytes_to_unicode():
    """
    Returns a dict mapping utf-8 byte values to unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    # unichr is the Python 2 spelling of chr; on Python 3 this is just chr
    _chr = unichr if sys.version_info[0] == 2 else chr
    # the 188 byte values that already render as printable, non-space latin-1
    # characters ('!'..'~', '¡'..'¬', '®'..'ÿ') map to themselves
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
        list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    # the remaining 68 bytes (controls, space, 0x7f-0xa0, 0xad) are shifted up
    # into unused code points 256, 257, ... so every byte gets a printable character
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))  # byte value -> single printable unicode character
byte_encoder = bytes_to_unicode()
print(byte_encoder)
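
# A minimal round-trip sketch of what this table is for (byte_encoder / byte_decoder
# are the names the reference GPT-2 encoder uses; the example string is arbitrary):
# every UTF-8 byte of the input becomes one printable unicode character, and the
# inverted table recovers the original bytes exactly.
byte_decoder = {c: b for b, c in byte_encoder.items()}  # invert: character -> byte value

text = "hello 世界"  # any string, non-ASCII included
encoded = "".join(byte_encoder[b] for b in text.encode("utf-8"))
print(encoded)  # 'helloĠä¸ĸçķĮ' -- the space (byte 32) became 'Ġ', i.e. chr(256 + 32)

decoded = bytes(byte_decoder[c] for c in encoded).decode("utf-8")
assert decoded == text  # the byte <-> unicode mapping is lossless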