|
from functools import lru_cache |
|
|
|
|
|
|
|
@lru_cache()
def bytes_to_unicode_dict():
    """
    Return a dict mapping unicode characters (as they appear in byte-level BPE
    vocab files) to their original utf-8 byte values. We specifically avoid
    mapping to whitespace/control characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.

    Note: this is the *inverse* of the classic GPT-2 ``bytes_to_unicode`` table —
    keys are the printable stand-in characters, values are byte integers 0..255.
    """
    # Bytes that are kept as-is: printable ASCII plus two Latin-1 ranges,
    # skipping whitespace/control characters and the soft hyphen (0xAD).
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    # Every remaining byte is mapped to a fresh codepoint above 255 so that
    # each of the 256 byte values gets a distinct printable character.
    shift = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + shift)
            shift += 1
    return {chr(c): b for c, b in zip(cs, bs)}
|
|
|
# Module-level lookup table: BPE stand-in unicode character -> utf-8 byte value.
# Built once at import time; bytes_to_unicode_dict is lru_cache'd so this is cheap.
ORD_UNICODE_MAP = bytes_to_unicode_dict()
|
|
|
|
|
@lru_cache()
def byte_to_char(bytestr):
    """Decode a single byte-level BPE token string back into readable text.

    Each character of *bytestr* is looked up in ORD_UNICODE_MAP to recover the
    utf-8 byte it stands for; the recovered byte sequence is then decoded,
    substituting U+FFFD for any invalid/partial utf-8 sequence.
    """
    raw = bytes(ORD_UNICODE_MAP[ch] for ch in bytestr)
    return raw.decode("utf-8", errors="replace")
|
|
|
|
|
def bytetokens_to_unicode(byte_tokens: list) -> list:
    """Convert a list of byte-level BPE token strings into readable unicode strings.

    Args:
        byte_tokens: tokens in the byte-to-unicode stand-in encoding (e.g. 'Ġ8').

    Returns:
        A list of decoded strings, one per input token.
    """
    return [byte_to_char(token) for token in byte_tokens]


# Backward-compatible alias: the original public name was misspelled
# ("unicdode"); keep it so existing callers continue to work.
bytetokens_to_unicdode = bytetokens_to_unicode
|
|
|
|
|
if __name__ == '__main__':

    # Demo/benchmark input: byte-level BPE tokens (GPT-2 style stand-in
    # characters, 'Ġ' = leading space, 'Ċ' = newline) for a Korean news snippet.
    tokens = [
        '<s>', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ëĮĢíijľ',
        'Ġë°±', 'ìĥģ', 'ìĹ½', ')', 'ê°Ģ', 'Ġìĺ¬íķ´', 'Ġ8', 'ìĽĶ',
        'Ġ기ì¤Ģ', 'Ġëĭ¤ìĪĺ', 'Ġê¶Į', 'ìľĦ', 'ĠìŀĪëĬĶ', 'Ġê¸Ģë¡ľë²Į', 'ĠíķĻ', 'íļĮìĹIJìĦľ',
        'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠìĿ¸ê³µ', 'ì§Ģ', 'ëĬ¥', '(',
        'A', 'I', ')', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł',
        'Ġ9', 'ìĿ¼', 'Ġë°ĿíĺĶ', 'ëĭ¤', '.', 'Ġì§ĢëĤľíķ´', 'Ġëĵ±', 'ìŀ¬',
        'íķľ', 'Ġ13', 'ê±´ë', '³´ëĭ¤', 'Ġ3', 'ê±´', 'Ġë§İìĿĢ', 'Ġëħ¼ë¬¸',
        'ìĿ´', 'Ġë°ĺ', 'ëħĦ', 'ìŬ', 'Ġë§ĮìĹIJ', 'Ġì±Ħ', 'íĥĿ', 'ëIJIJëĭ¤',
        '.', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ìĿ´',
        'íķĺ', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', ')', 'ëĬĶ', 'ĠA', 'I',
        'ĠìĹ°êµ¬', 'ĠìĦ±', '과를', 'ĠìĿ´', 'ìĸ´ê°Ģ', '기', 'ĠìľĦíķ´', 'ĠìĿ¸ìŀ¬',
        'ĠíĻķë³´', 'ìĹIJ', 'ĠìĨį', 'ëıĦ를', 'ĠëĨĴìĿ´', 'ê²łëĭ¤ëĬĶ', 'Ġë°©', '침',
        'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'ëĬĶ',
        'Ġ8', 'ìĽĶ', 'ĠìŀIJìĹ°', 'ìĸ´', 'ì²ĺ리', 'Ġë¶Ħìķ¼', 'ìĿĺ', 'Ġê¸Ģë¡ľë²Į',
        'Ġíĥij', 'ĠíķĻ', 'íļĮ', 'ìĿ¸', "Ġ'", 'A', 'C', 'L',
        '-', 'I', 'J', 'C', 'N', 'L', 'P', "'",
        'ìĹIJ', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġë°ľíijľ', 'íķľ', 'ĠìĤ¬ë¡Ģ', 'ê¹Įì§Ģ', 'Ġíķ©',
        'íķ´', 'Ġìĺ¬íķ´', 'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠA', 'I',
        'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġë°ĿíĺĶ', 'ëĭ¤', '.',
        'ĠìĿ´', 'Ġëħ¼ë¬¸', 'ìĿĢ', 'ĠìĿ¸ëıĦ', 'ë©Ķ', 'ìĿ¸', '(', 'in',
        '-', 'd', 'om', 'a', 'in', ')', 'Ġìĥĺ', 'íĶĮ',
        'ìĿĦ', 'ĠìĤ¬ìļ©', 'íķ´', 'ĠìŀIJìĹ°', 'ìĸ´', 'Ġ공격', 'Ġë°©ìĭĿìľ¼ë¡ľ', 'ĠìķĦìĽĥ',
        'ìĺ¤', 'ë¸Į', 'ëıĦ', 'ë©Ķ', 'ìĿ¸', '(', 'out', '-',
        'of', '-', 'd', 'om', 'a', 'in', ')', 'Ġìĥĺ',
        'íĶĮ', 'ìĿĦ', 'ĠìŀIJëıĻ', 'ìľ¼ë¡ľ', 'ĠìĥĿ', 'ìĦ±', ',', 'Ġë¶Ħ',
        'ë¥ĺ', 'Ġ모ëį¸', 'ìĿĺ', 'Ġê°IJ', 'ì§Ģ', 'ĠëĬ¥ëł¥ìĿĦ', 'Ġíĸ¥', 'ìĥģ',
        'ìĭľíĤ¤ëĬĶ', 'ĠëĤ´ìļ©', 'ìĿĺ', 'Ġëħ¼ë¬¸', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ',
        '7', 'ìĽĶ', 'ìĹIJëĬĶ', 'Ġ머', 'ìĭł', '룬', 'ëĭĿ', 'ĠíķĻ',
        'íļĮ', "Ġ'", 'I', 'C', 'M', 'L', "'", 'ìĹIJ',
        'Ġíļ¨ìľ¨', 'ìłģìĿ¸', 'Ġê³ł', 'íĴĪ', 'ì§Ī', 'ĠìĿĮ', 'ìĦ±', 'íķ©',
        'ìĦ±ìĿ´', 'Ġê°ĢëĬ¥íķľ', "Ġ'", 'ìĹĶ', 'ëĵľ', 'ĠíĪ¬', 'ĠìĹĶ', 'ëĵľ',
        '(', 'en', 'd', '-', 't', 'o', '-', 'en',
        'd', ')', "'", 'Ġ모ëį¸', 'ìĿĦ', 'ĠìłľìķĪ', 'íķĺëĬĶ', 'Ġëħ¼ë¬¸',
        'ìĿĦ', 'Ġë°ľíijľ', 'íĸĪëĭ¤', '.', 'Ġ6', 'ìĽĶ', 'ìĹIJëĬĶ', 'ĠìĿĮ',
        'íĸ¥', '·', 'ìĿĮ', 'ìĦ±', 'Ġìĭł', 'íĺ¸', 'ì²ĺ리', 'Ġë¶Ħìķ¼',
        'ĠíķĻ', 'ìĪł', 'ëĮĢíļĮ', "Ġ'", 'I', 'C', 'A', 'S',
        'S', 'P', "'", 'ìĹIJ', 'ĠëĮĢ', 'ê·ľëª¨', 'Ġíħ', 'į',
        'ìĬ¤íĬ¸', 'Ġì½Ķ', 'íį¼ìĬ¤', '(', 'ìĸ¸', 'ìĸ´', 'ĠìĹ°', '구를',
        'ĠìľĦíķ´', 'Ġíħ', 'į', 'ìĬ¤íĬ¸ë¥¼', 'Ġì»´íĵ¨íĦ°', 'ê°Ģ', 'ĠìĿ½ìĿĦ', 'ĠìĪĺ',
        'ĠìŀĪëĬĶ', 'Ġíĺķíĥľë¡ľ', 'Ġ모ìķĦ', 'ĠëĨĵìĿĢ', 'Ġìĸ¸ìĸ´', 'ĠìŀIJë£Į', ')', 'Ġìłķë³´',
        'ĠíķĻìĬµ', 'ìĹIJ', 'ĠëĮĢíķľ', 'Ġëħ¼ë¬¸', 'Ġ1', 'ê±´ìĿĦ', 'Ġìĭ¤', 'ìĹĪëĭ¤',
        '.', 'Ċ', '</s>']

    import time

    # NOTE: byte_to_char is lru_cache'd, so after the first pass the loop
    # below mostly measures cache hits rather than actual decoding work.
    # perf_counter is monotonic and high-resolution — the right clock for
    # benchmarking (time.time is wall-clock and can jump).
    start = time.perf_counter()
    for _ in range(1000):
        result = bytetokens_to_unicdode(tokens)
    end = time.perf_counter()

    print(result)

    print(f'time: {end-start}')