import time
from functools import lru_cache


@lru_cache()
def bytes_to_unicode_dict():
    """
    Returns a dict mapping unicode strings to utf-8 byte values. We
    specifically avoid mapping to the whitespace/control characters that
    the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need
    a large # of unicode characters in your vocab if you want to avoid
    UNKs. When you're at something like a 10B token dataset you end up
    needing around 5K for decent coverage. This is a significant
    percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            # Unprintable bytes are shifted to unused code points above 255.
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(cs, bs))


ORD_UNICODE_MAP = bytes_to_unicode_dict()


@lru_cache()
def byte_to_char(bytestr):
    """Decode a single byte-level BPE token string back into readable text."""
    return bytearray([ORD_UNICODE_MAP[c] for c in bytestr]).decode("utf-8", errors="replace")


# lru_cache is not applicable here: a list argument is unhashable.
def bytetokens_to_unicode(byte_tokens: list):
    return [byte_to_char(token) for token in byte_tokens]
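
# --- Round-trip sketch (added for illustration; the names UNICODE_ORD_MAP
# and char_to_bytetoken are ours, not part of the original script). Because
# the byte<->unicode table is a bijection, inverting ORD_UNICODE_MAP gives
# the encoding direction, so decode(encode(x)) == x for any unicode text.
UNICODE_ORD_MAP = {byte: char for char, byte in ORD_UNICODE_MAP.items()}


def char_to_bytetoken(text: str) -> str:
    """Encode unicode text into its byte-level token form."""
    return "".join(UNICODE_ORD_MAP[b] for b in text.encode("utf-8"))


# Sanity check: '카카오' encodes to 'ì¹´ì¹´ìĺ¤' (the first non-empty token
# in the __main__ example below) and decodes back unchanged.
assert byte_to_char(char_to_bytetoken("카카오")) == "카카오"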
if __name__ == '__main__':
    tokens = [
        '', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ëĮĢíijľ', 'Ġë°±', 'ìĥģ',
        'ìĹ½', ')', 'ê°Ģ', 'Ġìĺ¬íķ´', 'Ġ8', 'ìĽĶ', 'Ġ기ì¤Ģ', 'Ġëĭ¤ìĪĺ', 'Ġê¶Į', 'ìľĦ',
        'ĠìŀĪëĬĶ', 'Ġê¸Ģë¡ľë²Į', 'ĠíķĻ', 'íļĮìĹIJìĦľ', 'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠìĿ¸ê³µ', 'ì§Ģ',
        'ëĬ¥', '(', 'A', 'I', ')', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł',
        'Ġ9', 'ìĿ¼', 'Ġë°ĿíĺĶ', 'ëĭ¤', '.', 'Ġì§ĢëĤľíķ´', 'Ġëĵ±', 'ìŀ¬', 'íķľ', 'Ġ13',
        'ê±´ë', '³´ëĭ¤', 'Ġ3', 'ê±´', 'Ġë§İìĿĢ', 'Ġëħ¼ë¬¸', 'ìĿ´', 'Ġë°ĺ', 'ëħĦ', 'ìŬ',
        'Ġë§ĮìĹIJ', 'Ġì±Ħ', 'íĥĿ', 'ëIJIJëĭ¤', '.', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī',
        '(', 'ìĿ´', 'íķĺ', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', ')', 'ëĬĶ', 'ĠA', 'I',
        'ĠìĹ°êµ¬', 'ĠìĦ±', '과를', 'ĠìĿ´', 'ìĸ´ê°Ģ', '기', 'ĠìľĦíķ´', 'ĠìĿ¸ìŀ¬', 'ĠíĻķë³´', 'ìĹIJ',
        'ĠìĨį', 'ëıĦ를', 'ĠëĨĴìĿ´', 'ê²łëĭ¤ëĬĶ', 'Ġë°©', '침', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ',
        'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'ëĬĶ', 'Ġ8', 'ìĽĶ', 'ĠìŀIJìĹ°', 'ìĸ´', 'ì²ĺ리', 'Ġë¶Ħìķ¼',
        'ìĿĺ', 'Ġê¸Ģë¡ľë²Į', 'Ġíĥij', 'ĠíķĻ', 'íļĮ', 'ìĿ¸', "Ġ'", 'A', 'C', 'L',
        '-', 'I', 'J', 'C', 'N', 'L', 'P', "'", 'ìĹIJ', 'Ġëħ¼ë¬¸',
        'ìĿĦ', 'Ġë°ľíijľ', 'íķľ', 'ĠìĤ¬ë¡Ģ', 'ê¹Įì§Ģ', 'Ġíķ©', 'íķ´', 'Ġìĺ¬íķ´', 'Ġì´Ŀ', 'Ġ16',
        'ê±´', 'ìĿĺ', 'ĠA', 'I', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġë°ĿíĺĶ',
        'ëĭ¤', '.', 'ĠìĿ´', 'Ġëħ¼ë¬¸', 'ìĿĢ', 'ĠìĿ¸ëıĦ', 'ë©Ķ', 'ìĿ¸', '(', 'in',
        '-', 'd', 'om', 'a', 'in', ')', 'Ġìĥĺ', 'íĶĮ', 'ìĿĦ', 'ĠìĤ¬ìļ©',
        'íķ´', 'ĠìŀIJìĹ°', 'ìĸ´', 'Ġ공격', 'Ġë°©ìĭĿìľ¼ë¡ľ', 'ĠìķĦìĽĥ', 'ìĺ¤', 'ë¸Į', 'ëıĦ', 'ë©Ķ',
        'ìĿ¸', '(', 'out', '-', 'of', '-', 'd', 'om', 'a', 'in',
        ')', 'Ġìĥĺ', 'íĶĮ', 'ìĿĦ', 'ĠìŀIJëıĻ', 'ìľ¼ë¡ľ', 'ĠìĥĿ', 'ìĦ±', ',', 'Ġë¶Ħ',
        'ë¥ĺ', 'Ġ모ëį¸', 'ìĿĺ', 'Ġê°IJ', 'ì§Ģ', 'ĠëĬ¥ëł¥ìĿĦ', 'Ġíĸ¥', 'ìĥģ', 'ìĭľíĤ¤ëĬĶ', 'ĠëĤ´ìļ©',
        'ìĿĺ', 'Ġëħ¼ë¬¸', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ', '7', 'ìĽĶ', 'ìĹIJëĬĶ', 'Ġ머',
        'ìĭł', '룬', 'ëĭĿ', 'ĠíķĻ', 'íļĮ', "Ġ'", 'I', 'C', 'M', 'L',
        "'", 'ìĹIJ', 'Ġíļ¨ìľ¨', 'ìłģìĿ¸', 'Ġê³ł', 'íĴĪ', 'ì§Ī', 'ĠìĿĮ', 'ìĦ±', 'íķ©',
        'ìĦ±ìĿ´', 'Ġê°ĢëĬ¥íķľ', "Ġ'", 'ìĹĶ', 'ëĵľ', 'ĠíĪ¬', 'ĠìĹĶ', 'ëĵľ', '(', 'en',
        'd', '-', 't', 'o', '-', 'en', 'd', ')', "'", 'Ġ모ëį¸',
        'ìĿĦ', 'ĠìłľìķĪ', 'íķĺëĬĶ', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġë°ľíijľ', 'íĸĪëĭ¤', '.', 'Ġ6', 'ìĽĶ',
        'ìĹIJëĬĶ', 'ĠìĿĮ', 'íĸ¥', '·', 'ìĿĮ', 'ìĦ±', 'Ġìĭł', 'íĺ¸', 'ì²ĺ리', 'Ġë¶Ħìķ¼',
        'ĠíķĻ', 'ìĪł', 'ëĮĢíļĮ', "Ġ'", 'I', 'C', 'A', 'S', 'S', 'P',
        "'", 'ìĹIJ', 'ĠëĮĢ', 'ê·ľëª¨', 'Ġíħ', 'į', 'ìĬ¤íĬ¸', 'Ġì½Ķ', 'íį¼ìĬ¤', '(',
        'ìĸ¸', 'ìĸ´', 'ĠìĹ°', '구를', 'ĠìľĦíķ´', 'Ġíħ', 'į', 'ìĬ¤íĬ¸ë¥¼', 'Ġì»´íĵ¨íĦ°', 'ê°Ģ',
        'ĠìĿ½ìĿĦ', 'ĠìĪĺ', 'ĠìŀĪëĬĶ', 'Ġíĺķíĥľë¡ľ', 'Ġ모ìķĦ', 'ĠëĨĵìĿĢ', 'Ġìĸ¸ìĸ´', 'ĠìŀIJë£Į', ')', 'Ġìłķë³´',
        'ĠíķĻìĬµ', 'ìĹIJ', 'ĠëĮĢíķľ', 'Ġëħ¼ë¬¸', 'Ġ1', 'ê±´ìĿĦ', 'Ġìĭ¤', 'ìĹĪëĭ¤', '.', 'Ċ',
        '',
    ]

    start = time.time()
    for _ in range(1000):
        result = bytetokens_to_unicode(tokens)
    end = time.time()
    print(result)
    print(f'time: {end - start}')
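
    # Added diagnostic (not in the original script): byte_to_char is wrapped
    # in lru_cache with the default maxsize=128, so repeated tokens can be
    # served from cache across the 1000 iterations. If misses dominate in
    # the report below, lru_cache(maxsize=None) may be worth trying.
    print(byte_to_char.cache_info())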