badwords

Runtime error

File size: 7,958 Bytes

85e396d

from functools import lru_cache



@lru_cache()
def bytes_to_unicode_dict():
    """
    Build the reverse byte-level BPE table: printable unicode character -> byte value.

    Byte-level BPE vocabularies represent every one of the 256 possible byte values by a
    single visible unicode character, so that whitespace and control bytes never appear
    literally inside a token. This function returns the lookup needed to undo that
    encoding.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" are represented by the character
    with the same code point. Every other byte is represented by the character at code
    point 256 + k, where k is the rank of that byte among the remaining bytes in ascending
    order (so byte 0 is chr(256), and the space byte 32 is chr(288), i.e. "Ġ").

    Returns:
        A dict with 256 entries whose keys are single-character strings and whose values
        are ints in range(256). The result is memoized by ``lru_cache``, so every call
        returns the same dict object.
    """
    # Bytes that already have a visible representation keep their own code point.
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # Everything else (control bytes, space, ...) is shifted above the 8-bit range.
    shifted = [byte for byte in range(2 ** 8) if byte not in self_mapped]

    byte_values = self_mapped + shifted
    code_points = self_mapped + [2 ** 8 + rank for rank in range(len(shifted))]
    return {chr(code_point): byte for code_point, byte in zip(code_points, byte_values)}

# Module-level lookup table (unicode character -> byte value), built once at import time
# and used by byte_to_char to turn token characters back into raw bytes.
ORD_UNICODE_MAP = bytes_to_unicode_dict()


@lru_cache()
def byte_to_char(bytestr):
    """
    Decode one byte-level token string into readable text.

    Each character of ``bytestr`` is looked up in ``ORD_UNICODE_MAP`` to recover the byte it
    stands for; the resulting byte sequence is then decoded as UTF-8. Byte sequences that are
    not valid UTF-8 on their own (for example a token that holds only part of a multi-byte
    character) decode to U+FFFD replacement characters instead of raising.

    Args:
        bytestr: A token string made of characters that are keys of ``ORD_UNICODE_MAP``.

    Returns:
        The decoded text for this single token.

    Raises:
        KeyError: If ``bytestr`` contains a character that is not in ``ORD_UNICODE_MAP``.
    """
    byte_values = [ORD_UNICODE_MAP[symbol] for symbol in bytestr]
    return bytes(byte_values).decode("utf-8", errors="replace")

# @lru_cache()
def bytetokens_to_unicdode(byte_tokens: list):
    """
    Decode a sequence of byte-level token strings, one token at a time.

    Args:
        byte_tokens: Token strings to decode; each is passed to ``byte_to_char`` on its own.

    Returns:
        A new list with the decoded text of each token, in the same order as the input.
    """
    return list(map(byte_to_char, byte_tokens))


if __name__ == '__main__':
    # Ad-hoc benchmark / smoke test: decode a fixed list of sample byte-level tokens
    # many times and report the elapsed time.
    #
    # In these tokens "Ġ" stands for the space byte and "Ċ" for the newline byte
    # (see bytes_to_unicode_dict).
    tokens = ['<s>',
        'ì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        'íĶĦëĿ¼ìĿ´',
        'ì¦Ī',
        '(',
        'ëĮĢíĳľ',
        'Ġë°±',
        'ìĥģ',
        'ìĹ½',
        ')',
        'ê°Ģ',
        'Ġìĺ¬íķ´',
        'Ġ8',
        'ìĽĶ',
        'Ġê¸°ì¤Ģ',
        'Ġëĭ¤ìĪĺ',
        'Ġê¶Į',
        'ìľĦ',
        'ĠìŀĪëĬĶ',
        'Ġê¸Ģë¡ľë²Į',
        'ĠíķĻ',
        'íļĮìĹĲìĦľ',
        'Ġì´Ŀ',
        'Ġ16',
        'ê±´',
        'ìĿĺ',
        'ĠìĿ¸ê³µ',
        'ì§Ģ',
        'ëĬ¥',
        '(',
        'A',
        'I',
        ')',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġëĵ±',
        'ìŀ¬',
        'íĸĪëĭ¤ê³ł',
        'Ġ9',
        'ìĿ¼',
        'Ġë°ĿíĺĶ',
        'ëĭ¤',
        '.',
        'Ġì§ĢëĤľíķ´',
        'Ġëĵ±',
        'ìŀ¬',
        'íķľ',
        'Ġ13',
        'ê±´ë',
        '³´ëĭ¤',
        'Ġ3',
        'ê±´',
        'Ġë§İìĿĢ',
        'Ġëħ¼ë¬¸',
        'ìĿ´',
        'Ġë°ĺ',
        'ëħĦ',
        'ìĹ¬',
        'Ġë§ĮìĹĲ',
        'Ġì±Ħ',
        'íĥĿ',
        'ëĲĲëĭ¤',
        '.',
        'Ġì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        'íĶĦëĿ¼ìĿ´',
        'ì¦Ī',
        '(',
        'ìĿ´',
        'íķĺ',
        'Ġì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        ')',
        'ëĬĶ',
        'ĠA',
        'I',
        'ĠìĹ°êµ¬',
        'ĠìĦ±',
        'ê³¼ë¥¼',
        'ĠìĿ´',
        'ìĸ´ê°Ģ',
        'ê¸°',
        'ĠìľĦíķ´',
        'ĠìĿ¸ìŀ¬',
        'ĠíĻķë³´',
        'ìĹĲ',
        'ĠìĨį',
        'ëıĦë¥¼',
        'ĠëĨĴìĿ´',
        'ê²łëĭ¤ëĬĶ',
        'Ġë°©',
        'ì¹¨',
        'ìĿ´ëĭ¤',
        '.',
        'Ċ',
        'Ċ',
        'ì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        'ëĬĶ',
        'Ġ8',
        'ìĽĶ',
        'ĠìŀĲìĹ°',
        'ìĸ´',
        'ì²ĺë¦¬',
        'Ġë¶Ħìķ¼',
        'ìĿĺ',
        'Ġê¸Ģë¡ľë²Į',
        'Ġíĥĳ',
        'ĠíķĻ',
        'íļĮ',
        'ìĿ¸',
        "Ġ'",
        'A',
        'C',
        'L',
        '-',
        'I',
        'J',
        'C',
        'N',
        'L',
        'P',
        "'",
        'ìĹĲ',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġë°ľíĳľ',
        'íķľ',
        'ĠìĤ¬ë¡Ģ',
        'ê¹Įì§Ģ',
        'Ġíķ©',
        'íķ´',
        'Ġìĺ¬íķ´',
        'Ġì´Ŀ',
        'Ġ16',
        'ê±´',
        'ìĿĺ',
        'ĠA',
        'I',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġëĵ±',
        'ìŀ¬',
        'íĸĪëĭ¤ê³ł',
        'Ġë°ĿíĺĶ',
        'ëĭ¤',
        '.',
        'ĠìĿ´',
        'Ġëħ¼ë¬¸',
        'ìĿĢ',
        'ĠìĿ¸ëıĦ',
        'ë©Ķ',
        'ìĿ¸',
        '(',
        'in',
        '-',
        'd',
        'om',
        'a',
        'in',
        ')',
        'Ġìĥĺ',
        'íĶĮ',
        'ìĿĦ',
        'ĠìĤ¬ìļ©',
        'íķ´',
        'ĠìŀĲìĹ°',
        'ìĸ´',
        'Ġê³µê²©',
        'Ġë°©ìĭĿìľ¼ë¡ľ',
        'ĠìķĦìĽĥ',
        'ìĺ¤',
        'ë¸Į',
        'ëıĦ',
        'ë©Ķ',
        'ìĿ¸',
        '(',
        'out',
        '-',
        'of',
        '-',
        'd',
        'om',
        'a',
        'in',
        ')',
        'Ġìĥĺ',
        'íĶĮ',
        'ìĿĦ',
        'ĠìŀĲëıĻ',
        'ìľ¼ë¡ľ',
        'ĠìĥĿ',
        'ìĦ±',
        ',',
        'Ġë¶Ħ',
        'ë¥ĺ',
        'Ġëª¨ëį¸',
        'ìĿĺ',
        'Ġê°Ĳ',
        'ì§Ģ',
        'ĠëĬ¥ëł¥ìĿĦ',
        'Ġíĸ¥',
        'ìĥģ',
        'ìĭľíĤ¤ëĬĶ',
        'ĠëĤ´ìļ©',
        'ìĿĺ',
        'Ġëħ¼ë¬¸',
        'ìĿ´ëĭ¤',
        '.',
        'Ċ',
        'Ċ',
        '7',
        'ìĽĶ',
        'ìĹĲëĬĶ',
        'Ġë¨¸',
        'ìĭł',
        'ëŁ¬',
        'ëĭĿ',
        'ĠíķĻ',
        'íļĮ',
        "Ġ'",
        'I',
        'C',
        'M',
        'L',
        "'",
        'ìĹĲ',
        'Ġíļ¨ìľ¨',
        'ìłģìĿ¸',
        'Ġê³ł',
        'íĴĪ',
        'ì§Ī',
        'ĠìĿĮ',
        'ìĦ±',
        'íķ©',
        'ìĦ±ìĿ´',
        'Ġê°ĢëĬ¥íķľ',
        "Ġ'",
        'ìĹĶ',
        'ëĵľ',
        'ĠíĪ¬',
        'ĠìĹĶ',
        'ëĵľ',
        '(',
        'en',
        'd',
        '-',
        't',
        'o',
        '-',
        'en',
        'd',
        ')',
        "'",
        'Ġëª¨ëį¸',
        'ìĿĦ',
        'ĠìłľìķĪ',
        'íķĺëĬĶ',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġë°ľíĳľ',
        'íĸĪëĭ¤',
        '.',
        'Ġ6',
        'ìĽĶ',
        'ìĹĲëĬĶ',
        'ĠìĿĮ',
        'íĸ¥',
        'Â·',
        'ìĿĮ',
        'ìĦ±',
        'Ġìĭł',
        'íĺ¸',
        'ì²ĺë¦¬',
        'Ġë¶Ħìķ¼',
        'ĠíķĻ',
        'ìĪł',
        'ëĮĢíļĮ',
        "Ġ'",
        'I',
        'C',
        'A',
        'S',
        'S',
        'P',
        "'",
        'ìĹĲ',
        'ĠëĮĢ',
        'ê·ľëª¨',
        'Ġíħ',
        'į',
        'ìĬ¤íĬ¸',
        'Ġì½Ķ',
        'íį¼ìĬ¤',
        '(',
        'ìĸ¸',
        'ìĸ´',
        'ĠìĹ°',
        'êµ¬ë¥¼',
        'ĠìľĦíķ´',
        'Ġíħ',
        'į',
        'ìĬ¤íĬ¸ë¥¼',
        'Ġì»´íĵ¨íĦ°',
        'ê°Ģ',
        'ĠìĿ½ìĿĦ',
        'ĠìĪĺ',
        'ĠìŀĪëĬĶ',
        'Ġíĺķíĥľë¡ľ',
        'Ġëª¨ìķĦ',
        'ĠëĨĵìĿĢ',
        'Ġìĸ¸ìĸ´',
        'ĠìŀĲë£Į',
        ')',
        'Ġìłķë³´',
        'ĠíķĻìĬµ',
        'ìĹĲ',
        'ĠëĮĢíķľ',
        'Ġëħ¼ë¬¸',
        'Ġ1',
        'ê±´ìĿĦ',
        'Ġìĭ¤',
        'ìĹĪëĭ¤',
        '.',
        'Ċ',
        '</s>']

    import time

    # perf_counter is the monotonic, highest-resolution clock intended for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    # NOTE: byte_to_char is wrapped in lru_cache, so repeated tokens may be served from
    # the cache rather than re-decoded; the figure below includes that effect.
    start = time.perf_counter()
    for _ in range(1000):
        result = bytetokens_to_unicdode(tokens)
    end = time.perf_counter()

    print(result)

    print(f'time: {end-start}')