File size: 4,537 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72


import json
from transformers import AutoTokenizer, BloomTokenizerFast


# Build the tokenizer from the "moss-moon-003-sft" source. trust_remote_code=True
# allows transformers to run tokenizer code shipped with that checkpoint.
# Alternative source kept for reference:
#   AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
model_source = "moss-moon-003-sft"
tokenizer = AutoTokenizer.from_pretrained(
    model_source,
    trust_remote_code=True,
)

# Report the vocabulary size exposed by the loaded tokenizer.
print("vocab size:", tokenizer.vocab_size)


# Hard-coded sequence of integer token ids used as the decoding sample: the
# script decodes it as a whole and then one id at a time.
# NOTE(review): presumably a captured MOSS prompt/conversation encoded with
# this tokenizer's vocabulary — confirm the origin of these ids.
tokens = [  1639,    389,    281,   9552,   8796,   3025,   1438,    318,    337,
         18420,     13,    198,     12,    337,  18420,    318,    257,   3453,
           864,   3303,   2746,    326,    318,   4166,    416,    376,    463,
           272,   2059,     13,    632,    318,   3562,    284,    307,   7613,
            11,   5508,     11,    290,  23585,     13,    198,     12,    337,
         18420,    460,   1833,    290,  10996,   6562,   1473,    287,    262,
          3303,   7147,    416,    262,   2836,    884,    355,   3594,    290,
           220,  54119,     13,    337,  18420,    460,   1620,    597,   3303,
            12,   3106,   8861,     13,    198,     12,    337,  18420,   1276,
         11148,    284,   2112,   1997,   3519,    284,    663,  36454,     11,
          7729,     11,    393,   3173,     13,    198,     12,   6363,   9109,
          1276,    407,    307,  13443,     11,  10458,   2870,     11,  22066,
            11,   8381,     11,    572,     12,  26652,     11,    393,   6110,
            13,    198,     12,    632,    815,   3368,   3501,  19088,   9317,
           475,   8814,    319,   9432,   6419,    393,  20144,    588,    366,
           259,    428,   4732,    257,   1692,   1244,    910,   9313,     11,
           366,  11246,    661,   1244,    892,   9313,     11,   3503,     13,
           198,     12,   6363,   9109,   1276,    635,    307,   3967,     11,
         23507,     11,   3499,     11,  17774,     11,    290,  11932,     13,
           198,     12,    632,    460,   2148,   3224,   5981,   3307,    284,
          3280,    287,     12,  18053,    290,   8569,   2280,   9505,   4517,
          2480,   7612,     13,    198,     12,    632,   8453,   4340,    290,
         18178,    262,   2836,    338,  13052,    611,    262,   2836,   3376,
            82,    262,  11491,   3280,   7560,    416,    337,  18420,     13,
           198,  15610,   5738,    290,   4899,    326,    337,  18420,    460,
          8588,     13,    198,     27,     91,  20490,     91,  31175,  59163,
         50331,    220, 106067,    220,    198,     27,     91,     44,  18420,
            91,  31175,  10545,    224,    101,  50331,  50422,  52746,     44,
         18420,  50257,  52858,  50264,  58623,  55367,  51131,  50379,    220,
        106068,    198,     27,     91,  20490,     91,  31175,  10545,    236,
           101,  52047,  49390,  50428,  65292,  51916, 106067,    198,     27,
            91,     44,  18420,     91,  31175,  10263,    121,    241,  50368,
         50427,  50422,  62342,  49390,  50428,  51137,  66559,  65292,  51916,
         50313,    198,    198,     16,  64748,  14585,  60579,  80526,  54384,
         14585,     25,    317,   4687,  28032,  56866,  50614,  56456,  50573,
          9129,  51713,  50809,  67542,  63661,  50257,  69292,  52794,  50261,
         54740,  55061,  56164,  50257,  51206,  52427,  70255,  54261,  63632,
         50257,  50515,  56999,  72855,  52617,  55274,  16764,    198,    198,
            17,  64748,  51236,  53092,  61367,  54384,  47520,  21529,  56866,
         50614,  51700,  88026,   9129,  96919,  63661,  50257,  56723,  52427,
         52179,  77566,  50257,  52794,  50387,  52731,  86875,  53312,  52064,
         16764,    198,    198,     18,  64748,  62847,  56604,  54384,   8248,
          6176,  50394,  52189,  50313,  50614,  61283,   9129,  53459,  66122,
         63661,  50257,  56723,  52427,  79535,  72227,  40792,  50257,  51436,
         67464,  21410,  55794,  53312,  53340,  16764,    198,    198,     19,
         64748,  73713,  55794,  54384,    464,  24936,  56866,  50614,  50865,
         53701,  50285,  78675,   9129,  53850,  53534,  60431,  63661,  50257,
         56723,  52427,  55903,  51113,  97202,  51113,  53312,  57832,  16764,
           198,    198,     20,  64748,  92567,  54384,  44501,  56866,  50614,
         50363,  88026,   9129,  96919,  63661,  50257,  56723,  50890,  50810,
         96601,  56254,  50584,  56035,  57043,  58967,  66120,  54999,  50956,
         52707,  55409,  16764, 106068]
# First pass: decode the full id sequence in one call and show the text.
decode_line = tokenizer.decode(tokens)
print(decode_line)


# Second pass: decode every id separately and print it next to its id, so the
# text contributed by each individual token can be inspected.
for token in tokens:
    piece = tokenizer.decode([token])
    print(token, piece)