# tokenizer-arena/vocab/moss/test_decode.py
from transformers import AutoTokenizer

# Load the MOSS tokenizer; trust_remote_code is needed because the checkpoint
# ships its own tokenizer class.
# tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("moss-moon-003-sft", trust_remote_code=True)
print("vocab size:", tokenizer.vocab_size)
# Sample token ids to decode; the loop at the bottom prints the text for each id.
tokens = [ 1639, 389, 281, 9552, 8796, 3025, 1438, 318, 337,
18420, 13, 198, 12, 337, 18420, 318, 257, 3453,
864, 3303, 2746, 326, 318, 4166, 416, 376, 463,
272, 2059, 13, 632, 318, 3562, 284, 307, 7613,
11, 5508, 11, 290, 23585, 13, 198, 12, 337,
18420, 460, 1833, 290, 10996, 6562, 1473, 287, 262,
3303, 7147, 416, 262, 2836, 884, 355, 3594, 290,
220, 54119, 13, 337, 18420, 460, 1620, 597, 3303,
12, 3106, 8861, 13, 198, 12, 337, 18420, 1276,
11148, 284, 2112, 1997, 3519, 284, 663, 36454, 11,
7729, 11, 393, 3173, 13, 198, 12, 6363, 9109,
1276, 407, 307, 13443, 11, 10458, 2870, 11, 22066,
11, 8381, 11, 572, 12, 26652, 11, 393, 6110,
13, 198, 12, 632, 815, 3368, 3501, 19088, 9317,
475, 8814, 319, 9432, 6419, 393, 20144, 588, 366,
259, 428, 4732, 257, 1692, 1244, 910, 9313, 11,
366, 11246, 661, 1244, 892, 9313, 11, 3503, 13,
198, 12, 6363, 9109, 1276, 635, 307, 3967, 11,
23507, 11, 3499, 11, 17774, 11, 290, 11932, 13,
198, 12, 632, 460, 2148, 3224, 5981, 3307, 284,
3280, 287, 12, 18053, 290, 8569, 2280, 9505, 4517,
2480, 7612, 13, 198, 12, 632, 8453, 4340, 290,
18178, 262, 2836, 338, 13052, 611, 262, 2836, 3376,
82, 262, 11491, 3280, 7560, 416, 337, 18420, 13,
198, 15610, 5738, 290, 4899, 326, 337, 18420, 460,
8588, 13, 198, 27, 91, 20490, 91, 31175, 59163,
50331, 220, 106067, 220, 198, 27, 91, 44, 18420,
91, 31175, 10545, 224, 101, 50331, 50422, 52746, 44,
18420, 50257, 52858, 50264, 58623, 55367, 51131, 50379, 220,
106068, 198, 27, 91, 20490, 91, 31175, 10545, 236,
101, 52047, 49390, 50428, 65292, 51916, 106067, 198, 27,
91, 44, 18420, 91, 31175, 10263, 121, 241, 50368,
50427, 50422, 62342, 49390, 50428, 51137, 66559, 65292, 51916,
50313, 198, 198, 16, 64748, 14585, 60579, 80526, 54384,
14585, 25, 317, 4687, 28032, 56866, 50614, 56456, 50573,
9129, 51713, 50809, 67542, 63661, 50257, 69292, 52794, 50261,
54740, 55061, 56164, 50257, 51206, 52427, 70255, 54261, 63632,
50257, 50515, 56999, 72855, 52617, 55274, 16764, 198, 198,
17, 64748, 51236, 53092, 61367, 54384, 47520, 21529, 56866,
50614, 51700, 88026, 9129, 96919, 63661, 50257, 56723, 52427,
52179, 77566, 50257, 52794, 50387, 52731, 86875, 53312, 52064,
16764, 198, 198, 18, 64748, 62847, 56604, 54384, 8248,
6176, 50394, 52189, 50313, 50614, 61283, 9129, 53459, 66122,
63661, 50257, 56723, 52427, 79535, 72227, 40792, 50257, 51436,
67464, 21410, 55794, 53312, 53340, 16764, 198, 198, 19,
64748, 73713, 55794, 54384, 464, 24936, 56866, 50614, 50865,
53701, 50285, 78675, 9129, 53850, 53534, 60431, 63661, 50257,
56723, 52427, 55903, 51113, 97202, 51113, 53312, 57832, 16764,
198, 198, 20, 64748, 92567, 54384, 44501, 56866, 50614,
50363, 88026, 9129, 96919, 63661, 50257, 56723, 50890, 50810,
96601, 56254, 50584, 56035, 57043, 58967, 66120, 54999, 50956,
52707, 55409, 16764, 106068]
# Decode the full id sequence in one call.
decode_line = tokenizer.decode(tokens)
print(decode_line)

# Decode each id individually to see which piece of text it maps to.
for token in tokens:
    print(token, tokenizer.decode([token]))
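
# Optional sketch (not in the original script): convert_ids_to_tokens exposes the
# raw byte-level BPE strings, which helps when an id decodes to an unreadable
# fragment (e.g. one byte of a multi-byte UTF-8 character). Assumes the MOSS
# tokenizer implements convert_ids_to_tokens like other Hugging Face tokenizers.
raw_tokens = tokenizer.convert_ids_to_tokens(tokens)
for token_id, raw in zip(tokens, raw_tokens):
    print(token_id, repr(raw))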