import json

from transformers import AutoTokenizer, BloomTokenizerFast

# Load the MOSS tokenizer; trust_remote_code is required because the checkpoint
# ships its own tokenizer implementation.
# tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("moss-moon-003-sft", trust_remote_code=True)
print("vocab size:", tokenizer.vocab_size)

# Token IDs to decode: a MOSS-style system prompt followed by a short conversation.
tokens = [
    1639, 389, 281, 9552, 8796, 3025, 1438, 318, 337, 18420, 13, 198, 12, 337,
    18420, 318, 257, 3453, 864, 3303, 2746, 326, 318, 4166, 416, 376, 463, 272,
    2059, 13, 632, 318, 3562, 284, 307, 7613, 11, 5508, 11, 290, 23585, 13, 198,
    12, 337, 18420, 460, 1833, 290, 10996, 6562, 1473, 287, 262, 3303, 7147,
    416, 262, 2836, 884, 355, 3594, 290, 220, 54119, 13, 337, 18420, 460, 1620,
    597, 3303, 12, 3106, 8861, 13, 198, 12, 337, 18420, 1276, 11148, 284, 2112,
    1997, 3519, 284, 663, 36454, 11, 7729, 11, 393, 3173, 13, 198, 12, 6363,
    9109, 1276, 407, 307, 13443, 11, 10458, 2870, 11, 22066, 11, 8381, 11, 572,
    12, 26652, 11, 393, 6110, 13, 198, 12, 632, 815, 3368, 3501, 19088, 9317,
    475, 8814, 319, 9432, 6419, 393, 20144, 588, 366, 259, 428, 4732, 257, 1692,
    1244, 910, 9313, 11, 366, 11246, 661, 1244, 892, 9313, 11, 3503, 13, 198,
    12, 6363, 9109, 1276, 635, 307, 3967, 11, 23507, 11, 3499, 11, 17774, 11,
    290, 11932, 13, 198, 12, 632, 460, 2148, 3224, 5981, 3307, 284, 3280, 287,
    12, 18053, 290, 8569, 2280, 9505, 4517, 2480, 7612, 13, 198, 12, 632, 8453,
    4340, 290, 18178, 262, 2836, 338, 13052, 611, 262, 2836, 3376, 82, 262,
    11491, 3280, 7560, 416, 337, 18420, 13, 198, 15610, 5738, 290, 4899, 326,
    337, 18420, 460, 8588, 13, 198, 27, 91, 20490, 91, 31175, 59163, 50331, 220,
    106067, 220, 198, 27, 91, 44, 18420, 91, 31175, 10545, 224, 101, 50331,
    50422, 52746, 44, 18420, 50257, 52858, 50264, 58623, 55367, 51131, 50379,
    220, 106068, 198, 27, 91, 20490, 91, 31175, 10545, 236, 101, 52047, 49390,
    50428, 65292, 51916, 106067, 198, 27, 91, 44, 18420, 91, 31175, 10263, 121,
    241, 50368, 50427, 50422, 62342, 49390, 50428, 51137, 66559, 65292, 51916,
    50313, 198, 198, 16, 64748, 14585, 60579, 80526, 54384, 14585, 25, 317,
    4687, 28032, 56866, 50614, 56456, 50573, 9129, 51713, 50809, 67542, 63661,
    50257, 69292, 52794, 50261, 54740, 55061, 56164, 50257, 51206, 52427, 70255,
    54261, 63632, 50257, 50515, 56999, 72855, 52617, 55274, 16764, 198, 198, 17,
    64748, 51236, 53092, 61367, 54384, 47520, 21529, 56866, 50614, 51700, 88026,
    9129, 96919, 63661, 50257, 56723, 52427, 52179, 77566, 50257, 52794, 50387,
    52731, 86875, 53312, 52064, 16764, 198, 198, 18, 64748, 62847, 56604, 54384,
    8248, 6176, 50394, 52189, 50313, 50614, 61283, 9129, 53459, 66122, 63661,
    50257, 56723, 52427, 79535, 72227, 40792, 50257, 51436, 67464, 21410, 55794,
    53312, 53340, 16764, 198, 198, 19, 64748, 73713, 55794, 54384, 464, 24936,
    56866, 50614, 50865, 53701, 50285, 78675, 9129, 53850, 53534, 60431, 63661,
    50257, 56723, 52427, 55903, 51113, 97202, 51113, 53312, 57832, 16764, 198,
    198, 20, 64748, 92567, 54384, 44501, 56866, 50614, 50363, 88026, 9129,
    96919, 63661, 50257, 56723, 50890, 50810, 96601, 56254, 50584, 56035, 57043,
    58967, 66120, 54999, 50956, 52707, 55409, 16764, 106068,
]

# Decode the whole sequence back into text.
decode_line = tokenizer.decode(tokens)
print(decode_line)

# Decode each ID on its own to inspect the individual vocabulary pieces.
for token in tokens:
    print(token, tokenizer.decode([token]))
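
# Optional round-trip check (a minimal sketch added here, not part of the original
# script): re-encode the decoded text and compare it with the original IDs. Whether
# the two sequences match exactly depends on how this tokenizer handles its special
# tokens (e.g. <eoh>/<eom>) during decode, so treat a mismatch as informational.
reencoded = tokenizer.encode(decode_line, add_special_tokens=False)
print("round-trip matches original ids:", reencoded == tokens)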