ai-forever committed on
Commit
e7529ab
1 Parent(s): 527ace1

Improve tokenizer backwards compatibility

Browse files
added_tokens.json DELETED
@@ -1,102 +0,0 @@
1
- {
2
- "<extra_id_0>": 32099,
3
- "<extra_id_10>": 32089,
4
- "<extra_id_11>": 32088,
5
- "<extra_id_12>": 32087,
6
- "<extra_id_13>": 32086,
7
- "<extra_id_14>": 32085,
8
- "<extra_id_15>": 32084,
9
- "<extra_id_16>": 32083,
10
- "<extra_id_17>": 32082,
11
- "<extra_id_18>": 32081,
12
- "<extra_id_19>": 32080,
13
- "<extra_id_1>": 32098,
14
- "<extra_id_20>": 32079,
15
- "<extra_id_21>": 32078,
16
- "<extra_id_22>": 32077,
17
- "<extra_id_23>": 32076,
18
- "<extra_id_24>": 32075,
19
- "<extra_id_25>": 32074,
20
- "<extra_id_26>": 32073,
21
- "<extra_id_27>": 32072,
22
- "<extra_id_28>": 32071,
23
- "<extra_id_29>": 32070,
24
- "<extra_id_2>": 32097,
25
- "<extra_id_30>": 32069,
26
- "<extra_id_31>": 32068,
27
- "<extra_id_32>": 32067,
28
- "<extra_id_33>": 32066,
29
- "<extra_id_34>": 32065,
30
- "<extra_id_35>": 32064,
31
- "<extra_id_36>": 32063,
32
- "<extra_id_37>": 32062,
33
- "<extra_id_38>": 32061,
34
- "<extra_id_39>": 32060,
35
- "<extra_id_3>": 32096,
36
- "<extra_id_40>": 32059,
37
- "<extra_id_41>": 32058,
38
- "<extra_id_42>": 32057,
39
- "<extra_id_43>": 32056,
40
- "<extra_id_44>": 32055,
41
- "<extra_id_45>": 32054,
42
- "<extra_id_46>": 32053,
43
- "<extra_id_47>": 32052,
44
- "<extra_id_48>": 32051,
45
- "<extra_id_49>": 32050,
46
- "<extra_id_4>": 32095,
47
- "<extra_id_50>": 32049,
48
- "<extra_id_51>": 32048,
49
- "<extra_id_52>": 32047,
50
- "<extra_id_53>": 32046,
51
- "<extra_id_54>": 32045,
52
- "<extra_id_55>": 32044,
53
- "<extra_id_56>": 32043,
54
- "<extra_id_57>": 32042,
55
- "<extra_id_58>": 32041,
56
- "<extra_id_59>": 32040,
57
- "<extra_id_5>": 32094,
58
- "<extra_id_60>": 32039,
59
- "<extra_id_61>": 32038,
60
- "<extra_id_62>": 32037,
61
- "<extra_id_63>": 32036,
62
- "<extra_id_64>": 32035,
63
- "<extra_id_65>": 32034,
64
- "<extra_id_66>": 32033,
65
- "<extra_id_67>": 32032,
66
- "<extra_id_68>": 32031,
67
- "<extra_id_69>": 32030,
68
- "<extra_id_6>": 32093,
69
- "<extra_id_70>": 32029,
70
- "<extra_id_71>": 32028,
71
- "<extra_id_72>": 32027,
72
- "<extra_id_73>": 32026,
73
- "<extra_id_74>": 32025,
74
- "<extra_id_75>": 32024,
75
- "<extra_id_76>": 32023,
76
- "<extra_id_77>": 32022,
77
- "<extra_id_78>": 32021,
78
- "<extra_id_79>": 32020,
79
- "<extra_id_7>": 32092,
80
- "<extra_id_80>": 32019,
81
- "<extra_id_81>": 32018,
82
- "<extra_id_82>": 32017,
83
- "<extra_id_83>": 32016,
84
- "<extra_id_84>": 32015,
85
- "<extra_id_85>": 32014,
86
- "<extra_id_86>": 32013,
87
- "<extra_id_87>": 32012,
88
- "<extra_id_88>": 32011,
89
- "<extra_id_89>": 32010,
90
- "<extra_id_8>": 32091,
91
- "<extra_id_90>": 32009,
92
- "<extra_id_91>": 32008,
93
- "<extra_id_92>": 32007,
94
- "<extra_id_93>": 32006,
95
- "<extra_id_94>": 32005,
96
- "<extra_id_95>": 32004,
97
- "<extra_id_96>": 32003,
98
- "<extra_id_97>": 32002,
99
- "<extra_id_98>": 32001,
100
- "<extra_id_99>": 32000,
101
- "<extra_id_9>": 32090
102
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
special_tokens_map.json CHANGED
@@ -101,25 +101,7 @@
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
- "eos_token": {
105
- "content": "</s>",
106
- "lstrip": false,
107
- "normalized": false,
108
- "rstrip": false,
109
- "single_word": false
110
- },
111
- "pad_token": {
112
- "content": "<pad>",
113
- "lstrip": false,
114
- "normalized": false,
115
- "rstrip": false,
116
- "single_word": false
117
- },
118
- "unk_token": {
119
- "content": "<unk>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false
124
- }
125
  }
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  }
tokenizer_config.json CHANGED
@@ -927,12 +927,9 @@
927
  "<extra_id_98>",
928
  "<extra_id_99>"
929
  ],
930
- "clean_up_tokenization_spaces": true,
931
  "eos_token": "</s>",
932
  "extra_ids": 100,
933
- "legacy": true,
934
  "pad_token": "<pad>",
935
- "sp_model_kwargs": {},
936
  "tokenizer_class": "T5Tokenizer",
937
  "unk_token": "<unk>"
938
  }
927
  "<extra_id_98>",
928
  "<extra_id_99>"
929
  ],
 
930
  "eos_token": "</s>",
931
  "extra_ids": 100,
 
932
  "pad_token": "<pad>",
 
933
  "tokenizer_class": "T5Tokenizer",
934
  "unk_token": "<unk>"
935
  }