EddieChen372 committed on
Commit
ef59ff8
1 Parent(s): 39598bf

add tokenizer

Browse files
Files changed (2) hide show
  1. special_tokens_map.json +8 -1
  2. tokenizer.json +59 -8
special_tokens_map.json CHANGED
@@ -1,3 +1,10 @@
1
  {
2
- "bos_token": "<|endoftext|>"
 
 
 
 
 
 
 
3
  }
1
  {
2
+ "additional_special_tokens": [
3
+ "[react]",
4
+ "[end]",
5
+ "[jest]"
6
+ ],
7
+ "bos_token": "<|endoftext|>",
8
+ "eos_token": "<|endoftext|>",
9
+ "pad_token": "<|endoftext|>"
10
  }
tokenizer.json CHANGED
@@ -2324,6 +2324,33 @@
2324
  "rstrip": false,
2325
  "normalized": false,
2326
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2327
  }
2328
  ],
2329
  "normalizer": null,
@@ -2350,14 +2377,14 @@
2350
  "type": "TemplateProcessing",
2351
  "single": [
2352
  {
2353
- "SpecialToken": {
2354
- "id": "<|endoftext|>",
2355
  "type_id": 0
2356
  }
2357
  },
2358
  {
2359
- "Sequence": {
2360
- "id": "A",
2361
  "type_id": 0
2362
  }
2363
  }
@@ -2365,7 +2392,7 @@
2365
  "pair": [
2366
  {
2367
  "SpecialToken": {
2368
- "id": "<|endoftext|>",
2369
  "type_id": 0
2370
  }
2371
  },
@@ -2377,14 +2404,20 @@
2377
  },
2378
  {
2379
  "SpecialToken": {
2380
- "id": "<|endoftext|>",
2381
- "type_id": 1
2382
  }
2383
  },
2384
  {
2385
  "Sequence": {
2386
  "id": "B",
2387
- "type_id": 1
 
 
 
 
 
 
2388
  }
2389
  }
2390
  ],
@@ -2397,6 +2430,24 @@
2397
  "tokens": [
2398
  "<|endoftext|>"
2399
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2400
  }
2401
  }
2402
  },
2324
  "rstrip": false,
2325
  "normalized": false,
2326
  "special": true
2327
+ },
2328
+ {
2329
+ "id": 52000,
2330
+ "content": "[react]",
2331
+ "single_word": false,
2332
+ "lstrip": false,
2333
+ "rstrip": false,
2334
+ "normalized": false,
2335
+ "special": true
2336
+ },
2337
+ {
2338
+ "id": 52001,
2339
+ "content": "[end]",
2340
+ "single_word": false,
2341
+ "lstrip": false,
2342
+ "rstrip": false,
2343
+ "normalized": false,
2344
+ "special": true
2345
+ },
2346
+ {
2347
+ "id": 52002,
2348
+ "content": "[jest]",
2349
+ "single_word": false,
2350
+ "lstrip": false,
2351
+ "rstrip": false,
2352
+ "normalized": false,
2353
+ "special": true
2354
  }
2355
  ],
2356
  "normalizer": null,
2377
  "type": "TemplateProcessing",
2378
  "single": [
2379
  {
2380
+ "Sequence": {
2381
+ "id": "A",
2382
  "type_id": 0
2383
  }
2384
  },
2385
  {
2386
+ "SpecialToken": {
2387
+ "id": "<|endoftext|>",
2388
  "type_id": 0
2389
  }
2390
  }
2392
  "pair": [
2393
  {
2394
  "SpecialToken": {
2395
+ "id": "[react]",
2396
  "type_id": 0
2397
  }
2398
  },
2404
  },
2405
  {
2406
  "SpecialToken": {
2407
+ "id": "[jest]",
2408
+ "type_id": 0
2409
  }
2410
  },
2411
  {
2412
  "Sequence": {
2413
  "id": "B",
2414
+ "type_id": 0
2415
+ }
2416
+ },
2417
+ {
2418
+ "SpecialToken": {
2419
+ "id": "<|endoftext|>",
2420
+ "type_id": 0
2421
  }
2422
  }
2423
  ],
2430
  "tokens": [
2431
  "<|endoftext|>"
2432
  ]
2433
+ },
2434
+ "[jest]": {
2435
+ "id": "[jest]",
2436
+ "ids": [
2437
+ 52002
2438
+ ],
2439
+ "tokens": [
2440
+ "[jest]"
2441
+ ]
2442
+ },
2443
+ "[react]": {
2444
+ "id": "[react]",
2445
+ "ids": [
2446
+ 52000
2447
+ ],
2448
+ "tokens": [
2449
+ "[react]"
2450
+ ]
2451
  }
2452
  }
2453
  },