Alexandru Gherghescu committed on
Commit
15c1815
1 Parent(s): bbb5d39

Fix tokenizer

Browse files

The tokenizer is the original tokenizer used by the GPT1 model, created
from the vocab and merges contained in their repo.

Files changed (2) hide show
  1. configuration_gpt1.py +1 -1
  2. tokenizer.json +6 -63
configuration_gpt1.py CHANGED
@@ -8,7 +8,7 @@ class GPT1Config(PretrainedConfig):
8
 
9
  def __init__(
10
  self,
11
- vocab_size=40000,
12
  hidden_size=768,
13
  intermediate_size=3072,
14
  num_hidden_layers=12,
 
8
 
9
  def __init__(
10
  self,
11
+ vocab_size=40478,
12
  hidden_size=768,
13
  intermediate_size=3072,
14
  num_hidden_layers=12,
tokenizer.json CHANGED
@@ -17,68 +17,11 @@
17
  "type": "Sequence",
18
  "normalizers": [
19
  {
20
- "type": "Replace",
21
- "pattern": {
22
- "String": "—"
23
- },
24
- "content": "-"
25
- },
26
- {
27
- "type": "Replace",
28
- "pattern": {
29
- "String": "–"
30
- },
31
- "content": "-"
32
- },
33
- {
34
- "type": "Replace",
35
- "pattern": {
36
- "String": "―"
37
- },
38
- "content": "-"
39
- },
40
- {
41
- "type": "Replace",
42
- "pattern": {
43
- "String": "…"
44
- },
45
- "content": "..."
46
- },
47
- {
48
- "type": "Replace",
49
- "pattern": {
50
- "String": "´"
51
- },
52
- "content": "'"
53
- },
54
- {
55
- "type": "Replace",
56
- "pattern": {
57
- "Regex": "(-+|~+|!+|\"+|;+|\\?+|\\++|,+|\\)+|\\(+|\\+|\\/+|\\*+|\\[+|\\]+|}+|{+|\\|+|_+)"
58
- },
59
- "content": " \\1 "
60
- },
61
- {
62
- "type": "Replace",
63
- "pattern": {
64
- "Regex": "\\s*\n\\s*"
65
- },
66
- "content": " \n "
67
- },
68
- {
69
- "type": "Replace",
70
- "pattern": {
71
- "Regex": "[^\\S\n]+"
72
- },
73
- "content": " "
74
- },
75
- {
76
- "type": "Strip",
77
- "strip_left": true,
78
- "strip_right": true
79
- },
80
- {
81
- "type": "Lowercase"
82
  }
83
  ]
84
  },
@@ -80586,4 +80529,4 @@
80586
  "bachel orette</w>"
80587
  ]
80588
  }
80589
- }
 
17
  "type": "Sequence",
18
  "normalizers": [
19
  {
20
+ "type": "BertNormalizer",
21
+ "clean_text": true,
22
+ "handle_chinese_chars": true,
23
+ "strip_accents": null,
24
+ "lowercase": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ]
27
  },
 
80529
  "bachel orette</w>"
80530
  ]
80531
  }
80532
+ }