cyq1998 committed on
Commit
bb71c89
1 Parent(s): 2ced267

solve "cannot load pytorch checkpoint issue" and fix tokenizer

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ SimSun.ttf filter=lfs diff=lfs merge=lfs -text
SimSun.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca4da082cd970f0c8abaa79f213ddcbc475f7b5afabcb81b385998f9ebfbb53f
3
+ size 10499104
config.json CHANGED
@@ -35,7 +35,7 @@
35
  "visual": {
36
  "heads": 16,
37
  "image_size": 448,
38
- "image_start_id": 50470,
39
  "layers": 48,
40
  "mlp_ratio": 4.9231,
41
  "output_dim": 4096,
@@ -43,4 +43,4 @@
43
  "width": 1664
44
  },
45
  "vocab_size": 51200
46
- }
 
35
  "visual": {
36
  "heads": 16,
37
  "image_size": 448,
38
+ "image_start_id": 50508,
39
  "layers": 48,
40
  "mlp_ratio": 4.9231,
41
  "output_dim": 4096,
 
43
  "width": 1664
44
  },
45
  "vocab_size": 51200
46
+ }
modeling_vitphi.py CHANGED
@@ -45,8 +45,8 @@ from einops import rearrange
45
  from transformers.activations import ACT2FN
46
  from transformers import PretrainedConfig, PreTrainedModel
47
  from transformers.modeling_outputs import CausalLMOutputWithPast
48
- from .configuration_vitphi import MixFormerVLSequentialConfig
49
- from .visual import VisionTransformer
50
  # from configuration_vitphi import MixFormerVLSequentialConfig
51
  # from visual import VisionTransformer
52
 
 
45
  from transformers.activations import ACT2FN
46
  from transformers import PretrainedConfig, PreTrainedModel
47
  from transformers.modeling_outputs import CausalLMOutputWithPast
48
+ from configuration_vitphi import MixFormerVLSequentialConfig
49
+ from visual import VisionTransformer
50
  # from configuration_vitphi import MixFormerVLSequentialConfig
51
  # from visual import VisionTransformer
52
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45305856b9096f2473b8822eae24ba326387a7d0be0a1ec4f2f862f9af0f1011
3
- size 6724937785
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196df56fad9a8cda2cfd334a4c75dd21a5fc4522f2ed28ddb02e8ff50c31de4d
3
+ size 6726979333
tokenization_vitphi.py CHANGED
@@ -44,12 +44,26 @@ IMEND = "<|im_end|>"
44
  # as different as possible to minimize the impact
45
  EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
46
  SPECIAL_TOKENS = (
47
- # ENDOFTEXT,
48
  IMSTART,
49
  IMEND,
50
  ) + EXTRAS
51
  IMG_TOKEN_SPAN = 256
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
55
  with open(tiktoken_bpe_file, "rb") as f:
@@ -119,9 +133,22 @@ class VitPhiTokenizer(PreTrainedTokenizer):
119
  box_end_tag='</box>',
120
  quad_start_tag='<quad>',
121
  quad_end_tag='</quad>',
 
 
 
 
122
  **kwargs,
123
  ):
124
- super().__init__(**kwargs)
 
 
 
 
 
 
 
 
 
125
  self.image_start_tag = image_start_tag
126
  self.image_end_tag = image_end_tag
127
  self.image_pad_tag = image_pad_tag
@@ -140,14 +167,17 @@ class VitPhiTokenizer(PreTrainedTokenizer):
140
  )
141
 
142
  self.errors = errors # how to handle errors in decoding
 
143
 
144
  self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
145
  self.special_tokens = {
146
  token: index
147
  for index, token in enumerate(
148
- SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
149
  )
150
  }
 
 
151
  self.img_start_id = self.special_tokens[self.image_start_tag]
152
  self.img_end_id = self.special_tokens[self.image_end_tag]
153
  self.img_pad_id = self.special_tokens[self.image_pad_tag]
@@ -165,8 +195,8 @@ class VitPhiTokenizer(PreTrainedTokenizer):
165
  special_tokens=self.special_tokens,
166
  )
167
  assert (
168
- len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
169
- ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
170
 
171
  self.decoder = {
172
  v: k for k, v in self.mergeable_ranks.items()
@@ -174,9 +204,9 @@ class VitPhiTokenizer(PreTrainedTokenizer):
174
  self.decoder.update({v: k for k, v in self.special_tokens.items()})
175
 
176
  self.tokenizer = enc # type: tiktoken.Encoding
177
-
178
- self.eod_id = self.tokenizer.eot_token
179
  self.im_start_id = self.special_tokens[IMSTART]
 
180
  self.im_end_id = self.special_tokens[IMEND]
181
 
182
  def __len__(self) -> int:
@@ -251,12 +281,14 @@ class VitPhiTokenizer(PreTrainedTokenizer):
251
  `List[bytes|str]`: The list of tokens.
252
  """
253
  tokens = []
 
 
254
  text = unicodedata.normalize("NFC", text)
 
255
 
256
  # this implementation takes a detour: text -> token id -> token surface forms
257
  for t in self.tokenizer.encode(
258
- text, allowed_special=allowed_special, disallowed_special=disallowed_special
259
- ):
260
  tokens.append(self.decoder[t])
261
 
262
  def _encode_imgurl(img_tokens):
 
44
  # as different as possible to minimize the impact
45
  EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
46
  SPECIAL_TOKENS = (
47
+ ENDOFTEXT,
48
  IMSTART,
49
  IMEND,
50
  ) + EXTRAS
51
  IMG_TOKEN_SPAN = 256
52
 
53
+ def bytes_to_unicode():
54
+ bs = (
55
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
56
+ )
57
+ cs = bs[:]
58
+ n = 0
59
+ for b in range(2**8):
60
+ if b not in bs:
61
+ bs.append(b)
62
+ cs.append(2**8 + n)
63
+ n += 1
64
+ cs = [chr(n) for n in cs]
65
+ return dict(zip(bs, cs))
66
+
67
 
68
  def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
69
  with open(tiktoken_bpe_file, "rb") as f:
 
133
  box_end_tag='</box>',
134
  quad_start_tag='<quad>',
135
  quad_end_tag='</quad>',
136
+ unk_token="<|endoftext|>",
137
+ bos_token="<|endoftext|>",
138
+ eos_token="<|endoftext|>",
139
+ pad_token=None,
140
  **kwargs,
141
  ):
142
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
143
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
144
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
145
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
146
+ super().__init__(errors=errors,
147
+ unk_token=unk_token,
148
+ bos_token=bos_token,
149
+ eos_token=eos_token,
150
+ pad_token=pad_token,
151
+ **kwargs)
152
  self.image_start_tag = image_start_tag
153
  self.image_end_tag = image_end_tag
154
  self.image_pad_tag = image_pad_tag
 
167
  )
168
 
169
  self.errors = errors # how to handle errors in decoding
170
+ self.byte_encoder = bytes_to_unicode()
171
 
172
  self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
173
  self.special_tokens = {
174
  token: index
175
  for index, token in enumerate(
176
+ SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)-1
177
  )
178
  }
179
+ self.special_tokens[ENDOFTEXT] = 50256
180
+ # print(self.special_tokens)
181
  self.img_start_id = self.special_tokens[self.image_start_tag]
182
  self.img_end_id = self.special_tokens[self.image_end_tag]
183
  self.img_pad_id = self.special_tokens[self.image_pad_tag]
 
195
  special_tokens=self.special_tokens,
196
  )
197
  assert (
198
+ len(self.mergeable_ranks) + len(self.special_tokens) - 1 == enc.n_vocab # has a common word
199
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} - 1 != {enc.n_vocab} in encoding"
200
 
201
  self.decoder = {
202
  v: k for k, v in self.mergeable_ranks.items()
 
204
  self.decoder.update({v: k for k, v in self.special_tokens.items()})
205
 
206
  self.tokenizer = enc # type: tiktoken.Encoding
207
+
 
208
  self.im_start_id = self.special_tokens[IMSTART]
209
+ self.eod_id = self.im_start_id - 1
210
  self.im_end_id = self.special_tokens[IMEND]
211
 
212
  def __len__(self) -> int:
 
281
  `List[bytes|str]`: The list of tokens.
282
  """
283
  tokens = []
284
+ text = "".join([self.byte_encoder[b] for b in text.encode("utf-8")])
285
+ #text = text.replace(" ", self.byte_encoder[" ".encode("utf-8")[0]])
286
  text = unicodedata.normalize("NFC", text)
287
+ #print("----after nfc------:", text)
288
 
289
  # this implementation takes a detour: text -> token id -> token surface forms
290
  for t in self.tokenizer.encode(
291
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special ):
 
292
  tokens.append(self.decoder[t])
293
 
294
  def _encode_imgurl(img_tokens):
vocab.tiktoken CHANGED
The diff for this file is too large to render. See raw diff