.gitattributes CHANGED
@@ -34,5 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  snowman.png filter=lfs diff=lfs merge=lfs -text
37
- pikachu.png filter=lfs diff=lfs merge=lfs -text
38
- pikachu_bbox.png filter=lfs diff=lfs merge=lfs -text
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  snowman.png filter=lfs diff=lfs merge=lfs -text
 
 
README.md CHANGED
@@ -5,12 +5,6 @@
5
  ---
6
  # Kosmos-2: Grounding Multimodal Large Language Models to the World
7
 
8
- **This model (remote code on the Hub) is deprecated. Please use https://huggingface.co/microsoft/kosmos-2-patch14-224**
9
-
10
- **There are some changes in terms of input formats: see the model card in https://huggingface.co/microsoft/kosmos-2-patch14-224**
11
-
12
- ~~**(There is an ongoing effort to port `Kosmos-2` directly into `transformers`. This repository (remote code) might need some more bug fixes later, including breaking changes.)**~~
13
-
14
  <a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><figure><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="384"><figcaption><b>[An image of a snowman warming himself by a fire.]</b></figcaption></figure></a>
15
 
16
 
 
5
  ---
6
  # Kosmos-2: Grounding Multimodal Large Language Models to the World
7
 
 
 
 
 
 
 
8
  <a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><figure><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="384"><figcaption><b>[An image of a snowman warming himself by a fire.]</b></figcaption></figure></a>
9
 
10
 
modeling_kosmos2.py CHANGED
@@ -22,7 +22,6 @@ from typing import List, Optional, Tuple, Union
22
  import torch
23
  import torch.utils.checkpoint
24
  from torch import nn
25
- from torch.nn import CrossEntropyLoss
26
 
27
  from transformers.activations import ACT2FN
28
  from transformers.modeling_outputs import (
@@ -1008,7 +1007,7 @@ class Kosmos2TextTransformer(nn.Module):
1008
  inputs_embeds = self.embed_tokens(input_ids)
1009
 
1010
  if img_features is not None:
1011
- inputs_embeds[img_input_mask.to(dtype=torch.bool)] = img_features.view(-1, img_features.size(-1))
1012
 
1013
  inputs_embeds = inputs_embeds * self.embed_scale
1014
 
 
22
  import torch
23
  import torch.utils.checkpoint
24
  from torch import nn
 
25
 
26
  from transformers.activations import ACT2FN
27
  from transformers.modeling_outputs import (
 
1007
  inputs_embeds = self.embed_tokens(input_ids)
1008
 
1009
  if img_features is not None:
1010
+ inputs_embeds[img_input_mask.to(dtype=torch.bool)] = img_features
1011
 
1012
  inputs_embeds = inputs_embeds * self.embed_scale
1013
 
pikachu.png DELETED

Git LFS Details

  • SHA256: 7bf04b0f0b3191819ade6bd8b6c7cb388636a010b1e812e05a746564f3c9d306
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
pikachu.webp DELETED
Binary file (35.4 kB)
 
pikachu_bbox.png DELETED

Git LFS Details

  • SHA256: f1c4fa11a4aea7c573949e747d11a0da3142e39ee986df40eebfd10986395ecc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
tokenization_kosmos2_fast.py CHANGED
@@ -137,6 +137,7 @@ class Kosmos2TokenizerFast(PreTrainedTokenizerFast):
137
  )
138
 
139
  self.vocab_file = vocab_file
 
140
 
141
  self.eod_token = "</doc>"
142
 
@@ -178,10 +179,6 @@ class Kosmos2TokenizerFast(PreTrainedTokenizerFast):
178
  # we need to set `special_tokens=False` to be the same as in the slow tokenizer.
179
  self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=False)
180
 
181
- @property
182
- def can_save_slow_tokenizer(self) -> bool:
183
- return os.path.isfile(self.vocab_file) if self.vocab_file else False
184
-
185
  def build_inputs_with_special_tokens(
186
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
187
  ) -> List[int]:
 
137
  )
138
 
139
  self.vocab_file = vocab_file
140
+ self.can_save_slow_tokenizer = False if not self.vocab_file else True
141
 
142
  self.eod_token = "</doc>"
143
 
 
179
  # we need to set `special_tokens=False` to be the same as in the slow tokenizer.
180
  self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=False)
181
 
 
 
 
 
182
  def build_inputs_with_special_tokens(
183
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
184
  ) -> List[int]: