BoyaWu10 committed
Commit ca14d76
Parent: 4327721
README.md CHANGED
@@ -9,19 +9,17 @@ license: apache-2.0
   <img src="./icon.png" alt="Logo" width="350">
 </p>
 
-📖 [Technical report](https://arxiv.org/abs/2402.11530) | 🏠 [Code](https://github.com/BAAI-DCAI/Bunny) | 🐰 [Demo](https://wisemodel.cn/spaces/baai/Bunny)
+📖 [Technical report](https://arxiv.org/abs/2402.11530) | 🏠 [Code](https://github.com/BAAI-DCAI/Bunny) | 🐰 [3B Demo](https://wisemodel.cn/spaces/baai/Bunny) | 🐰 [8B Demo](https://252412006bcde38bfa.gradio.live/)
 
 This is Bunny-Llama-3-8B-V.
 
 Bunny is a family of lightweight but powerful multimodal models. It offers multiple plug-and-play vision encoders, like EVA-CLIP, SigLIP and language backbones, including Llama-3-8B, Phi-1.5, StableLM-2 and Phi-2. To compensate for the decrease in model size, we construct more informative training data by curated selection from a broader data source.
 
-We provide Bunny-Llama-3-8B-V, which is built upon [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) and [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
-
-The model is pretrained on LAION-2M and finetuned on Bunny-695K. More details about this model can be found in [GitHub](https://github.com/BAAI-DCAI/Bunny).
-
-|                    | MME \\(^{\text{P}}\\) | MME \\(^{\text{C}}\\) | MMB \\(^{\text{T/D}}\\) | SEED | MMMU \\(^{\text{V/T}}\\) | VQA \\(^{\text{v2}}\\) | GQA | SQA \\(^{\text{I}}\\) | POPE |
+We provide Bunny-Llama-3-8B-V, which is built upon [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) and [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B). More details about this model can be found in [GitHub](https://github.com/BAAI-DCAI/Bunny).
+
+|                    | MME \\(^{\text{P}}\\) | MME \\(^{\text{C}}\\) | MMB \\(^{\text{T/D}}\\) | SEED(-IMG) | MMMU \\(^{\text{V/T}}\\) | VQA \\(^{\text{v2}}\\) | GQA | SQA \\(^{\text{I}}\\) | POPE |
 | ------------------ | :--------------: | :--------------: | :----------------: | :--: | :-----------------: | :---------------: | :--: | :--------------: | :--: |
-| Bunny-Llama-3-8B-V | 1571.8 | 297.1 | 74.3/74.0 | 65.1 | 39.1/35.4 | 81.94 | 63.7 | 74.3 | 86.7 |
+| Bunny-Llama-3-8B-V | 1592.2 | 335.0 | 76.2/75.6 | 66.0(73.3) | 39.7/36.8 | 82.5 | 64.4 | 75.7 | 87.6 |
 
@@ -65,7 +63,7 @@ tokenizer = AutoTokenizer.from_pretrained(
 prompt = 'Why is the image funny?'
 text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
 text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0)
 
 # image, sample images can be found in images folder
 image = Image.open('example_2.png')
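For context on the `text_chunks[1][1:]` change above: the tokenizer now prepends `<|begin_of_text|>` to every encoded chunk, so keeping the full second chunk would leave a stray BOS token right after the image placeholder. A minimal sketch of that reasoning follows; it is not code from the repo, and the repo id is an assumption.

```python
# Hypothetical check, not part of the repository. Assumes the tokenizer prepends
# <|begin_of_text|> (bos_token_id 128000) to every encode() call, as the
# TemplateProcessing post-processor in the updated tokenizer.json specifies.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/Bunny-Llama-3-8B-V")  # assumed repo id

text = "USER: <image>\nWhy is the image funny? ASSISTANT:"
chunks = [tokenizer(c).input_ids for c in text.split("<image>")]

# Both chunks start with the BOS id, so keeping chunks[1] unsliced would leave a
# second <|begin_of_text|> in the middle of the prompt; [1:] drops it.
assert chunks[0][0] == tokenizer.bos_token_id
assert chunks[1][0] == tokenizer.bos_token_id

input_ids_old = chunks[0] + [-200] + chunks[1]        # stray BOS after the image token
input_ids_new = chunks[0] + [-200] + chunks[1][1:]    # fixed: single BOS at position 0
```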
config.json CHANGED
@@ -19,7 +19,7 @@
   "intermediate_size": 14336,
   "max_position_embeddings": 8192,
   "mm_hidden_size": 1152,
-  "mm_projector_lr": 2e-05,
+  "mm_projector_lr": 1e-05,
   "mm_projector_type": "mlp2x_gelu",
   "mm_vision_tower": "google/siglip-so400m-patch14-384",
   "model_type": "bunny-llama",
@@ -34,9 +34,9 @@
   "tokenizer_model_max_length": 2048,
   "tokenizer_padding_side": "right",
   "torch_dtype": "float16",
-  "transformers_version": "4.38.2",
+  "transformers_version": "4.40.0",
   "tune_mm_mlp_adapter": false,
-  "unfreeze_vision_tower": false,
+  "unfreeze_vision_tower": true,
   "use_cache": true,
   "use_mm_proj": true,
   "vocab_size": 128257
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 128000,
   "eos_token_id": 128001,
   "pad_token_id": 128001,
-  "transformers_version": "4.38.2"
+  "transformers_version": "4.40.0"
 }
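Taken together, the config changes affect how the checkpoint is loaded: the bumped `transformers_version` suggests transformers >= 4.40.0, and the custom `bunny-llama` model type requires `trust_remote_code=True`. A minimal loading sketch follows; the repo id is an assumption and this is not the authoritative usage snippet.

```python
# Minimal loading sketch based on the config values above; repo id is assumed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Bunny-Llama-3-8B-V",   # assumed repo id
    torch_dtype=torch.float16,   # matches "torch_dtype": "float16" in config.json
    device_map="auto",
    trust_remote_code=True,      # needed for the custom "bunny-llama" model_type
)
tokenizer = AutoTokenizer.from_pretrained("BAAI/Bunny-Llama-3-8B-V")

print(model.config.unfreeze_vision_tower)    # True after this commit
print(model.generation_config.eos_token_id)  # 128001, per generation_config.json
```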
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:72d105f11150862dc188687e07b20cf28e5b53b47750889fc76c99ab8768c342
+oid sha256:6ec0a7fd9ad460c3e0a4531b16c290472d8d133c348e9649efa2e9458e936a39
 size 4976706784
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:67b7dbce5a065f1591c9b7594a3afc5297f96af80ec3c62a8d92093e80ab0cd7
+oid sha256:ec257e1403e02aaac7d2d79e67ad4f5378c4b041037aa00862b1983b014729b1
 size 4999802616
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a831144fabf9cb4a781bee44c48864bf5e28416d8678e1f06870e019e537335
+oid sha256:fb70b23deec1b9d9512676ba5c42ea0d3c980c1cba387fc9cb14e46be6153b08
 size 4915916080
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:912a671e9f42e978074747d1b90819eecdc4fb63563bc04fb203d82b909e0998
+oid sha256:423517f46806e703f787ecb8ac332c729f167baeaca7e500da9462583ed8cda1
 size 2067676408
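The four shard updates above change only the LFS pointers: new sha256 oids, same byte sizes. Since the oid is the SHA-256 of the shard's contents, a downloaded file can be checked against it; a small illustrative sketch (local file path assumed):

```python
# Illustrative integrity check: hash a downloaded shard and compare it to the
# "oid sha256:..." value recorded in the LFS pointer above.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

print(sha256_of("model-00001-of-00004.safetensors"))
# expected after this commit:
# 6ec0a7fd9ad460c3e0a4531b16c290472d8d133c348e9649efa2e9458e936a39
```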
modeling_bunny_llama.py CHANGED
@@ -604,7 +604,7 @@ class BunnyMetaModel:
         super(BunnyMetaModel, self).__init__(config)
 
         if hasattr(config, "mm_vision_tower"):
-            self.vision_tower = build_vision_tower(config, delay_load=True)
+            self.vision_tower = build_vision_tower(config, delay_load=False)
             self.mm_projector = build_vision_projector(config)
 
     def get_vision_tower(self):
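For readers unfamiliar with `delay_load`: in LLaVA-style code this flag typically decides whether the vision tower's weights are fetched at construction time or deferred to an explicit `load_model()` call. A rough illustrative sketch of that pattern follows (class and attribute names are assumptions, not the repo's actual implementation); switching to `delay_load=False` makes the tower load eagerly, presumably desirable here since `unfreeze_vision_tower` is now true and the fine-tuned SigLIP weights ship inside this checkpoint.

```python
# Illustrative only -- not the repository's actual vision tower code.
# Shows the common delay_load pattern this commit flips from True to False.
from transformers import SiglipImageProcessor, SiglipVisionModel


class VisionTowerSketch:
    def __init__(self, vision_tower_name: str, delay_load: bool = False):
        self.vision_tower_name = vision_tower_name
        self.is_loaded = False
        if not delay_load:
            # delay_load=False: load the SigLIP weights immediately at construction
            self.load_model()

    def load_model(self):
        if self.is_loaded:
            return
        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)  # frozen unless training unfreezes it
        self.is_loaded = True


# e.g. VisionTowerSketch("google/siglip-so400m-patch14-384", delay_load=False)
```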
tokenizer.json CHANGED
@@ -2306,15 +2306,6 @@
       "rstrip": false,
       "normalized": false,
       "special": true
-    },
-    {
-      "id": 128256,
-      "content": "<unk>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
     }
   ],
   "normalizer": null,
@@ -2338,58 +2329,69 @@
     ]
   },
   "post_processor": {
-    "type": "TemplateProcessing",
-    "single": [
-      {
-        "SpecialToken": {
-          "id": "<|begin_of_text|>",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      }
-    ],
-    "pair": [
-      {
-        "SpecialToken": {
-          "id": "<|begin_of_text|>",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|begin_of_text|>",
-          "type_id": 1
-        }
-      },
-      {
-        "Sequence": {
-          "id": "B",
-          "type_id": 1
-        }
-      }
-    ],
-    "special_tokens": {
-      "<|begin_of_text|>": {
-        "id": "<|begin_of_text|>",
-        "ids": [
-          128000
-        ],
-        "tokens": [
-          "<|begin_of_text|>"
-        ]
-      }
-    }
+    "type": "Sequence",
+    "processors": [
+      {
+        "type": "ByteLevel",
+        "add_prefix_space": true,
+        "trim_offsets": false,
+        "use_regex": true
+      },
+      {
+        "type": "TemplateProcessing",
+        "single": [
+          {
+            "SpecialToken": {
+              "id": "<|begin_of_text|>",
+              "type_id": 0
+            }
+          },
+          {
+            "Sequence": {
+              "id": "A",
+              "type_id": 0
+            }
+          }
+        ],
+        "pair": [
+          {
+            "SpecialToken": {
+              "id": "<|begin_of_text|>",
+              "type_id": 0
+            }
+          },
+          {
+            "Sequence": {
+              "id": "A",
+              "type_id": 0
+            }
+          },
+          {
+            "SpecialToken": {
+              "id": "<|begin_of_text|>",
+              "type_id": 1
+            }
+          },
+          {
+            "Sequence": {
+              "id": "B",
+              "type_id": 1
+            }
+          }
+        ],
+        "special_tokens": {
+          "<|begin_of_text|>": {
+            "id": "<|begin_of_text|>",
+            "ids": [
+              128000
+            ],
+            "tokens": [
+              "<|begin_of_text|>"
+            ]
+          }
+        }
+      }
+    ]
   },
   "decoder": {
     "type": "ByteLevel",
@@ -2405,6 +2407,7 @@
   "end_of_word_suffix": null,
   "fuse_unk": false,
   "byte_fallback": false,
+  "ignore_merges": true,
   "vocab": {
     "!": 0,
     "\"": 1,
tokenizer_config.json CHANGED
@@ -1,6 +1,4 @@
 {
-  "add_bos_token": true,
-  "add_eos_token": false,
   "added_tokens_decoder": {
     "128000": {
       "content": "<|begin_of_text|>",
@@ -2049,14 +2047,6 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "128256": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "bos_token": "<|begin_of_text|>",
@@ -2067,7 +2057,5 @@
     "attention_mask"
   ],
   "model_max_length": 1000000000000000019884624838656,
-  "tokenizer_class": "LlamaTokenizer",
-  "unk_token": "<unk>",
-  "use_default_system_prompt": false
+  "tokenizer_class": "PreTrainedTokenizerFast"
 }
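The tokenizer changes above do three things: drop the `<unk>` added token (id 128256), switch the post-processor from a bare `TemplateProcessing` to a `Sequence` of `ByteLevel` plus `TemplateProcessing`, and let the tokenizer load as a plain `PreTrainedTokenizerFast` instead of `LlamaTokenizer`. A rough sketch of the new post-processor using the `tokenizers` library follows (argument spellings are assumptions read off the JSON above, not code from this repo), together with a small load-time sanity check under the assumed repo id.

```python
# Sketch of the post_processor described in the new tokenizer.json, rebuilt with the
# `tokenizers` library; parameter names/defaults are assumptions, not repo code.
from tokenizers import processors

post = processors.Sequence([
    processors.ByteLevel(trim_offsets=False),
    processors.TemplateProcessing(
        single="<|begin_of_text|> $A",
        pair="<|begin_of_text|> $A <|begin_of_text|>:1 $B:1",
        special_tokens=[("<|begin_of_text|>", 128000)],
    ),
])

# Sanity check against the updated repo (assumed repo id).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("BAAI/Bunny-Llama-3-8B-V")
print(type(tok).__name__)      # expected: PreTrainedTokenizerFast
print(tok.unk_token)           # expected: None -- the 128256 <unk> entry was removed
print(tok("hi").input_ids[0])  # expected: 128000, <|begin_of_text|> prepended once
```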