zzxslp committed on
Commit 9af6ca3
1 Parent(s): ccd710d

Update files.


Rollback to 13b config.

README.md CHANGED
@@ -1 +1,23 @@
- hello
+ # SoM-LLaVA Model Card
+ LLaVA-v1.5 mix-trained with SoM-style data (QA + listing).
+
+ The model can understand tag-style visual prompts on the image (e.g., "what is the object tagged with id 9?") and also gains improved performance on MLLM benchmarks (POPE, MME, SEED, MM-Vet, LLaVA-Wild), even when the input test images have no tags.
+
+ **For more information about SoM-LLaVA, check our [github page](https://github.com/zzxslp/SoM-LLaVA) and [paper](https://arxiv.org/abs/2404.16375)!**
+
+ ## Getting Started
+ This model should be used with the [official LLaVA repo](https://github.com/haotian-liu/LLaVA) for training and evaluation.
+
+ If you would like to load the model in HF style, check the converted model weights: [[SoM-LLaVA-v1.5-13B-HF](https://huggingface.co/zzxslp/som-llava-v1.5-13b-hf)]
+
+ ## Citation
+ If you find our data or model useful for your research and applications, please cite our paper:
+
+ ```
+ @article{yan2024list,
+   title={List Items One by One: A New Data Source and Learning Paradigm for Multimodal LLMs},
+   author={Yan, An and Yang, Zhengyuan and Wu, Junda and Zhu, Wanrong and Yang, Jianwei and Li, Linjie and Lin, Kevin and Wang, Jianfeng and McAuley, Julian and Gao, Jianfeng and others},
+   journal={arXiv preprint arXiv:2404.16375},
+   year={2024}
+ }
+ ```
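For the HF-style loading mentioned in the new README, here is a minimal sketch. It assumes a recent `transformers` release with LLaVA support; the prompt template, local image file, and generation settings are illustrative, not taken from this commit.

```python
# Minimal sketch: load the converted HF checkpoint linked in the README above.
# Assumes transformers>=4.36 (LLaVA support) and enough GPU memory for a 13B model.
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "zzxslp/som-llava-v1.5-13b-hf"  # converted weights referenced in the README
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

# Illustrative input: a SoM-tagged image plus a tag-style question.
image = Image.open("tagged_image.png")  # hypothetical local file with numeric tags drawn on it
prompt = "USER: <image>\nWhat is the object tagged with id 9? ASSISTANT:"

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output[0], skip_special_tokens=True))
```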
config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "lmsys/vicuna-7b-v1.5",
+ "_name_or_path": "lmsys/vicuna-13b-v1.5",
  "architectures": [
  "LlavaLlamaForCausalLM"
  ],
@@ -9,10 +9,11 @@
  "eos_token_id": 2,
  "freeze_mm_mlp_adapter": false,
  "hidden_act": "silu",
- "hidden_size": 4096,
+ "hidden_size": 5120,
  "image_aspect_ratio": "pad",
  "initializer_range": 0.02,
- "intermediate_size": 11008,
+ "intermediate_size": 13824,
+ "max_length": 4096,
  "max_position_embeddings": 4096,
  "mm_hidden_size": 1024,
  "mm_patch_merge_type": "flat",
@@ -24,9 +25,9 @@
  "mm_vision_select_layer": -2,
  "mm_vision_tower": "openai/clip-vit-large-patch14-336",
  "model_type": "llava_llama",
- "num_attention_heads": 32,
- "num_hidden_layers": 32,
- "num_key_value_heads": 32,
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 40,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fb22bf99f080124c0718401eae0ef4051eec5414d42f4d30255abb939e653e90
- size 6840
+ oid sha256:4fd01f30aac4b8383bdab58dd6eb40b9ac1195887d11b4398eca3bae240218fc
+ size 6904