zzxslp committed on
Commit 9af6ca3
1 Parent(s): ccd710d

Update files.


Rollback to 13b config.

README.md CHANGED
@@ -1 +1,23 @@
- hello
+ # SoM-LLaVA Model Card
+ LLaVA-v1.5 mix-trained with SoM-style data (QA + listing).
+
+ The model can understand tag-style visual prompts on the image (e.g., "what is the object tagged with id 9?") and also gains improved performance on MLLM benchmarks (POPE, MME, SEED, MM-Vet, LLaVA-Wild), even when the input test images have no tags.
+
+ **For more information about SoM-LLaVA, check our [github page](https://github.com/zzxslp/SoM-LLaVA) and [paper](https://arxiv.org/abs/2404.16375)!**
+
+ ## Getting Started
+ This model should be used with the [official LLaVA repo](https://github.com/haotian-liu/LLaVA) for training and evaluation.
+
+ If you would like to load the model in HF style, check the converted model weights: [[SoM-LLaVA-v1.5-13B-HF](https://huggingface.co/zzxslp/som-llava-v1.5-13b-hf)]
+
+ ## Citation
+ If you find our data or model useful for your research and applications, please cite our paper:
+
+ ```
+ @article{yan2024list,
+   title={List Items One by One: A New Data Source and Learning Paradigm for Multimodal LLMs},
+   author={Yan, An and Yang, Zhengyuan and Wu, Junda and Zhu, Wanrong and Yang, Jianwei and Li, Linjie and Lin, Kevin and Wang, Jianfeng and McAuley, Julian and Gao, Jianfeng and others},
+   journal={arXiv preprint arXiv:2404.16375},
+   year={2024}
+ }
+ ```
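For the HF-style loading mentioned in the new README, here is a minimal sketch. It assumes a recent `transformers` release with LLaVA support; the prompt template, local image file, and generation settings are illustrative, not taken from this commit.

```python
# Minimal sketch: load the converted HF checkpoint linked in the README above.
# Assumes transformers>=4.36 (LLaVA support) and enough GPU memory for a 13B model.
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "zzxslp/som-llava-v1.5-13b-hf"  # converted weights referenced in the README
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

# Illustrative input: a SoM-tagged image plus a tag-style question.
image = Image.open("tagged_image.png")  # hypothetical local file with numeric tags drawn on it
prompt = "USER: <image>\nWhat is the object tagged with id 9? ASSISTANT:"

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output[0], skip_special_tokens=True))
```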
config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "lmsys/vicuna-7b-v1.5",
+ "_name_or_path": "lmsys/vicuna-13b-v1.5",
  "architectures": [
  "LlavaLlamaForCausalLM"
  ],
@@ -9,10 +9,11 @@
  "eos_token_id": 2,
  "freeze_mm_mlp_adapter": false,
  "hidden_act": "silu",
- "hidden_size": 4096,
+ "hidden_size": 5120,
  "image_aspect_ratio": "pad",
  "initializer_range": 0.02,
- "intermediate_size": 11008,
+ "intermediate_size": 13824,
+ "max_length": 4096,
  "max_position_embeddings": 4096,
  "mm_hidden_size": 1024,
  "mm_patch_merge_type": "flat",
@@ -24,9 +25,9 @@
  "mm_vision_select_layer": -2,
  "mm_vision_tower": "openai/clip-vit-large-patch14-336",
  "model_type": "llava_llama",
- "num_attention_heads": 32,
- "num_hidden_layers": 32,
- "num_key_value_heads": 32,
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 40,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fb22bf99f080124c0718401eae0ef4051eec5414d42f4d30255abb939e653e90
- size 6840
+ oid sha256:4fd01f30aac4b8383bdab58dd6eb40b9ac1195887d11b4398eca3bae240218fc
+ size 6904