RaushanTurganbay (HF staff) committed
Commit 0120fdc
1 parent: cbe57d6

Upload LlavaOnevisionForConditionalGeneration

README.md CHANGED
@@ -2,15 +2,15 @@
 language:
 - en
 - zh
-pipeline_tag: image-text-to-text
-inference: false
-arxiv: 2408.03326
 license: apache-2.0
 tags:
 - vision
 - image-text-to-text
 datasets:
 - lmms-lab/LLaVA-OneVision-Data
+pipeline_tag: image-text-to-text
+inference: false
+arxiv: 2408.03326
 ---
 # LLaVA-Onevision Model Card
config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "/raid/raushan/si-7b",
   "architectures": [
-    "LlavaNextForConditionalGeneration"
+    "LlavaOnevisionForConditionalGeneration"
   ],
   "ignore_index": -100,
   "image_grid_pinpoints": [
@@ -151,7 +151,7 @@
     ]
   ],
   "image_token_index": 151646,
-  "model_type": "llava_next",
+  "model_type": "llava_onevision",
   "projector_hidden_act": "gelu",
   "text_config": {
     "_name_or_path": "Qwen/Qwen2-7B-Instruct",
@@ -162,29 +162,24 @@
     "eos_token_id": 151645,
     "hidden_size": 3584,
     "intermediate_size": 18944,
-    "max_position_embeddings": 32768,
-    "max_window_layers": 28,
     "model_type": "qwen2",
     "num_attention_heads": 28,
     "num_hidden_layers": 28,
     "num_key_value_heads": 4,
     "rope_theta": 1000000.0,
-    "sliding_window": null,
     "torch_dtype": "bfloat16",
-    "use_sliding_window": false,
     "vocab_size": 152128
   },
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
   "transformers_version": "4.45.0.dev0",
   "use_image_newline_parameter": true,
+  "video_token_index": 151647,
   "vision_aspect_ratio": "anyres_max_9",
   "vision_config": {
-    "hidden_act": "gelu_pytorch_tanh",
     "hidden_size": 1152,
     "image_size": 384,
     "intermediate_size": 4304,
-    "layer_norm_eps": 1e-06,
     "model_type": "siglip_vision_model",
     "num_attention_heads": 16,
     "num_hidden_layers": 26,
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d57730bab78c99ac80380558a932a70a646de5de9da5bf285cc38a97c53b8ad
+oid sha256:70b24c7c6a41076e26abbbff0f21ada8fa91f39ea9b79ff9b2fefb0c0321c890
 size 4911200360
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:407a356b29c7cbcb99c0a2952150fb1de322bb3cc688a6d42fdb0fc350596fa5
+oid sha256:d677a5896cb79d1a24927efd4f1b8eebdacce03943a38c1daca7bc3213091d75
 size 1226266240
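The safetensors changes only swap the Git LFS `oid` for shards 1 and 4; the byte sizes are unchanged, so the weights were re-serialized rather than resized. A short sketch (the local file path is an assumption; point it at your downloaded shard) that checks a shard against the new pointer hash, since the LFS oid is simply the SHA-256 of the file contents:

```python
import hashlib

# Assumed local path to the downloaded shard from this commit.
path = "model-00001-of-00004.safetensors"
expected = "70b24c7c6a41076e26abbbff0f21ada8fa91f39ea9b79ff9b2fefb0c0321c890"

# Hash the file in 1 MiB chunks to avoid loading ~5 GB into memory at once.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == expected, "shard does not match the LFS pointer in this commit"
print("OK:", h.hexdigest())
```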