update config (#3)

- Track tokenizer.json with LFS (71413fe40951bad177f1d00c7b8b09decf87116c)
- Add tokenizer.json via LFS (5a65b912323b849b7c150099db8f881cdde50cba)
- update config (a8cb0eadabec912971979c0eb3ae25667adf84de)

Files changed (10)

.gitattributes +1 -0
chat_template.json +3 -0
config.json +106 -0
configuration.json +1 -0
generation_config.json +12 -0
merges.txt +0 -0
preprocessor_config.json +19 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/logo.png filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

config.json ADDED Viewed

	@@ -0,0 +1,106 @@

+{
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "vision_start_token_id": 151652,
+  "vision_end_token_id": 151653,
+  "vision_token_id": 151654,
+  "image_token_id": 151655,
+  "video_token_id": 151656,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.2",
+  "_attn_implementation": "flash_attention_2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vision_config": {
+    "depth": 32,
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "intermediate_size": 3420,
+    "num_heads": 16,
+    "in_chans": 3,
+    "out_hidden_size": 2048,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "window_size": 112,
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "tokens_per_second": 2,
+    "temporal_patch_size": 2
+  },
+  "rope_scaling": {
+    "type": "mrope",
+    "mrope_section": [
+      16,
+      24,
+      24
+    ]
+  },
+  "vocab_size": 151936,
+  "num_experts": 2,
+  "experts": [
+    {
+      "hidden_size": 2048,
+      "intermediate_size": 11008,
+      "hidden_act": "silu"
+    },
+    {
+      "hidden_size": 2048,
+      "intermediate_size": 2048,
+      "hidden_act": "silu"
+    }
+  ],
+  "dof_config": {
+    "follow_left_ee_cartesian_pos": 3,
+    "follow_left_ee_rotation": 3,
+    "follow_left_gripper": 1,
+    "follow_right_ee_cartesian_pos": 3,
+    "follow_right_ee_rotation": 3,
+    "follow_right_gripper": 1,
+    "head_actions": 2,
+    "height": 1,
+    "car_pose": 3
+  },
+  "agent_pos_config": {
+    "follow_left_ee_cartesian_pos": 3,
+    "follow_left_ee_rotation": 3,
+    "follow_left_gripper": 1,
+    "follow_right_ee_cartesian_pos": 3,
+    "follow_right_ee_rotation": 3,
+    "follow_right_gripper": 1,
+    "head_actions": 2,
+    "height": 1,
+    "car_pose": 3
+  },
+  "noise_scheduler": {
+    "beta_alpha": 1.5,
+    "beta_beta": 1.0,
+    "s": 0.999,
+    "num_inference_timesteps": 5
+  },
+  "dim_inputs": [2048,2048],
+  "attention_moe": false,
+  "mlp_moe": true
+}

configuration.json ADDED Viewed

	@@ -0,0 +1 @@


+{"framework": "pytorch", "task": "vision-understanding", "allow_remote": true}

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "pad_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "repetition_penalty": 1.05,
+  "temperature": 0.000001,
+  "transformers_version": "4.49.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "min_pixels": 3136,
+  "max_pixels": 12845056,
+  "patch_size": 14,
+  "temporal_patch_size": 2,
+  "merge_size": 2,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "processor_class": "Qwen2_5_VLProcessor"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8a5df236d417e062783cda976a6c21955fe386a1dd8fb9aa06f29694a6d3a4de
+size 11826664

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff