Aekanun
/

thai-handwriting-llm

@@ -1,25 +1,36 @@
 {
-  "model_type": "llama",
   "architectures": [
-    "LlamaForVisionToText"
   ],
   "_name_or_path": "meta-llama/Llama-3.2-11B-Vision-Instruct",
   "torch_dtype": "bfloat16",
   "transformers_version": "4.36.0",
-  "use_cache": true,
-  "tie_word_embeddings": false,
-  "vision_config": {
-    "hidden_size": 1024,
-    "image_size": 224,
-    "patch_size": 14,
-    "num_hidden_layers": 24,
-    "num_attention_heads": 16
-  },
   "text_config": {
     "hidden_size": 4096,
-    "intermediate_size": 11008,
-    "num_hidden_layers": 32,
     "num_attention_heads": 32,
-    "vocab_size": 32000
   }
 }

 {
+  "model_type": "mllama",
   "architectures": [
+    "MllamaForConditionalGeneration"
   ],
   "_name_or_path": "meta-llama/Llama-3.2-11B-Vision-Instruct",
   "torch_dtype": "bfloat16",
   "transformers_version": "4.36.0",
+  "image_token_index": 128256,
   "text_config": {
+    "model_type": "mllama_text_model",
     "hidden_size": 4096,
+    "intermediate_size": 14336,
     "num_attention_heads": 32,
+    "num_hidden_layers": 40,
+    "num_key_value_heads": 8,
+    "hidden_act": "silu",
+    "max_position_embeddings": 131072,
+    "rms_norm_eps": 1e-05,
+    "vocab_size": 128256,
+    "torch_dtype": "bfloat16"
+  },
+  "vision_config": {
+    "model_type": "mllama_vision_model",
+    "hidden_size": 1280,
+    "intermediate_size": 5120,
+    "num_hidden_layers": 32,
+    "num_attention_heads": 16,
+    "hidden_act": "gelu",
+    "image_size": 560,
+    "patch_size": 14,
+    "num_channels": 3,
+    "norm_eps": 1e-05,
+    "vision_output_dim": 7680
   }
 }