{
  "model_name": "STAR_Qwen2.5-7B_VQGAN",
  "model_type": "STARMultiModalityConfig",
  "language_model": {
    "model_name": "Qwen2.5-VL",
    "model_path": "checkpoints/Qwen2.5-VL-7B-Instruct"
  },
  "pixel_encoder": {
    "model_name": "VQ_Model",
    "model_path": "checkpoints/VQ-Model.pt",
    "image_token_size": 65536,
    "n_embed": 512,
    "num_tokens": 576,
    "num_heads": 8
  },
  "pixel_adapter": {
    "model_name": "MLP_GELU",
    "depth": 4,
    "input_dim": 512,
    "n_embed": 3584
  },
  "stacked_ar": {
    "num_layers": 14
  },
  "pixel_output_head": {
    "image_token_embed": 4096,
    "image_token_size": 65536,
    "n_embed": 3584
  },
  "pixel_decoder": {
    "model_name": "LUMINA2",
    "model_path": "checkpoints/lumina-image2"
  },
  "torch_dtype": "bfloat16"
}