Mindigenous
/

MINDI-1.5-Vision-Coder

@@ -1,113 +1,81 @@
-# ==========================================
-# MINDI 1.5 Vision-Coder — Training Configuration
-# Optimized for AMD MI300X 192GB VRAM
-# ==========================================
-# ── Model ──────────────────────────────────────────────────────
-model:
-  name: "Qwen/Qwen2.5-Coder-7B-Instruct"
-  hidden_size: 3584
-  dtype: "bf16"                   # bf16 required for MI300X stability (NOT fp16)
-  use_compile: false              # Disabled — inductor eats ~130GB VRAM on ROCm
-  gradient_checkpointing: true    # Save VRAM even with 192GB
-# ── LoRA ───────────────────────────────────────────────────────
 lora:
-  r: 64
   alpha: 128
   dropout: 0.05
-  bias: "none"
-  task_type: "CAUSAL_LM"
   target_modules:
-    - q_proj
-    - k_proj
-    - v_proj
-    - o_proj
-    - gate_proj
-    - up_proj
-    - down_proj
-# ── Vision ─────────────────────────────────────────────────────
-vision:
-  clip_model: "openai/clip-vit-large-patch14"
-  visual_tokens: 256              # 16×16 patches from ViT-L/14
-  projection_size: 3584           # Must match model.hidden_size
-  freeze_clip: true               # Freeze CLIP backbone
-# ── Training Phases ────────────────────────────────────────────
 training:
-  # Phase 1: LoRA only — teach coding patterns
   phase1:
     steps: 5000
-    lr: 2.0e-4
-    batch_size: 8                 # Reduced from 16 (OOM with compile+logits)
     warmup_steps: 100
-    data_filter: "code_only"
-  # Phase 2: Vision bridge only — align visual tokens
   phase2:
     steps: 2500
-    lr: 1.0e-5
-    batch_size: 4                 # Reduced from 8 (vision needs more VRAM)
     warmup_steps: 50
-    data_filter: "websight_only"
-  # Phase 3: All trainable — joint fine-tuning
   phase3:
     steps: 2500
-    lr: 5.0e-5
-    batch_size: 6                 # Reduced from 12
     warmup_steps: 50
-    data_filter: "all"
-  # Shared training settings
-  grad_accumulation: 8            # Doubled from 4 to keep effective batch size
-  max_grad_norm: 1.0
-  eval_every: 250
   save_every: 500
-# ── Data ───────────────────────────────────────────────────────
-data:
-  # Text-only code data (Phase 1 + Phase 3)
-  train_file: "data/processed/train.jsonl"     # 4.18GB, 1,304,486 examples
-  val_file: "data/processed/val.jsonl"         # 0.23GB, 72,471 examples
-  # Vision+code data — WebSight UI screenshots (Phase 2 + Phase 3)
-  vision_train_file: "data/websight/train.jsonl"
-  vision_val_file: "data/websight/val.jsonl"
-  max_length: 4096
-  shuffle_buffer: 10000           # Streaming shuffle buffer size
-  num_workers: 4                  # DataLoader workers
-  pin_memory: true
-  prefetch_factor: 2
-# ── Logging ────────────────────────────────────────────────────
-logging:
-  wandb_project: "mindi-1.5-vision-coder"
-  wandb_entity: "mindigenous"
-  log_every: 10                   # Log metrics every N steps
-  log_dir: "logs/training"
-  sample_every: 500               # Generate sample outputs every N steps
-  tags:
-    - "mindi-1.5"
-    - "lora"
-    - "vision-coder"
-    - "mi300x"
-# ── Output ─────────────────────────────────────────────────────
-output:
-  checkpoint_dir: "checkpoints/training"
-  best_model: "checkpoints/best"
-  hf_repo: "Mindigenous/MINDI-1.5-Vision-Coder"
-  push_every_phase: true
-# ── Local Dev Overrides (RTX 4060 8GB) ────────────────────────
-# Apply these when testing locally with --dry_run
-local_overrides:
-  batch_size: 1
-  gradient_accumulation_steps: 16
-  max_length: 2048
-  gradient_checkpointing: true
-  use_compile: false
-  num_workers: 0

+data:
+  max_length: 2048  # was 4096 in the previous revision; now equals local_overrides.max_length
+  num_workers: 2  # DataLoader workers (was 4)
+  pin_memory: true
+  prefetch_factor: 2
+  shuffle_buffer: 10000  # Streaming shuffle buffer size
+  train_file: data/processed/train.jsonl  # Text-only code data (Phase 1 + Phase 3)
+  val_file: data/processed/val.jsonl
+  vision_train_file: data/websight/train.jsonl  # Vision+code data — WebSight UI screenshots (Phase 2 + Phase 3)
+  vision_val_file: data/websight/val.jsonl
+local_overrides:  # Local dev overrides (RTX 4060 8GB); apply when testing locally with --dry_run
+  batch_size: 1
+  gradient_accumulation_steps: 16  # NOTE(review): key is named differently from training.grad_accumulation — confirm the loader maps it
+  gradient_checkpointing: true
+  max_length: 2048  # same value as data.max_length in this revision
+  num_workers: 0
+  use_compile: false
+logging:
+  log_dir: logs/training
+  log_every: 10  # Log metrics every N steps
+  sample_every: 500  # Generate sample outputs every N steps
+  tags:
+  - mindi-1.5
+  - lora
+  - vision-coder
+  - mi300x
+  wandb_entity: mindigenous
+  wandb_project: mindi-1.5-vision-coder
 lora:
   alpha: 128
+  bias: none  # the string "none", not YAML null (it was quoted as "none" in the previous revision)
   dropout: 0.05
+  r: 64  # rank; with alpha: 128 the alpha/r ratio is 2
   target_modules:
+  - q_proj  # q/k/v/o_proj look like the attention projections — confirm against the loaded model's module names
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj  # gate/up/down_proj look like the MLP projections — confirm as above
+  - up_proj
+  - down_proj
+  task_type: CAUSAL_LM
+model:
+  dtype: bf16  # bf16 required for MI300X stability (NOT fp16)
+  gradient_checkpointing: true  # Save VRAM even with 192GB
+  hidden_size: 3584  # vision.projection_size must match this value
+  name: Qwen/Qwen2.5-Coder-7B-Instruct
+  use_compile: false  # Disabled — inductor eats ~130GB VRAM on ROCm
+output:
+  best_model: checkpoints/best
+  checkpoint_dir: checkpoints/training
+  hf_repo: Mindigenous/MINDI-1.5-Vision-Coder  # presumably the Hugging Face repo id used as push target — confirm
+  push_every_phase: true  # presumably pushes to hf_repo after each training phase — confirm in the trainer
 training:
+  eval_every: 250  # presumably in steps, like logging.log_every — confirm
+  grad_accumulation: 24  # was 8; assuming effective batch = batch_size × grad_accumulation, every phase is now 2 × 24 = 48
+  max_grad_norm: 1.0
   phase1:
+    batch_size: 2  # was 8 (8 × 8 = 64 effective before, 48 now; lr unchanged)
+    data_filter: code_only  # Phase 1: LoRA only — teach coding patterns
+    lr: 0.0002  # same value as the previous 2.0e-4
     steps: 5000
     warmup_steps: 100
   phase2:
+    batch_size: 2  # was 4 (4 × 8 = 32 effective before, 48 now; lr unchanged)
+    data_filter: websight_only  # Phase 2: Vision bridge only — align visual tokens
+    lr: 1.0e-05
     steps: 2500
     warmup_steps: 50
   phase3:
+    batch_size: 2  # was 6 (6 × 8 = 48 effective before and now)
+    data_filter: all  # Phase 3: All trainable — joint fine-tuning
+    lr: 5.0e-05
     steps: 2500
     warmup_steps: 50
   save_every: 500
+vision:
+  clip_model: openai/clip-vit-large-patch14
+  freeze_clip: true  # Freeze CLIP backbone
+  projection_size: 3584  # Must match model.hidden_size
+  visual_tokens: 256  # 16×16 patches from ViT-L/14