Upload scripts/train_sft.py with huggingface_hub
scripts/train_sft.py +301 -0
scripts/train_sft.py
ADDED
@@ -0,0 +1,301 @@
"""
SFT Training Script for Qwen2.5-VL-3B-Instruct on Physics CoT Data.
Aligned with RL-with-Cold-Start 7B reference configuration.

Key changes from previous version:
- Full fine-tuning (no LoRA) for stronger cold-start
- Vision encoder NOT frozen (freeze_aligner=false in reference)
- 3 epochs (not 16) to avoid overfitting
- Higher image resolution (max_pixels=1204224) matching reference
- Larger effective batch size (grad_accum=8, effective batch 64 on 8 GPUs)
- DeepSpeed ZeRO-2 for memory efficiency
- Lower learning rate (1e-5) appropriate for full FT
"""
import os
import json
import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    TrainingArguments,
    Trainer,
)


# ===== Configuration =====
MODEL_NAME = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
DATA_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/sft_train/coldstart_formatted.jsonl"
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft"

# Training hyperparameters (aligned with 7B reference)
NUM_EPOCHS = 3             # Reference uses 3 epochs
LEARNING_RATE = 1e-5       # Full FT uses lower LR than LoRA
PER_DEVICE_BATCH_SIZE = 1  # Small batch for VLM
GRAD_ACCUM_STEPS = 8       # Effective batch = 1 * 8 GPUs * 8 = 64
MAX_LENGTH = 4096          # Max total sequence length
FREEZE_VISION = False      # Reference: freeze_aligner=false


class PhysicsCoTDataset(Dataset):
    """Dataset for Qwen2.5-VL SFT with physics CoT."""

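    # Expected JSONL record shape (inferred from the access patterns in
    # __getitem__ below; not a schema shipped with this commit):
    # {"messages": [
    #   {"role": "user", "content": [
    #     {"type": "image", "image": "file:///path/to/figure.png"},
    #     {"type": "text", "text": "<question text>"}]},
    #   {"role": "assistant", "content": [
    #     {"type": "text", "text": "<chain-of-thought answer>"}]}]}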
    def __init__(self, data_path, processor, max_length=4096):
        self.processor = processor
        self.max_length = max_length

        with open(data_path, 'r', encoding='utf-8') as f:
            self.records = [json.loads(line) for line in f]

        print(f"Loaded {len(self.records)} records from {data_path}")

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        messages = record['messages']

        # Extract image path from user message
        user_msg = messages[0]
        image_path = None
        text_content = ""

        for content in user_msg['content']:
            if content['type'] == 'image':
                image_path = content['image'].replace('file://', '')
            elif content['type'] == 'text':
                text_content = content['text']

        # Extract assistant response
        assistant_msg = messages[1]
        assistant_text = assistant_msg['content'][0]['text']

        # Load image
        image = Image.open(image_path).convert('RGB')
        # Ensure minimum image size for Qwen2.5-VL vision encoder (factor=28).
        # Strategy: scale up proportionally (preserve aspect ratio), then pad with white.
        MIN_DIM = 56  # Must be >= 28; use 56 (2 * factor) for safety
        w, h = image.size
        if w < MIN_DIM or h < MIN_DIM:
            # Scale proportionally so the smaller dimension reaches MIN_DIM
            scale = max(MIN_DIM / w, MIN_DIM / h)
            new_w = int(w * scale)
            new_h = int(h * scale)
            image = image.resize((new_w, new_h), Image.LANCZOS)
            # Pad with white if either dimension is still < MIN_DIM (int() rounding safety)
            if new_w < MIN_DIM or new_h < MIN_DIM:
                padded = Image.new('RGB', (max(new_w, MIN_DIM), max(new_h, MIN_DIM)), (255, 255, 255))
                padded.paste(image, (0, 0))
                image = padded

        # Build conversation for apply_chat_template
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": text_content},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": assistant_text},
                ],
            },
        ]

        # Use processor to create inputs
        text = self.processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )

        inputs = self.processor(
            text=[text],
            images=[image],
            padding=False,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Squeeze batch dimension
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        # Create labels: mask user tokens (only train on the assistant response)
        labels = input_ids.clone()

        # Find the assistant turn start token sequence and mask everything before it
        assistant_token_str = "<|im_start|>assistant\n"
        assistant_token_ids = self.processor.tokenizer.encode(
            assistant_token_str, add_special_tokens=False
        )
        input_ids_list = input_ids.tolist()
        assistant_start = -1
        for i in range(len(input_ids_list) - len(assistant_token_ids) + 1):
            if input_ids_list[i:i + len(assistant_token_ids)] == assistant_token_ids:
                assistant_start = i + len(assistant_token_ids)
                break

        if assistant_start > 0:
            labels[:assistant_start] = -100  # Mask user prompt
        else:
            raise ValueError(f"FATAL: assistant start token not found in sample {idx}.")

        # Also mask padding
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
            'pixel_values': inputs['pixel_values'].squeeze(0) if 'pixel_values' in inputs else None,
            'image_grid_thw': inputs['image_grid_thw'].squeeze(0) if 'image_grid_thw' in inputs else None,
        }


class VLMDataCollator:
    """Custom data collator for variable-length VLM inputs."""

    def __init__(self, processor):
        self.processor = processor
        self.pad_token_id = processor.tokenizer.pad_token_id or processor.tokenizer.eos_token_id

    def __call__(self, features):
        max_len = max(f['input_ids'].size(0) for f in features)

        input_ids = []
        attention_mask = []
        labels = []
        pixel_values = []
        image_grid_thw = []

        for f in features:
            seq_len = f['input_ids'].size(0)
            pad_len = max_len - seq_len

            # Right-pad input_ids with pad_token_id, attention_mask with 0,
            # and labels with -100 so padding never contributes to the loss
            input_ids.append(torch.cat([
                f['input_ids'],
                torch.full((pad_len,), self.pad_token_id, dtype=f['input_ids'].dtype)
            ]))
            attention_mask.append(torch.cat([
                f['attention_mask'],
                torch.zeros(pad_len, dtype=f['attention_mask'].dtype)
            ]))
            labels.append(torch.cat([
                f['labels'],
                torch.full((pad_len,), -100, dtype=f['labels'].dtype)
            ]))

            if f.get('pixel_values') is not None:
                pixel_values.append(f['pixel_values'])
            if f.get('image_grid_thw') is not None:
                image_grid_thw.append(f['image_grid_thw'])

        batch = {
            'input_ids': torch.stack(input_ids),
            'attention_mask': torch.stack(attention_mask),
            'labels': torch.stack(labels),
        }

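        # Explanatory note (added comment): Qwen2.5-VL pixel_values are
        # flattened patch sequences with a per-image patch count, so they are
        # concatenated along dim 0 rather than stacked; image_grid_thw (one
        # (t, h, w) triple per image) lets the model split them back apart.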
        if pixel_values:
            batch['pixel_values'] = torch.cat(pixel_values, dim=0)
        if image_grid_thw:
            batch['image_grid_thw'] = torch.stack(image_grid_thw)

        return batch


def main():
    print(f"Loading model: {MODEL_NAME}")
    print(f"Data: {DATA_PATH}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"Full FT (no LoRA), Freeze Vision: {FREEZE_VISION}")
    print(f"Epochs: {NUM_EPOCHS}, LR: {LEARNING_RATE}, Batch: {PER_DEVICE_BATCH_SIZE} x {GRAD_ACCUM_STEPS}")

    # Load processor (higher resolution matching 7B reference)
    processor = AutoProcessor.from_pretrained(
        MODEL_NAME,
        min_pixels=3136,     # 56 x 56
        max_pixels=1204224,  # ~1100 x 1100, matching reference MAX_PIXELS
    )

    # Load model
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
    )

    # Vision encoder: NOT frozen (matching reference freeze_aligner=false)
    if FREEZE_VISION:
        for name, param in model.named_parameters():
            if 'visual' in name:
                param.requires_grad = False
        print("Froze vision encoder parameters")
    else:
        print("Vision encoder is trainable (matching 7B reference)")

    # Full fine-tuning: enable input grads for gradient checkpointing
    model.enable_input_require_grads()

    # Create dataset
    dataset = PhysicsCoTDataset(data_path=DATA_PATH, processor=processor, max_length=MAX_LENGTH)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        learning_rate=LEARNING_RATE,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,  # Matching reference
        weight_decay=0.01,
        bf16=True,
        logging_steps=10,
        save_strategy="steps",
        save_steps=20,       # Matching reference
        save_total_limit=2,  # Matching reference
        eval_steps=20,       # Matching reference (inert here: no eval dataset or evaluation strategy is configured)
        dataloader_num_workers=4,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        remove_unused_columns=False,
        report_to="none",
        deepspeed="ds_zero2.json",  # DeepSpeed ZeRO-2 for full FT
        save_only_model=True,       # Matching reference
    )

    # Collator
    collator = VLMDataCollator(processor)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=collator,
    )

    # Train
    print("\n===== Starting SFT Training (Full FT, aligned with 7B reference) =====")
    trainer.train()

    # Save final model
    print("\n===== Saving final model =====")
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    processor.save_pretrained(os.path.join(OUTPUT_DIR, "final"))
    print(f"Final model saved to: {os.path.join(OUTPUT_DIR, 'final')}")

    print("\n===== SFT Training Complete =====")


if __name__ == "__main__":
    main()
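Note: the training arguments reference a DeepSpeed config file, ds_zero2.json, that is not included in this commit. As a hedged sketch only (assumed contents, not the repo's actual file), a minimal ZeRO-2 config compatible with the script's bf16, batch-size, and gradient-accumulation settings could look like:

{
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "bf16": {"enabled": "auto"},
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": true,
        "contiguous_gradients": true
    }
}

The "auto" values let the Hugging Face Trainer fill in the matching TrainingArguments at launch. With a config like this, the script would typically be started through the DeepSpeed launcher, e.g. deepspeed --num_gpus 8 scripts/train_sft.py, consistent with the 8-GPU effective-batch comment in the script.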