"""
CPU-Friendly Training Script for GLM-4.5V CAD Generation
Simplified version for testing and development
"""
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from dataclasses import dataclass
from typing import Any, Dict, List
# Simple configuration for CPU testing
CONFIG = {
    "base_model": "microsoft/DialoGPT-small",  # Small model for CPU testing
    "dataset_name": "CADCODER/GenCAD-Code",
    "output_dir": "./test-cad-model",
    "max_samples": 50,  # Very small for CPU
    "batch_size": 1,
    "gradient_accumulation": 4,
    "epochs": 1,
    "learning_rate": 5e-5,
    "max_length": 512,
}
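# Effective batch size = batch_size * gradient_accumulation = 1 * 4 = 4
# sequences per optimizer step.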
@dataclass
class SimpleDataCollator:
    """Simple data collator for text-only training."""
    tokenizer: Any
    max_length: int = 512

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        # Extract texts
        texts = [f["text"] for f in features]
        # Tokenize
        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        # Create labels for causal LM; ignore pad positions in the loss
        batch["labels"] = batch["input_ids"].clone()
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100
        return batch
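# Caveat: because pad_token is aliased to eos_token in setup_simple_model, the
# label masking above also hides genuine <|endoftext|> targets from the loss.
# That is acceptable for a CPU smoke test; a real run would add a dedicated pad
# token instead. Hypothetical usage: SimpleDataCollator(tokenizer)([{"text": "..."}])
# returns a dict with "input_ids", "attention_mask", and "labels" tensors.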
def prepare_simple_dataset(dataset_name: str, max_samples: int = 50):
    """Prepare a simplified text-only dataset for CPU training."""
    print(f"πŸ“Š Loading dataset: {dataset_name}")
    try:
        # Load small subset
        dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")

        def create_text_examples(examples):
            """Convert to text-only format."""
            texts = []
            for i in range(len(examples["code"])):
                # Create simple prompt-response format
                text = f"Generate CADQuery code:\n{examples['code'][i]}<|endoftext|>"
                texts.append(text)
            return {"text": texts}

        # Process dataset
        dataset = dataset.map(
            create_text_examples,
            batched=True,
            remove_columns=dataset.column_names,
        )
        print(f"βœ… Dataset prepared: {len(dataset)} samples")
        return dataset
    except Exception as e:
        print(f"❌ Dataset loading failed: {e}")
        # Create dummy dataset for testing
        print("πŸ”„ Creating dummy dataset for testing...")
        dummy_codes = [
            "import cadquery as cq\nresult = cq.Workplane('XY').box(10, 10, 5)",
            "import cadquery as cq\nresult = cq.Workplane('XY').cylinder(5, 10)",
            "import cadquery as cq\nresult = cq.Workplane('XY').box(20, 15, 8).fillet(2)",
        ]
        texts = [f"Generate CADQuery code:\n{code}<|endoftext|>" for code in dummy_codes]
        from datasets import Dataset
        dataset = Dataset.from_dict({"text": texts * (max_samples // 3 + 1)})
        dataset = dataset.select(range(max_samples))
        print(f"βœ… Dummy dataset created: {len(dataset)} samples")
        return dataset
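# Each processed record carries a single field, e.g. (shape follows from the
# template above):
# {"text": "Generate CADQuery code:\nimport cadquery as cq\nresult = ...<|endoftext|>"}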
def setup_simple_model(model_name: str):
    """Set up a simple model for CPU training."""
    print(f"πŸ”§ Loading model: {model_name}")
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Add pad token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Load model for CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu",
    )
    # Simple LoRA config for CPU
    lora_config = LoraConfig(
        r=8,  # Small rank for CPU
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["c_attn", "c_proj"],  # DialoGPT (GPT-2-style) modules
    )
    # Apply LoRA
    model = get_peft_model(model, lora_config)
    # Print parameter counts
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"πŸ’‘ Trainable: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
    return model, tokenizer
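# Rough estimate (assuming DialoGPT-small matches GPT-2 small: 12 blocks,
# hidden size 768): LoRA at r=8 adds 8*(768+2304) params per block on c_attn
# and 8*(768+768) on c_proj, about 36.9K per block, so roughly 0.44M trainable
# out of ~124M total (~0.35%). The exact figures are printed above.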
def train_simple_model(model, tokenizer, dataset, config):
    """Train the model with simple settings."""
    print("πŸ‹οΈ Starting CPU training...")
    # Training arguments for CPU
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        per_device_train_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation"],
        num_train_epochs=config["epochs"],
        learning_rate=config["learning_rate"],
        warmup_steps=10,
        logging_steps=5,
        save_steps=100,
        evaluation_strategy="no",
        save_total_limit=1,
        remove_unused_columns=False,
        report_to="none",
        fp16=False,  # No FP16 on CPU
        dataloader_pin_memory=False,
        use_cpu=True,
    )
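    # Note: flag names track the installed transformers version. use_cpu needs a
    # fairly recent 4.x release, and evaluation_strategy was later renamed to
    # eval_strategy; adjust to match your environment.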
    # Data collator
    data_collator = SimpleDataCollator(
        tokenizer=tokenizer,
        max_length=config["max_length"],
    )
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    # Train
    print("⏳ Training will take a few minutes on CPU...")
    trainer.train()
    # Save (for a PEFT model this writes the adapter weights)
    trainer.save_model()
    tokenizer.save_pretrained(config["output_dir"])
    print(f"βœ… Training complete! Model saved to {config['output_dir']}")
    return trainer
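# With the defaults above (50 samples, batch size 1, gradient accumulation 4,
# 1 epoch), expect roughly 50 / 4 β‰ˆ 12 optimizer steps, so warmup_steps=10
# covers most of the run.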
def test_simple_model(model_path: str):
    """Test the trained model."""
    print(f"πŸ§ͺ Testing model: {model_path}")
    try:
        # Load model
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)
        # Test generation; pass attention_mask along with input_ids to avoid
        # a generate() warning
        prompt = "Generate CADQuery code:"
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("🎯 Generated:")
        print(generated)
        return generated
    except Exception as e:
        print(f"❌ Testing failed: {e}")
        return str(e)
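# Note: the saved directory holds a PEFT adapter; on recent transformers with
# peft installed, AutoModelForCausalLM.from_pretrained resolves the base
# weights from adapter_config.json automatically. Standalone usage:
# test_simple_model("./test-cad-model") samples another completion without retraining.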
def main():
    """Main training pipeline for CPU."""
    print("πŸš€ Starting CPU Training Pipeline")
    print("=" * 50)
    try:
        # 1. Prepare dataset
        print("\nπŸ“Š Step 1: Preparing dataset...")
        dataset = prepare_simple_dataset(CONFIG["dataset_name"], CONFIG["max_samples"])
        # 2. Set up model
        print("\nπŸ”§ Step 2: Setting up model...")
        model, tokenizer = setup_simple_model(CONFIG["base_model"])
        # 3. Train
        print("\nπŸ‹οΈ Step 3: Training...")
        trainer = train_simple_model(model, tokenizer, dataset, CONFIG)
        # 4. Test
        print("\nπŸ§ͺ Step 4: Testing...")
        test_simple_model(CONFIG["output_dir"])
        print("\nπŸŽ‰ Pipeline complete!")
        print(f"Model saved to: {CONFIG['output_dir']}")
        return True
    except Exception as e:
        print(f"\n❌ Pipeline failed: {e}")
        return False
if __name__ == "__main__":
    success = main()
    if success:
        print("\nπŸ“ Next steps:")
        print("1. Check the generated model in ./test-cad-model/")
        print("2. Run test_simple_model() to generate more examples")
        print("3. Once working, move to the GPU version")
    else:
        print("\nπŸ”§ Troubleshooting:")
        print("1. Check internet connection for dataset download")
        print("2. Ensure you have enough disk space")
        print("3. Try reducing max_samples to 10")