| """ | |
| CPU-Friendly Training Script for GLM-4.5V CAD Generation | |
| Simplified version for testing and development | |
| """ | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer | |
| from peft import LoraConfig, get_peft_model, TaskType | |
| from PIL import Image | |
| import json | |
| import os | |
| from dataclasses import dataclass | |
| from typing import Dict, List | |
# Simple configuration for CPU testing
CONFIG = {
    "base_model": "microsoft/DialoGPT-small",  # Small model for CPU testing
    "dataset_name": "CADCODER/GenCAD-Code",
    "output_dir": "./test-cad-model",
    "max_samples": 50,  # Very small for CPU
    "batch_size": 1,
    "gradient_accumulation": 4,
    "epochs": 1,
    "learning_rate": 5e-5,
    "max_length": 512
}

@dataclass
class SimpleDataCollator:
    """Simple data collator for text-only training."""
    tokenizer: Any
    max_length: int = 512

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        # Extract texts
        texts = [f["text"] for f in features]
        # Tokenize
        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        # Create labels for causal LM; mask padding so the loss ignores it.
        # Note: because pad_token is set to eos_token in setup_simple_model,
        # genuine EOS positions are masked too -- acceptable for a smoke test.
        batch["labels"] = batch["input_ids"].clone()
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100
        return batch

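# A minimal usage sketch for the collator, assuming a GPT-2-style tokenizer
# whose pad_token has been set to eos_token (as setup_simple_model does
# below). _demo_collator is a hypothetical helper, not called anywhere.
def _demo_collator(tokenizer) -> None:
    collator = SimpleDataCollator(tokenizer=tokenizer, max_length=32)
    batch = collator([{"text": "short"}, {"text": "a somewhat longer example"}])
    # Sequences are padded to a common length; padded label positions are
    # -100, so the cross-entropy loss skips them.
    print(batch["input_ids"].shape)  # e.g. torch.Size([2, 6])
    print(batch["labels"])
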
def prepare_simple_dataset(dataset_name: str, max_samples: int = 50):
    """Prepare a simplified text-only dataset for CPU training."""
    print(f"Loading dataset: {dataset_name}")
    try:
        # Load small subset
        dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")

        def create_text_examples(examples):
            """Convert to text-only format."""
            texts = []
            for i in range(len(examples["code"])):
                # Create simple prompt-response format
                text = f"Generate CADQuery code:\n{examples['code'][i]}<|endoftext|>"
                texts.append(text)
            return {"text": texts}

        # Process dataset
        dataset = dataset.map(
            create_text_examples,
            batched=True,
            remove_columns=dataset.column_names
        )
        print(f"Dataset prepared: {len(dataset)} samples")
        return dataset
    except Exception as e:
        print(f"Dataset loading failed: {e}")
        # Create dummy dataset for testing
        print("Creating dummy dataset for testing...")
        dummy_codes = [
            "import cadquery as cq\nresult = cq.Workplane('XY').box(10, 10, 5)",
            "import cadquery as cq\nresult = cq.Workplane('XY').cylinder(5, 10)",
            "import cadquery as cq\nresult = cq.Workplane('XY').box(20, 15, 8).fillet(2)",
        ]
        texts = [f"Generate CADQuery code:\n{code}<|endoftext|>" for code in dummy_codes]
        from datasets import Dataset
        dataset = Dataset.from_dict({"text": texts * (max_samples // 3 + 1)})
        dataset = dataset.select(range(max_samples))
        print(f"Dummy dataset created: {len(dataset)} samples")
        return dataset

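# Each training example is a single prompt + code string, e.g.:
#   "Generate CADQuery code:\nimport cadquery as cq\nresult = ...<|endoftext|>"
# <|endoftext|> is the GPT-2/DialoGPT EOS token, so the model learns where a
# completed CADQuery snippet ends.
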
def setup_simple_model(model_name: str):
    """Set up a simple model for CPU training."""
    print(f"Loading model: {model_name}")
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Add pad token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Load model for CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu"
    )
    # Simple LoRA config for CPU
    lora_config = LoraConfig(
        r=8,  # Small rank for CPU
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["c_attn", "c_proj"]  # DialoGPT (GPT-2-style) modules
    )
    # Apply LoRA
    model = get_peft_model(model, lora_config)
    # Print parameter counts
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
    return model, tokenizer

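# The target_modules above match GPT-2-style checkpoints such as DialoGPT.
# Other architectures name their attention projections differently; the list
# below reflects common conventions (an assumption -- verify per checkpoint):
#   LLaMA/Mistral-style: ["q_proj", "k_proj", "v_proj", "o_proj"]
# To inspect a model's module names directly:
#   for name, _ in model.named_modules():
#       print(name)
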
def train_simple_model(model, tokenizer, dataset, config):
    """Train the model with simple settings."""
    print("Starting CPU training...")
    # Training arguments for CPU
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        per_device_train_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation"],
        num_train_epochs=config["epochs"],
        learning_rate=config["learning_rate"],
        warmup_steps=10,
        logging_steps=5,
        save_steps=100,
        eval_strategy="no",  # "evaluation_strategy" in transformers < 4.41
        save_total_limit=1,
        remove_unused_columns=False,
        report_to="none",
        fp16=False,  # No FP16 on CPU
        dataloader_pin_memory=False,
        use_cpu=True
    )
    # Data collator
    data_collator = SimpleDataCollator(
        tokenizer=tokenizer,
        max_length=config["max_length"]
    )
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        tokenizer=tokenizer  # "processing_class" in newer transformers versions
    )
    # Train
    print("Training will take a few minutes on CPU...")
    trainer.train()
    # Save
    trainer.save_model()
    tokenizer.save_pretrained(config["output_dir"])
    print(f"Training complete! Model saved to {config['output_dir']}")
    return trainer

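# Note: for a PEFT-wrapped model, trainer.save_model() writes only the LoRA
# adapter (adapter_config.json + adapter weights), not the merged base model.
# Recent transformers releases detect the adapter config and reload base
# model + adapter automatically in test_simple_model() below; on older
# versions, load the base model and attach the adapter with
# peft.PeftModel.from_pretrained instead.
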
def test_simple_model(model_path: str):
    """Test the trained model."""
    print(f"Testing model: {model_path}")
    try:
        # Load model
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)
        # Test generation
        prompt = "Generate CADQuery code:"
        inputs = tokenizer.encode(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Generated:")
        print(generated)
        return generated
    except Exception as e:
        print(f"Testing failed: {e}")
        return str(e)

def main():
    """Main training pipeline for CPU."""
    print("Starting CPU Training Pipeline")
    print("=" * 50)
    try:
        # 1. Prepare dataset
        print("\nStep 1: Preparing dataset...")
        dataset = prepare_simple_dataset(CONFIG["dataset_name"], CONFIG["max_samples"])
        # 2. Set up model
        print("\nStep 2: Setting up model...")
        model, tokenizer = setup_simple_model(CONFIG["base_model"])
        # 3. Train
        print("\nStep 3: Training...")
        train_simple_model(model, tokenizer, dataset, CONFIG)
        # 4. Test
        print("\nStep 4: Testing...")
        test_simple_model(CONFIG["output_dir"])
        print("\nPipeline complete!")
        print(f"Model saved to: {CONFIG['output_dir']}")
        return True
    except Exception as e:
        print(f"\nPipeline failed: {e}")
        return False


if __name__ == "__main__":
    success = main()
    if success:
        print("\nNext steps:")
        print("1. Check the generated model in ./test-cad-model/")
        print("2. Run test_simple_model() to generate more examples")
        print("3. Once working, move to the GPU version")
    else:
        print("\nTroubleshooting:")
        print("1. Check internet connection for dataset download")
        print("2. Ensure you have enough disk space")
        print("3. Try reducing max_samples to 10")