"""
CPU-Friendly Training Script for GLM-4.5V CAD Generation
Simplified version for testing and development
"""
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from dataclasses import dataclass
from typing import Any, Dict, List
# Simple configuration for CPU testing
CONFIG = {
    "base_model": "microsoft/DialoGPT-small",  # Small model for CPU testing
    "dataset_name": "CADCODER/GenCAD-Code",
    "output_dir": "./test-cad-model",
    "max_samples": 50,  # Very small for CPU
    "batch_size": 1,
    "gradient_accumulation": 4,
    "epochs": 1,
    "learning_rate": 5e-5,
    "max_length": 512,
}
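# Effective batch size = batch_size * gradient_accumulation = 1 * 4 = 4
# sequences per optimizer step.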
@dataclass
class SimpleDataCollator:
    """Simple data collator for text-only training."""
    tokenizer: Any
    max_length: int = 512

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        # Extract texts
        texts = [f["text"] for f in features]
        # Tokenize
        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        # Create labels for causal LM; ignore pad positions in the loss
        batch["labels"] = batch["input_ids"].clone()
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100
        return batch
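# Caveat: because pad_token is aliased to eos_token in setup_simple_model, the
# label masking above also hides genuine <|endoftext|> targets from the loss.
# That is acceptable for a CPU smoke test; a real run would add a dedicated pad
# token instead. Hypothetical usage: SimpleDataCollator(tokenizer)([{"text": "..."}])
# returns a dict with "input_ids", "attention_mask", and "labels" tensors.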
def prepare_simple_dataset(dataset_name: str, max_samples: int = 50):
    """Prepare a simplified text-only dataset for CPU training."""
    print(f"πŸ“Š Loading dataset: {dataset_name}")
    try:
        # Load small subset
        dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")

        def create_text_examples(examples):
            """Convert to text-only format."""
            texts = []
            for i in range(len(examples["code"])):
                # Create simple prompt-response format
                text = f"Generate CADQuery code:\n{examples['code'][i]}<|endoftext|>"
                texts.append(text)
            return {"text": texts}

        # Process dataset
        dataset = dataset.map(
            create_text_examples,
            batched=True,
            remove_columns=dataset.column_names,
        )
        print(f"βœ… Dataset prepared: {len(dataset)} samples")
        return dataset
    except Exception as e:
        print(f"❌ Dataset loading failed: {e}")
        # Create dummy dataset for testing
        print("πŸ”„ Creating dummy dataset for testing...")
        dummy_codes = [
            "import cadquery as cq\nresult = cq.Workplane('XY').box(10, 10, 5)",
            "import cadquery as cq\nresult = cq.Workplane('XY').cylinder(5, 10)",
            "import cadquery as cq\nresult = cq.Workplane('XY').box(20, 15, 8).fillet(2)",
        ]
        texts = [f"Generate CADQuery code:\n{code}<|endoftext|>" for code in dummy_codes]
        from datasets import Dataset
        dataset = Dataset.from_dict({"text": texts * (max_samples // 3 + 1)})
        dataset = dataset.select(range(max_samples))
        print(f"βœ… Dummy dataset created: {len(dataset)} samples")
        return dataset
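# Each processed record carries a single field, e.g. (shape follows from the
# template above):
# {"text": "Generate CADQuery code:\nimport cadquery as cq\nresult = ...<|endoftext|>"}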
def setup_simple_model(model_name: str):
    """Set up a simple model for CPU training."""
    print(f"πŸ”§ Loading model: {model_name}")
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Add pad token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Load model for CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu",
    )
    # Simple LoRA config for CPU
    lora_config = LoraConfig(
        r=8,  # Small rank for CPU
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["c_attn", "c_proj"],  # DialoGPT (GPT-2-style) modules
    )
    # Apply LoRA
    model = get_peft_model(model, lora_config)
    # Print parameter counts
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"πŸ’‘ Trainable: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
    return model, tokenizer
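# Rough estimate (assuming DialoGPT-small matches GPT-2 small: 12 blocks,
# hidden size 768): LoRA at r=8 adds 8*(768+2304) params per block on c_attn
# and 8*(768+768) on c_proj, about 36.9K per block, so roughly 0.44M trainable
# out of ~124M total (~0.35%). The exact figures are printed above.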
def train_simple_model(model, tokenizer, dataset, config):
    """Train the model with simple settings."""
    print("πŸ‹οΈ Starting CPU training...")
    # Training arguments for CPU
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        per_device_train_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation"],
        num_train_epochs=config["epochs"],
        learning_rate=config["learning_rate"],
        warmup_steps=10,
        logging_steps=5,
        save_steps=100,
        evaluation_strategy="no",
        save_total_limit=1,
        remove_unused_columns=False,
        report_to="none",
        fp16=False,  # No FP16 on CPU
        dataloader_pin_memory=False,
        use_cpu=True,
    )
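    # Note: flag names track the installed transformers version. use_cpu needs a
    # fairly recent 4.x release, and evaluation_strategy was later renamed to
    # eval_strategy; adjust to match your environment.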
    # Data collator
    data_collator = SimpleDataCollator(
        tokenizer=tokenizer,
        max_length=config["max_length"],
    )
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    # Train
    print("⏳ Training will take a few minutes on CPU...")
    trainer.train()
    # Save (for a PEFT model this writes the adapter weights)
    trainer.save_model()
    tokenizer.save_pretrained(config["output_dir"])
    print(f"βœ… Training complete! Model saved to {config['output_dir']}")
    return trainer
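# With the defaults above (50 samples, batch size 1, gradient accumulation 4,
# 1 epoch), expect roughly 50 / 4 β‰ˆ 12 optimizer steps, so warmup_steps=10
# covers most of the run.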
def test_simple_model(model_path: str):
    """Test the trained model."""
    print(f"πŸ§ͺ Testing model: {model_path}")
    try:
        # Load model
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)
        # Test generation; pass attention_mask along with input_ids to avoid
        # a generate() warning
        prompt = "Generate CADQuery code:"
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("🎯 Generated:")
        print(generated)
        return generated
    except Exception as e:
        print(f"❌ Testing failed: {e}")
        return str(e)
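# Note: the saved directory holds a PEFT adapter; on recent transformers with
# peft installed, AutoModelForCausalLM.from_pretrained resolves the base
# weights from adapter_config.json automatically. Standalone usage:
# test_simple_model("./test-cad-model") samples another completion without retraining.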
def main():
    """Main training pipeline for CPU."""
    print("πŸš€ Starting CPU Training Pipeline")
    print("=" * 50)
    try:
        # 1. Prepare dataset
        print("\nπŸ“Š Step 1: Preparing dataset...")
        dataset = prepare_simple_dataset(CONFIG["dataset_name"], CONFIG["max_samples"])
        # 2. Set up model
        print("\nπŸ”§ Step 2: Setting up model...")
        model, tokenizer = setup_simple_model(CONFIG["base_model"])
        # 3. Train
        print("\nπŸ‹οΈ Step 3: Training...")
        trainer = train_simple_model(model, tokenizer, dataset, CONFIG)
        # 4. Test
        print("\nπŸ§ͺ Step 4: Testing...")
        test_simple_model(CONFIG["output_dir"])
        print("\nπŸŽ‰ Pipeline complete!")
        print(f"Model saved to: {CONFIG['output_dir']}")
        return True
    except Exception as e:
        print(f"\n❌ Pipeline failed: {e}")
        return False
if __name__ == "__main__":
    success = main()
    if success:
        print("\nπŸ“ Next steps:")
        print("1. Check the generated model in ./test-cad-model/")
        print("2. Run test_simple_model() to generate more examples")
        print("3. Once working, move to the GPU version")
    else:
        print("\nπŸ”§ Troubleshooting:")
        print("1. Check internet connection for dataset download")
        print("2. Ensure you have enough disk space")
        print("3. Try reducing max_samples to 10")