Spaces:

mindchain
/

rlm-arithmetic-training

Runtime error

App Files Files Community

rlm-arithmetic-training / train_arithmetic_v2.py

mindchain

Upload train_arithmetic_v2.py with huggingface_hub

74c1152 verified 26 days ago

raw

history blame contribute delete

9.77 kB

	#!/usr/bin/env python3
	"""
	GRPO + RLVR Training for Simple Arithmetic - v2
	Task: 2-digit addition and subtraction
	Base Model: Qwen/Qwen3-0.6B-Base

	Improvements:
	- Better reward function with debugging
	- Force EOS token in generation
	- Per-step evaluation
	- Clear tracking metrics
	"""

	import os
	import re
	import random
	import torch
	from datasets import Dataset
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from trl import GRPOConfig, GRPOTrainer

	# ============================================================================
	# CONFIG
	# ============================================================================

	# Model id passed to AutoTokenizer/AutoModelForCausalLM.from_pretrained in main().
	BASE_MODEL = "Qwen/Qwen3-0.6B-Base"
	# Hub repository id that main() pushes the trained model and tokenizer to.
	OUTPUT_MODEL = "mindchain/qwen3-0.6b-arithmetic-v2"
	# Passed to GRPOConfig as both max_steps and save_steps in main().
	MAX_STEPS = 20
	# Number of training problems generated in main().
	NUM_SAMPLES = 500
	# Default number of problems generated per evaluate_model() call.
	EVAL_SAMPLES = 20
	EVAL_EVERY = 5 # Evaluate every N steps (default interval for EvalCallback)

	# ============================================================================
	# DATA GENERATION
	# ============================================================================

	def generate_arithmetic_samples(n_samples):
	    """Build ``n_samples`` random two-digit addition/subtraction problems.

	    Each problem is a dict holding a ``prompt`` string plus the expected
	    result as text under both ``answer`` and ``ground_truth``. Subtraction
	    operands are drawn so that the result is always positive.
	    """
	    dataset = []
	    for _ in range(n_samples):
	        operator = random.choice(['+', '-'])
	        is_addition = operator == '+'

	        # The left operand is drawn first; for subtraction the right operand
	        # is then capped strictly below it.
	        lhs = random.randint(10, 99) if is_addition else random.randint(20, 99)
	        rhs = random.randint(10, 99) if is_addition else random.randint(10, lhs - 1)
	        result = lhs + rhs if is_addition else lhs - rhs

	        expected = str(result)
	        dataset.append({
	            'prompt': f"Solve: {lhs} {operator} {rhs} = ?\nAnswer:",
	            'answer': expected,
	            'ground_truth': expected,  # same value under a second column name
	        })

	    return dataset

	# ============================================================================
	# REWARD FUNCTION (with debugging)
	# ============================================================================

	def reward_func(completions, prompts=None, **kwargs):
	    """Score completions: 1.0 when the last number equals the ground truth.

	    Args:
	        completions: Generated outputs, one per sample. An item may be a plain
	            string or a list of messages (conversational format); message
	            dicts contribute their ``content`` value, anything else is
	            stringified.
	        prompts: Accepted for call compatibility; not used.
	        **kwargs: Extra columns. The ground truth is read from the first of
	            ``answer``, ``ground_truth``, ``solution`` or ``label`` that is
	            present and not ``None``.

	    Returns:
	        A list with one float per completion: 1.0 when the last number found
	        in the completion is string-equal to the stripped ground truth, 0.0
	        otherwise. All zeros when no ground-truth column is found.
	    """
	    # Try multiple column names for ground truth
	    answers = next(
	        (
	            kwargs[key]
	            for key in ('answer', 'ground_truth', 'solution', 'label')
	            if kwargs.get(key) is not None
	        ),
	        None,
	    )

	    if answers is None:
	        print("⚠️ WARNING: No ground truth found in kwargs!")
	        print(f" Available keys: {list(kwargs.keys())}")
	        return [0.0] * len(completions)

	    # A fractional part requires digits after the dot, so the full stop of a
	    # sentence ending in "... 42." is not captured as part of the number.
	    number_pattern = re.compile(r'-?\d+(?:\.\d+)?')

	    rewards = []
	    debug_samples = min(2, len(completions))  # Debug first 2 samples

	    for i, (completion, truth) in enumerate(zip(completions, answers)):
	        # Handle list format (conversational)
	        if isinstance(completion, list):
	            text = " ".join(
	                m.get('content', '') if isinstance(m, dict) else str(m)
	                for m in completion
	            )
	        else:
	            text = str(completion)

	        # Extract the last number
	        numbers = number_pattern.findall(text)
	        predicted = numbers[-1] if numbers else ""

	        # Exact match reward
	        is_correct = predicted == str(truth).strip()
	        rewards.append(1.0 if is_correct else 0.0)

	        # Debug first few samples
	        if i < debug_samples:
	            status = "✅" if is_correct else "❌"
	            print(f" [{i+1}] {status} Truth={truth} | Pred={predicted} | Text={text[:80]}...")

	    # zip() stops at the shorter input; keep one reward per completion so the
	    # caller never receives a list shorter than what it passed in.
	    missing = len(completions) - len(rewards)
	    if missing > 0:
	        print(f"⚠️ WARNING: {missing} completions had no ground truth; scored 0.0")
	        rewards.extend([0.0] * missing)

	    return rewards

	# ============================================================================
	# EVALUATION
	# ============================================================================

	def evaluate_model(model, tokenizer, n_samples=EVAL_SAMPLES, step=0):
	    """Measure greedy-decoding accuracy on freshly generated problems.

	    Args:
	        model: Causal LM exposing ``generate``, ``eval`` and ``train``.
	        tokenizer: Tokenizer used to encode prompts and decode generations.
	        n_samples: Number of problems to generate and score.
	        step: Training step shown in the printed report header.

	    Returns:
	        Accuracy as a percentage (0.0 when no samples were evaluated). A
	        prediction counts as correct when the last number in the generated
	        continuation is string-equal to the sample's ``answer``.
	    """
	    banner = '=' * 70
	    print(f"\n{banner}")
	    print(f"📊 EVALUATION @ Step {step}")
	    print(f"{banner}")

	    test_samples = generate_arithmetic_samples(n_samples)
	    total = len(test_samples)
	    correct = 0

	    # A fractional part requires digits after the dot, so a sentence-ending
	    # full stop ("... 42.") is not captured as part of the number.
	    number_pattern = re.compile(r'-?\d+(?:\.\d+)?')

	    # Remember the current mode so evaluation does not flip a model that was
	    # already in eval mode into training mode when it finishes.
	    was_training = getattr(model, 'training', True)
	    device = getattr(model, 'device', None)

	    model.eval()
	    try:
	        with torch.no_grad():
	            for i, sample in enumerate(test_samples):
	                inputs = tokenizer(sample['prompt'], return_tensors='pt')

	                if device is not None:
	                    inputs = {k: v.to(device) for k, v in inputs.items()}

	                outputs = model.generate(
	                    **inputs,
	                    max_new_tokens=30,
	                    do_sample=False,
	                    pad_token_id=tokenizer.eos_token_id,
	                    eos_token_id=tokenizer.eos_token_id,
	                )

	                # Decode only the continuation when the prompt length is
	                # known; otherwise fall back to the whole sequence.
	                input_ids = inputs.get('input_ids')
	                if input_ids is not None and hasattr(input_ids, 'shape'):
	                    generated = outputs[0][input_ids.shape[1]:]
	                else:
	                    generated = outputs[0]
	                response = tokenizer.decode(generated, skip_special_tokens=True)

	                # Extract answer
	                numbers = number_pattern.findall(response)
	                predicted = numbers[-1] if numbers else ""
	                truth = sample['answer'].strip()

	                is_correct = predicted == truth
	                if is_correct:
	                    correct += 1

	                status = "✅" if is_correct else "❌"
	                print(f"[{i+1}] {status} {truth} | Pred: {predicted} | {response[:40]}...")
	    finally:
	        # Restore training mode even if generation raised.
	        if was_training:
	            model.train()

	    # Guard against division by zero when no samples were requested.
	    accuracy = correct / total * 100 if total else 0.0
	    print(f"\n📊 Accuracy: {accuracy:.1f}% ({correct}/{total})")
	    print(f"{banner}\n")

	    return accuracy

	# ============================================================================
	# CALLBACK FOR PER-STEP EVAL
	# ============================================================================

	from transformers import TrainerCallback

	class EvalCallback(TrainerCallback):
	    """Trainer callback that runs ``evaluate_model`` at a fixed step interval.

	    Each result is stored in ``self.accuracies`` as a
	    ``(global_step, accuracy)`` tuple, and the whole history is printed
	    after every evaluation.
	    """

	    def __init__(self, model, tokenizer, eval_every=EVAL_EVERY):
	        self.accuracies = []
	        self.eval_every = eval_every
	        self.tokenizer = tokenizer
	        self.model = model

	    def on_step_end(self, args, state, control, **kwargs):
	        """Evaluate when ``state.global_step`` is a positive multiple of ``eval_every``."""
	        current_step = state.global_step
	        if current_step <= 0 or current_step % self.eval_every != 0:
	            return

	        score = evaluate_model(self.model, self.tokenizer, step=current_step)
	        self.accuracies.append((current_step, score))

	        # Print summary
	        print("\n📈 Progress Summary:")
	        for past_step, past_score in self.accuracies:
	            print(f" Step {past_step}: {past_score:.1f}%")
	        print()

	# ============================================================================
	# MAIN TRAINING
	# ============================================================================

	def main():
	    """Run the GRPO arithmetic experiment end to end.

	    Loads the base model and tokenizer, measures baseline accuracy, trains
	    with GRPO on generated problems scored by ``reward_func`` (evaluating
	    periodically through ``EvalCallback``), prints a before/after summary
	    and pushes the trained model and tokenizer to ``OUTPUT_MODEL``.
	    """
	    has_cuda = torch.cuda.is_available()
	    use_bf16 = has_cuda and torch.cuda.is_bf16_supported()

	    print("="*70)
	    print("🔢 GRPO + RLVR Arithmetic Training - v2")
	    print("="*70)
	    print(f"Base Model: {BASE_MODEL}")
	    print(f"Output: {OUTPUT_MODEL}")
	    print(f"Steps: {MAX_STEPS}")
	    print(f"Eval every: {EVAL_EVERY} steps")
	    print(f"Device: {'cuda' if has_cuda else 'cpu'}")
	    print("="*70 + "\n")

	    # Load model and tokenizer
	    print("📦 Loading model and tokenizer...")
	    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

	    # Ensure pad token is set
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token
	        print(f" Set pad_token to eos_token: {tokenizer.eos_token}")

	    # Keep the weight dtype consistent with the training precision below:
	    # bfloat16 when the trainer runs in bf16, full precision otherwise.
	    # float16 weights with fp16=False would be updated in pure half precision
	    # without loss scaling, which is prone to overflow/underflow.
	    model = AutoModelForCausalLM.from_pretrained(
	        BASE_MODEL,
	        torch_dtype=torch.bfloat16 if use_bf16 else torch.float32,
	    )

	    # Resize embeddings only if the tokenizer outgrew them; a checkpoint's
	    # embedding matrix may be padded beyond the tokenizer size, and shrinking
	    # it would needlessly change the model's vocabulary dimension.
	    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
	        model.resize_token_embeddings(len(tokenizer))

	    # Initial evaluation
	    initial_acc = evaluate_model(model, tokenizer, step=0)

	    # Generate training data
	    print("📊 Generating training data...")
	    train_samples = generate_arithmetic_samples(NUM_SAMPLES)
	    train_dataset = Dataset.from_list(train_samples)
	    print(f"✅ {len(train_dataset)} training samples\n")

	    # GRPO Config
	    is_cpu = not has_cuda
	    training_args = GRPOConfig(
	        output_dir="./outputs",
	        max_steps=MAX_STEPS,
	        per_device_train_batch_size=2,
	        num_generations=2,
	        learning_rate=2e-4,
	        beta=0.0,  # No KL penalty for arithmetic
	        bf16=use_bf16,
	        fp16=False,
	        gradient_checkpointing=not is_cpu,
	        optim="adamw_torch" if is_cpu else "adamw_8bit",
	        logging_steps=1,
	        save_steps=MAX_STEPS,
	        push_to_hub=False,  # the explicit push below handles the upload
	        report_to="none",
	    )

	    # Eval callback
	    eval_callback = EvalCallback(model, tokenizer, eval_every=EVAL_EVERY)

	    print("🚀 Starting GRPO Training...")
	    print(f"Initial accuracy: {initial_acc:.1f}%\n")

	    # Train
	    trainer = GRPOTrainer(
	        model=model,
	        args=training_args,
	        train_dataset=train_dataset,
	        reward_funcs=[reward_func],
	        callbacks=[eval_callback],
	    )

	    trainer.train()

	    # Final evaluation
	    final_acc = evaluate_model(model, tokenizer, step=MAX_STEPS)

	    # Summary
	    print("\n" + "="*70)
	    print("📊 FINAL RESULTS")
	    print("="*70)
	    print(f"Initial Accuracy: {initial_acc:.1f}%")
	    print(f"Final Accuracy: {final_acc:.1f}%")
	    print(f"Improvement: {final_acc - initial_acc:+.1f}%")
	    print()
	    print("📈 Training Progress:")
	    for step, acc in eval_callback.accuracies:
	        print(f" Step {step}: {acc:.1f}%")
	    print("="*70)

	    # Save to Hub
	    print(f"\n📦 Pushing to Hub: {OUTPUT_MODEL}")
	    trainer.model.push_to_hub(OUTPUT_MODEL)
	    tokenizer.push_to_hub(OUTPUT_MODEL)
	    print(f"✅ Model pushed to: https://huggingface.co/{OUTPUT_MODEL}")

	# Run the training pipeline only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()