# ============================================================
# NanoCalc 1M (Mini Math Model) - T5 Seq2Seq
# ============================================================
# pip install transformers torch datasets accelerate
import random
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import T5Config, T5ForConditionalGeneration
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import time
# ============================================================
# 1. CONFIG
# ============================================================
TRAIN_SAMPLES = 2_000_000
VAL_SAMPLES = 10_000
MAX_DIGITS = 3
BATCH_SIZE = 512
EPOCHS = 10
LR = 3e-4
MAX_INPUT_LEN = 20
MAX_TARGET_LEN = 12
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAVE_PATH = "model.pt"
print(f"Device: {DEVICE}")
print(f"GPU: {torch.cuda.get_device_name(0) if DEVICE == 'cuda' else 'None'}")
# ============================================================
# 2. TOKENIZER (Character-Level)
# ============================================================
CHARS = list("0123456789+-*/=") + ["<pad>", "<bos>", "<eos>"]
char2id = {c: i for i, c in enumerate(CHARS)}
id2char = {i: c for c, i in char2id.items()}
PAD_ID = char2id["<pad>"]
BOS_ID = char2id["<bos>"]
EOS_ID = char2id["<eos>"]
VOCAB_SIZE = len(CHARS)
def encode(text, max_len, add_bos=False, add_eos=True):
    tokens = []
    if add_bos:
        tokens.append(BOS_ID)
    for c in text:
        tokens.append(char2id.get(c, PAD_ID))
    if add_eos:
        tokens.append(EOS_ID)
    # Truncate, then pad to a fixed length
    tokens = tokens[:max_len]
    tokens += [PAD_ID] * (max_len - len(tokens))
    return tokens
def decode(token_ids):
    result = []
    for tid in token_ids:
        if tid == EOS_ID:
            break
        if tid in (PAD_ID, BOS_ID):
            continue
        result.append(id2char.get(tid, "?"))
    return "".join(result)
# ============================================================
# 3. DATA GENERATION
# ============================================================
def generate_sample(max_digits=3):
    op = random.choice(["+", "-", "*", "/"])
    if op == "+":
        a = random.randint(0, 10**max_digits - 1)
        b = random.randint(0, 10**max_digits - 1)
        result = a + b
    elif op == "-":
        a = random.randint(0, 10**max_digits - 1)
        b = random.randint(0, 10**max_digits - 1)
        result = a - b
    elif op == "*":
        # Smaller operands for multiplication to keep products short
        a = random.randint(0, 10**(max_digits-1) - 1)
        b = random.randint(0, 10**(max_digits-1) - 1)
        result = a * b
    elif op == "/":
        # Build the dividend as divisor * quotient so division is always exact
        b = random.randint(1, 10**(max_digits-1) - 1)
        result = random.randint(0, 10**(max_digits-1) - 1)
        a = b * result
    input_str = f"{a}{op}{b}="
    target_str = str(result)
    return input_str, target_str
def generate_dataset(n_samples, max_digits=3):
    inputs, targets = [], []
    for _ in range(n_samples):
        inp, tgt = generate_sample(max_digits)
        inputs.append(inp)
        targets.append(tgt)
    return inputs, targets
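# A few illustrative draws from the generator (random; shown only to make the
# "<a><op><b>=" / "<result>" format explicit before the full generation run below),
# e.g. ("123+456=", "579"), ("72*9=", "648"), ("640/8=", "80").
for _ in range(3):
    print("  sample:", generate_sample(MAX_DIGITS))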
print("Generating training data...")
t0 = time.time()
train_inputs, train_targets = generate_dataset(TRAIN_SAMPLES, MAX_DIGITS)
val_inputs, val_targets = generate_dataset(VAL_SAMPLES, MAX_DIGITS)
print(f"Done in {time.time()-t0:.1f}s")
print(f"Sample: '{train_inputs[0]}' → '{train_targets[0]}'")
# ============================================================
# 4. DATASET
# ============================================================
class MathDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        inp = self.inputs[idx]
        tgt = self.targets[idx]
        input_ids = encode(inp, MAX_INPUT_LEN, add_bos=False, add_eos=True)
        attention_mask = [1 if t != PAD_ID else 0 for t in input_ids]
        # Padded label positions become -100 so the loss ignores them
        labels = encode(tgt, MAX_TARGET_LEN, add_bos=False, add_eos=True)
        labels = [t if t != PAD_ID else -100 for t in labels]
        # Decoder input: the target shifted right by one, starting with <bos>
        decoder_input = [BOS_ID] + encode(tgt, MAX_TARGET_LEN-1, add_bos=False, add_eos=False)
        decoder_input = decoder_input[:MAX_TARGET_LEN]
        decoder_input += [PAD_ID] * (MAX_TARGET_LEN - len(decoder_input))
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "decoder_input_ids": torch.tensor(decoder_input, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }
train_dataset = MathDataset(train_inputs, train_targets)
val_dataset = MathDataset(val_inputs, val_targets)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=2, pin_memory=True)
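# Optional shape check on one batch (illustrative): input_ids/attention_mask should come
# out as (BATCH_SIZE, MAX_INPUT_LEN) and decoder_input_ids/labels as (BATCH_SIZE, MAX_TARGET_LEN).
# sample_batch = next(iter(train_loader))
# print({k: tuple(v.shape) for k, v in sample_batch.items()})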
# ============================================================
# 5. MODEL (~1M parameters)
# ============================================================
config = T5Config(
    vocab_size=VOCAB_SIZE,
    d_model=128,
    d_ff=256,
    num_heads=4,
    num_layers=3,           # Encoder layers
    num_decoder_layers=3,   # Decoder layers
    d_kv=32,
    dropout_rate=0.1,
    feed_forward_proj="relu",
    is_encoder_decoder=True,
    pad_token_id=PAD_ID,
    eos_token_id=EOS_ID,
    decoder_start_token_id=BOS_ID,
)
model = T5ForConditionalGeneration(config).to(DEVICE)
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE == "cuda"))
total_params = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params/1e6:.2f}M")
print(f"Trainable: {trainable/1e6:.2f}M")
# ============================================================
# 6. OPTIMIZER & SCHEDULER
# ============================================================
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=LR/10)
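# The cosine schedule is stepped once per batch (see the training loop), so the learning
# rate decays from LR = 3e-4 down to eta_min = 3e-5 over len(train_loader) * EPOCHS
# optimizer steps (~39k steps with the settings above).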
# ============================================================
# 7. EVALUATION
# ============================================================
def evaluate(model, loader, n_examples=5):
    model.eval()
    correct = 0
    total = 0
    examples = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            # Greedy generation
            generated = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=MAX_TARGET_LEN,
                eos_token_id=EOS_ID,
                pad_token_id=PAD_ID,
            )
            labels = batch["labels"]
            for i in range(len(input_ids)):
                pred_ids = generated[i].cpu().tolist()
                pred_str = decode(pred_ids)
                lbl = labels[i].tolist()
                lbl = [t for t in lbl if t != -100]
                true_str = decode(lbl)
                # Exact string match: a single wrong digit counts as incorrect
                is_correct = (pred_str == true_str)
                correct += int(is_correct)
                total += 1
                if len(examples) < n_examples:
                    inp_str = decode(input_ids[i].cpu().tolist())
                    examples.append((inp_str, true_str, pred_str, is_correct))
    accuracy = correct / total * 100
    return accuracy, examples
# ============================================================
# 8. TRAINING LOOP
# ============================================================
print("\n" + "="*60)
print("TRAINING START")
print("="*60)
best_accuracy = 0.0
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    steps = 0
    t_start = time.time()
    for batch in train_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        decoder_input_ids = batch["decoder_input_ids"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        optimizer.zero_grad()
        # Mixed precision
        with torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                labels=labels,
            )
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        total_loss += loss.item()
        steps += 1
        if steps % 500 == 0:
            avg_loss = total_loss / steps
            elapsed = time.time() - t_start
            print(f"  Epoch {epoch} | Step {steps}/{len(train_loader)} "
                  f"| Loss: {avg_loss:.4f} | {elapsed:.0f}s")
    avg_loss = total_loss / steps
    # Validation
    print(f"\nEpoch {epoch} done. Evaluating...")
    accuracy, examples = evaluate(model, val_loader)
    print(f"\n{'='*60}")
    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"  Train loss:   {avg_loss:.4f}")
    print(f"  Val accuracy: {accuracy:.2f}%")
    print("\n  Samples:")
    for inp, true, pred, ok in examples:
        status = "✅" if ok else "❌"
        print(f"    {status} '{inp}' → expected: '{true}', got: '{pred}'")
    print("="*60)
    # Save the best model so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save({
            "model_state_dict": model.state_dict(),
            "config": config,
            "char2id": char2id,
            "id2char": id2char,
            "epoch": epoch,
            "accuracy": accuracy,
        }, SAVE_PATH)
        print(f"  💾 New best model saved! ({accuracy:.2f}%)")
print(f"\nTraining done! Best accuracy: {best_accuracy:.2f}%")
# ============================================================
# 9. INFERENCE - TEST
# ============================================================
def predict(model, expression):
    model.eval()
    inp = expression + "="
    input_ids = torch.tensor(
        [encode(inp, MAX_INPUT_LEN, add_bos=False, add_eos=True)],
        dtype=torch.long
    ).to(DEVICE)
    attention_mask = (input_ids != PAD_ID).long()
    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_TARGET_LEN,
            eos_token_id=EOS_ID,
            pad_token_id=PAD_ID,
        )
    return decode(generated[0].cpu().tolist())
print("\n" + "="*60)
print("INFERENCE TEST")
print("="*60)
test_cases = [
    "123+456",
    "999-123",
    "12*34",
    "100/5",
    "500+500",
    "77*8",
]
for expr in test_cases:
    pred = predict(model, expr)
    try:
        # Integer division mirrors how the training data was generated
        true = str(eval(expr.replace("/", "//")))
    except Exception:
        true = "?"
    status = "✅" if pred == true else "❌"
    print(f"  {status} {expr} = {pred} (correct: {true})")