import os import sys import json from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments from trl import SFTTrainer os.environ["WANDB_MODE"] = "disabled" DRY_RUN = "--dry-run" in sys.argv MODEL_ID = "Salesforce/codegen-350M-multi" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForCausalLM.from_pretrained(MODEL_ID) # 1) load your JSONL ds = load_dataset("json", data_files="data/train_dataset.jsonl", split="train") # 2) tokenize & format def tokenize(example): prompt = f"DIFF:\n{example['diff']}\n\nOUTPUT FORMAT:\n" output = example.get("comments", example.get("comment", [])) text = prompt + tokenizer.decode(tokenizer.encode(json.dumps(output, ensure_ascii=False), add_special_tokens=False)) tokens = tokenizer(text, truncation=True, max_length=512) tokens["labels"] = tokens["input_ids"].copy() return tokens # In dry‐run, only map a couple examples if DRY_RUN: sample = ds.select(range(2)) print("Sample examples before tokenization:") for ex in sample: print(ex) tokenized = sample.map(tokenize, remove_columns=sample.column_names) print("\nAfter tokenization, examples look like:") for ex in tokenized: print({k: ex[k] for k in ["input_ids","labels"]}) else: ds = ds.map(tokenize, remove_columns=ds.column_names) # 3) configure args training_args = TrainingArguments( output_dir = "sft-model", # where to write checkpoints overwrite_output_dir = True, do_train = True, # we’re doing a train run num_train_epochs = 3, # full passes over the data per_device_train_batch_size = 2, gradient_accumulation_steps = 8, learning_rate = 2e-5, max_steps = 500, # total optimization steps (overrides epochs) logging_strategy = "steps", logging_steps = 50, save_strategy = "steps", save_steps = 200, fp16 = False, # no half‐precision on CPU report_to = [], # disable WandB/others ) # 4) instantiate trainer trainer = SFTTrainer( model=model, args=training_args, train_dataset=(tokenized if DRY_RUN else ds), ) if DRY_RUN: print(f"\n✅ DRY-RUN: Trainer instantiated:\n – model: {type(model)}\n – tokenizer: {type(tokenizer)}\n – train_dataset size: {len(tokenized if DRY_RUN else ds)}") print(f" – SFTTrainingArguments: {training_args}") if not DRY_RUN: # only run the real training if you didn’t pass --dry-run trainer.train() trainer.save_model("sft-model")