"""Fine-tune a causal language model on a local JSON text dataset.

The script loads a pretrained checkpoint and tokenizer, tokenizes the
"text" field of ``medical_dataset.json`` to fixed-length sequences, trains
with the Hugging Face ``Trainer``, and saves the resulting model and
tokenizer to ``model_output_path``.
"""

import os

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_output_path = "./model/medical_llama_3b"
os.makedirs(model_output_path, exist_ok=True)

# NOTE(review): the repository name suggests an ONNX INT4 export, while
# AutoModelForCausalLM loads PyTorch/safetensors checkpoints -- confirm this
# repository actually provides trainable weights before running.
model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # padding="max_length" below raises when no pad token is defined;
    # reuse the end-of-sequence token as the padding token in that case.
    tokenizer.pad_token = tokenizer.eos_token

# Load full-precision master weights: fp16=True in TrainingArguments enables
# mixed precision with a gradient scaler, which cannot unscale gradients of
# parameters that are themselves stored in float16.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

dataset = load_dataset("json", data_files="medical_dataset.json")


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: Batch mapping produced by ``Dataset.map(batched=True)``;
            its ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output, truncated and padded to 512 tokens per example.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Drop the original columns so only the tokenizer outputs reach the Trainer.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

training_args = TrainingArguments(
    # Keep intermediate checkpoints under the final output directory.
    output_dir=os.path.join(model_output_path, "checkpoints"),
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    save_steps=500,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # mlm=False selects causal-LM collation (labels derived from input_ids).
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainer.train()

model.save_pretrained(model_output_path)
tokenizer.save_pretrained(model_output_path)
print(f"Model and tokenizer saved to: {model_output_path}")