import os

import torch
import trl
from transformers import (
    AutoTokenizer,
    LlamaConfig,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    TrainingArguments,
    PreTrainedTokenizerFast,
    AdamW,
    get_cosine_schedule_with_warmup,
)
from datasets import load_dataset, Dataset
from tokenizers import ByteLevelBPETokenizer

# Training configuration
BATCH_SIZE = 192
EPOCHS = 2
LEARNING_RATE = 2e-4
FACTOR = 22 * 30
MAX_SEQ_LENGTH = 128
VOCAB_SIZE = 32000
INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
INSTRUCT_DATASET = "nroggendorff/elephant"
OUTPUT_REPO = "nroggendorff/smallama"
INSTRUCT_FINETUNE_BOOL = False
FP16 = False
WARMUP_STEPS = 0
DECAY = 0
GRADIENT_ACCUMULATION_STEPS = 1
PUSH_TO_HUB = True


def load_data():
    # Stream the corpus and materialize a fixed-size slice as an in-memory Dataset.
    if not INSTRUCT_FINETUNE_BOOL:
        dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
        dataset = Dataset.from_generator(lambda: dataset.take(int(9e+5)))
    else:
        dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
        dataset = Dataset.from_generator(lambda: dataset.take(int(5e+5)))
    return dataset


def create_tokenizer(training_corpus):
    # Train a byte-level BPE tokenizer from scratch on the streamed corpus.
    tokenizer = ByteLevelBPETokenizer()
    # Standard special tokens, matching the names assigned in configure_tokenizer().
    special_tokens = ["<s>", "</s>", "<unk>", "<pad>", "<mask>"]
    if INSTRUCT_FINETUNE_BOOL:
        special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])

    tokenizer.train_from_iterator(
        training_corpus,
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=special_tokens
    )

    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
    return fast_tokenizer


def load_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_REPO)
    return tokenizer


def get_training_corpus(dataset):
    # Yield the corpus in chunks of 1000 texts for tokenizer training.
    texts = []
    #for field in ['pretrain', 'instruct']:
    #    texts.extend(dataset[field]['text'])
    texts.extend(dataset['text'])

    for i in range(0, len(texts), 1000):
        yield texts[i : i + 1000]


def format_prompts(examples, tokenizer, isinst):
    # For instruct data, rebuild <|user|>/<|bot|> turns and render them with the chat template;
    # for pretraining data, simply wrap each text in BOS/EOS.
    texts = []
    for text in examples['text']:
        if isinst:
            conversation = []
            parts = text.split('<|end|>')
            for i in range(0, len(parts) - 1, 2):
                prompt = parts[i].replace("<|user|>", "")
                response = parts[i + 1].replace("<|bot|>", "")
                conversation.append({"role": "user", "content": prompt})
                conversation.append({"role": "assistant", "content": response})
            formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
            texts.append(formatted_conversation)
        else:
            texts.append(tokenizer.bos_token + text + tokenizer.eos_token)
    return {"text": texts}


def create_model(tokenizer):
    # Build a small Llama architecture whose width and depth are derived from FACTOR.
    config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=FACTOR,
        intermediate_size=FACTOR * 4,
        num_hidden_layers=max(1, FACTOR // 32),
        num_attention_heads=max(1, FACTOR // 64),
        max_position_embeddings=MAX_SEQ_LENGTH,
        rms_norm_eps=1e-6,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        tie_word_embeddings=False,
    )
    model = LlamaForCausalLM(config)
    return model


def load_model():
    model = AutoModelForCausalLM.from_pretrained(OUTPUT_REPO)
    return model


def configure_tokenizer(tokenizer):
    special_tokens = {
        "bos_token": "<s>",
        "eos_token": "</s>",
        "unk_token": "<unk>",
        "pad_token": "<pad>",
        "mask_token": "<mask>"
    }
    if INSTRUCT_FINETUNE_BOOL:
        special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
    tokenizer.add_special_tokens(special_tokens)

    if INSTRUCT_FINETUNE_BOOL:
        tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
        tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")

        chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
        tokenizer.chat_template = chat_template


def train_model(model, tokenizer, dataset, push, isinst):
    args = TrainingArguments(
        output_dir="model",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        optim="adamw_torch",
        warmup_steps=WARMUP_STEPS,
        weight_decay=DECAY,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        fp16=FP16,
        save_steps=int(1e+10),  # effectively disables intermediate checkpoints
        logging_steps=10
    )

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
    )

    dataset = dataset.map(
        lambda examples: format_prompts(examples, tokenizer, isinst),
        batched=True,
        remove_columns=dataset.column_names
    )

    trainer = trl.SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=args,
        train_dataset=dataset,
        dataset_text_field='text',
        max_seq_length=MAX_SEQ_LENGTH,
        optimizers=(optimizer, scheduler)
    )

    train = trainer.train()

    trained_model = trainer.model
    trained_tokenizer = trainer.tokenizer

    if push:
        if INSTRUCT_FINETUNE_BOOL:
            repo_id = OUTPUT_REPO + "-it"
        else:
            repo_id = OUTPUT_REPO
        # Use the final training loss as the commit message.
        msg = str(train.training_loss)
        trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
        trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
    else:
        trained_model.save_pretrained("model")
        trained_tokenizer.save_pretrained("tokenizer")


def main(push_to_hub=True, is_inst_finetune=False):
    dataset = load_data()

    # Pretraining trains a fresh tokenizer; instruct fine-tuning reuses the published one.
    if not is_inst_finetune:
        training_corpus = get_training_corpus(dataset)
        tokenizer = create_tokenizer(training_corpus)
    else:
        tokenizer = load_tokenizer()

    configure_tokenizer(tokenizer)

    if is_inst_finetune:
        model = load_model()
        model.resize_token_embeddings(len(tokenizer))
        train_model(model, tokenizer, dataset, push_to_hub, True)
    else:
        model = create_model(tokenizer)
        train_model(model, tokenizer, dataset, push_to_hub, False)


if __name__ == "__main__":
    main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
    raise RuntimeError("The script is finished.")