philschmid committed 48c7a89
1 parent: 4070fa7

Upload 2 files

Files changed (2):
  1. inference.py +37 -0
  2. trl-lora.py +76 -0
inference.py ADDED
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

peft_model_id = "philschmid/gemma-7b-dolly-chatml"

# Load model with PEFT adapter
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model = AutoPeftModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", torch_dtype=torch.float16)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# run inference
messages = [
    {
        "role": "user",
        "content": "What is the capital of Germany? Explain why that's the case and whether it was different in the past."
    }
]

prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, pad_token_id=pipe.tokenizer.pad_token_id, eos_token_id=pipe.tokenizer.eos_token_id)
print(outputs[0]["generated_text"])

# run inference
messages = [
    {
        "role": "user",
        "content": "In a town, 60% of the population are adults. Among the adults, 30% have a pet dog and 40% have a pet cat. What percentage of the total population has a pet dog?"
    }
]

prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, pad_token_id=pipe.tokenizer.pad_token_id, eos_token_id=pipe.tokenizer.eos_token_id)
print(outputs[0]["generated_text"])


# pip3 list | grep -e transformers -e peft -e torch -e trl -e accelerate
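
If a standalone checkpoint without the PEFT adapter is preferred for deployment, the LoRA weights can be merged into the base model. A minimal sketch, assuming the same philschmid/gemma-7b-dolly-chatml adapter, enough memory to hold the merged fp16 weights, and an illustrative output directory name:

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

peft_model_id = "philschmid/gemma-7b-dolly-chatml"

# load the adapter on top of the base model, then merge the LoRA weights into it
model = AutoPeftModelForCausalLM.from_pretrained(peft_model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True)
merged_model = model.merge_and_unload()

# save the merged model and tokenizer so they can be loaded without peft
merged_model.save_pretrained("gemma-7b-dolly-chatml-merged", safe_serialization=True)
AutoTokenizer.from_pretrained(peft_model_id).save_pretrained("gemma-7b-dolly-chatml-merged")
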
trl-lora.py ADDED
from datasets import load_dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from peft import LoraConfig
import torch

# Load the chat-formatted dataset from the Hugging Face Hub
dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")

# Hugging Face model id
model_id = "google/gemma-7b"
tokenizer_id = "philschmid/gemma-tokenizer-chatml"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right' # to prevent warnings

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

args = TrainingArguments(
    output_dir="gemma-7b-dolly-chatml",  # directory to save to and repository id
    num_train_epochs=3,                  # number of training epochs
    per_device_train_batch_size=8,       # batch size per device during training
    gradient_checkpointing=True,         # use gradient checkpointing to save memory
    optim="adamw_torch_fused",           # use fused adamw optimizer
    logging_steps=10,                    # log every 10 steps
    save_strategy="epoch",               # save checkpoint every epoch
    bf16=True,                           # use bfloat16 precision
    tf32=True,                           # use tf32 precision
    ### peft specific arguments ###
    learning_rate=2e-4,                  # learning rate, based on QLoRA paper
    max_grad_norm=0.3,                   # max gradient norm, based on QLoRA paper
    warmup_ratio=0.03,                   # warmup ratio, based on QLoRA paper
    lr_scheduler_type="constant",        # use constant learning rate scheduler
    report_to="tensorboard",             # report metrics to tensorboard
    push_to_hub=True,                    # push model to hub
)

max_seq_length = 1512 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    ### peft specific arguments ###
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": True,   # make sure we add <bos> and <eos> tokens
        "append_concat_token": False, # make sure to not add additional tokens when packing
    },
)

# start training; the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model()
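
Before launching a multi-hour run, it can help to render one example through the ChatML template to confirm the formatting the trainer will see. A minimal sketch, assuming the dataset stores conversations in an OpenAI-style "messages" column (an assumption based on the repository name):

from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")
tokenizer = AutoTokenizer.from_pretrained("philschmid/gemma-tokenizer-chatml")

# render the first conversation with the tokenizer's chat template and inspect it
print(tokenizer.apply_chat_template(dataset[0]["messages"], tokenize=False))
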