philschmid HF staff committed on
Commit
48c7a89
·
verified ·
1 Parent(s): 4070fa7

Upload 2 files

Browse files
Files changed (2) hide show
  1. inference.py +37 -0
  2. trl-lora.py +76 -0
inference.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Repository id used for both the tokenizer and the PEFT adapter model.
peft_model_id = "philschmid/gemma-7b-dolly-chatml"

# Build the tokenizer and the adapter-backed causal LM, then wrap both in a
# text-generation pipeline.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling settings shared by every request below.
generation_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    pad_token_id=pipe.tokenizer.pad_token_id,
    eos_token_id=pipe.tokenizer.eos_token_id,
)

# Each entry is sent as a single-turn conversation with one "user" message.
user_questions = (
    "What is the capital of Germany? Explain why thats the case and if it was different in the past?",
    "In a town, 60% of the population are adults. Among the adults, 30% have a pet dog and 40% have a pet cat. What percentage of the total population has a pet dog?",
)

# Run inference: render the chat template to a prompt string, generate, and
# print the text of the first returned sequence.
for question in user_questions:
    messages = [{"role": "user", "content": question}]
    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    outputs = pipe(prompt, **generation_kwargs)
    print(outputs[0]["generated_text"])


# To list the installed versions of the relevant packages:
# pip3 list | grep -e transformers -e peft -e torch -e trl -e accelerate
trl-lora.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Training split of the instruction dataset, referenced by its dataset id.
dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")

# Identifiers for the base model and for the (separate) ChatML tokenizer.
model_id = "google/gemma-7b"
tokenizer_id = "philschmid/gemma-tokenizer-chatml"

# Max sequence length for the model and for packing of the dataset.
max_seq_length = 1512

# Base model in bfloat16 with FlashAttention-2, placed automatically on the
# available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right'  # right-side padding, to prevent warnings

# LoRA settings (per the original author: based on the QLoRA paper and
# Sebastian Raschka's experiments).
peft_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir="gemma-7b-dolly-chatml",  # output directory, also the repository id
    save_strategy="epoch",               # checkpoint once per epoch
    logging_steps=10,                    # emit logs every 10 steps
    report_to="tensorboard",             # metrics go to tensorboard
    push_to_hub=True,                    # upload the model to the hub
    # --- schedule / batch ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # --- memory / precision ---
    gradient_checkpointing=True,         # save memory via gradient checkpointing
    optim="adamw_torch_fused",           # fused AdamW optimizer
    bf16=True,                           # bfloat16 precision
    tf32=True,                           # tf32 precision
    # --- PEFT-specific hyperparameters (values follow the QLoRA paper) ---
    learning_rate=2e-4,
    max_grad_norm=0.3,
    # NOTE(review): combined with the "constant" scheduler below, this warmup
    # ratio may not actually be applied — confirm against the transformers
    # scheduler docs ("constant_with_warmup" is the variant with a warmup phase).
    warmup_ratio=0.03,
    lr_scheduler_type="constant",        # constant learning rate schedule
)

# Tokenization options forwarded to the trainer's dataset preparation.
sft_dataset_kwargs = {
    "add_special_tokens": True,    # make sure <bos> and <eos> tokens are added
    "append_concat_token": False,  # no additional tokens inserted when packing
}

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    # --- PEFT / packing specific arguments ---
    peft_config=peft_config,
    packing=True,
    max_seq_length=max_seq_length,
    dataset_kwargs=sft_dataset_kwargs,
)

# Start training; with the arguments above, checkpoints are written to the
# output directory every epoch and push_to_hub is enabled.
trainer.train()

# Save the final model.
trainer.save_model()