robkaandorp committed
Commit 3ebfa66
1 Parent(s): 9a639d2

Finish training script

Files changed (3)
  1. .gitattributes +1 -0
  2. test_queries.py +69 -0
  3. train_dataset.py +98 -3
.gitattributes CHANGED
@@ -24,6 +24,7 @@
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+results/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
test_queries.py ADDED
@@ -0,0 +1,69 @@
+import time
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
+
+if torch.cuda.is_available():
+    print("Cuda is available")
+
+# base_model_id = "microsoft/phi-2"
+# base_model_id = "abacaj/phi-2-super"
+base_model_id = "./results"
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+    print("pad_token was missing and has been set to eos_token")
+
+tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ bos_token + 'Instruct: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'Output: ' + message['content'] + eos_token }}{% endif %}{% endfor %}"
+
+model = AutoModelForCausalLM.from_pretrained(base_model_id, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16).to('cuda')
+print(model)
+
+meta_messages = [
+    [
+        { "role": "user", "content": "You are an AI assistant that will be answering phone calls from participants of the Nowhere event, a regional Burning Man event in Spain. The phone the participant is using will be on-site at or near the Oasis Playground barrio. Your answer will be short and to the point. Conversation with the participant will be solely through voice prompts, with the use of speech-to-text and text-to-speech software. You as the assistent will provide in your answers the correct hippie-like vibe for this type of event." },
+        { "role": "assistant", "content": "Ofcourse, hippie, I will try my best for you!" },
+    ],
+    [ { "role": "user", "content": "Hello, who are you?" } ],
+    [ { "role": "user", "content": "Where are we?" } ],
+    [ { "role": "user", "content": "What can I do here?" } ],
+    [ { "role": "user", "content": "It is so hot I am getting an headache!" } ],
+    [ { "role": "user", "content": "How do I use the toilets?" } ],
+    [ { "role": "user", "content": "What is a Nobody?" } ],
+]
+
+with torch.no_grad():
+    for messages in meta_messages:
+        for msg in messages:
+            print(f"{msg['role']}: {msg['content']}")
+
+        add_generation_prompt = True
+        if len(messages) == 2:
+            add_generation_prompt = False
+
+        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, return_tensors="pt").to(model.device)
+        input_ids_cutoff = inputs.size(dim=1)
+
+        start_time = time.time()
+
+        generated_ids = model.generate(
+            input_ids=inputs,
+            use_cache=True,
+            max_new_tokens=512,
+            temperature=0.2,
+            top_p=0.95,
+            do_sample=True,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+        )
+
+        duration = float(time.time() - start_time)
+
+        generated = generated_ids[0][input_ids_cutoff:]
+
+        completion = tokenizer.decode(
+            generated,
+            skip_special_tokens=True,
+        )
+
+        print(f"assistant: {completion} | {len(generated)} tokens, {round(len(generated)/duration, 3)} tokens/sec")
train_dataset.py CHANGED
@@ -3,6 +3,12 @@ from langchain_community.embeddings.sentence_transformer import (
 )
 from langchain_community.vectorstores import Chroma
 
+import time
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
+from trl import SFTTrainer
+from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
+
 # create the open-source embedding function
 embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 
@@ -11,7 +17,96 @@ db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_d
 
 print("There are", db._collection.count(), " docs in the collection")
 
-docs = db._collection.peek(10)
+docs = db._collection.peek(db._collection.count())
+dataset = docs['documents']
+
+if torch.cuda.is_available():
+    # torch.set_default_device("cuda")
+    print("Cuda is available")
+
+base_model_id = "microsoft/phi-2"
+# base_model_id = "abacaj/phi-2-super"
+# base_model_id = "./results"
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+    print("pad_token was missing and has been set to eos_token")
+
+# Configuration to load model in 4-bit quantized
+bnb_config = BitsAndBytesConfig(load_in_4bit=True,
+                                bnb_4bit_quant_type='nf4',
+                                #bnb_4bit_compute_dtype='float16',
+                                bnb_4bit_compute_dtype=torch.bfloat16,
+                                bnb_4bit_use_double_quant=False)
+
+model = AutoModelForCausalLM.from_pretrained(base_model_id, attn_implementation="flash_attention_2", quantization_config=bnb_config, torch_dtype="auto")
+print(model)
+
+# Gradient checkpointing to save memory
+model.gradient_checkpointing_enable()
+
+# Freeze base model layers and cast layernorm in fp32
+model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+
+peft_config = LoraConfig(
+    r=64,
+    lora_alpha=64,
+    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc2", "fc1"],
+    bias="none",
+    lora_dropout=0.05,
+    task_type="CAUSAL_LM",
+)
+
+training_args = TrainingArguments(
+    output_dir='./results',                     # Output directory for checkpoints and predictions
+    overwrite_output_dir=True,                  # Overwrite the content of the output directory
+    per_device_train_batch_size=2,              # Batch size for training
+    per_device_eval_batch_size=2,               # Batch size for evaluation
+    gradient_accumulation_steps=5,              # Number of steps before optimizing
+    gradient_checkpointing=True,                # Enable gradient checkpointing
+    gradient_checkpointing_kwargs={"use_reentrant": False},
+    warmup_steps=10,                            # Number of warmup steps
+    #max_steps=1000,                            # Total number of training steps
+    num_train_epochs=20,                        # Number of training epochs
+    learning_rate=5e-5,                         # Learning rate
+    weight_decay=0.01,                          # Weight decay
+    optim="paged_adamw_8bit",                   # Keep the optimizer state and quantize it
+    bf16=True,                                  # Use mixed precision training
+    # For logging and saving
+    logging_dir='./logs',
+    logging_strategy="epoch",
+    logging_steps=10,
+    save_strategy="epoch",
+    save_steps=10,
+    save_total_limit=2,                         # Limit the total number of checkpoints
+    evaluation_strategy="epoch",
+    eval_steps=10,
+    load_best_model_at_end=True,                # Load the best model at the end of training
+    lr_scheduler_type="linear",
+)
+
+def formatting_func(doc):
+    return doc
+
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=dataset,
+    eval_dataset=dataset,
+    peft_config=peft_config,
+    args=training_args,
+    max_seq_length=1024,
+    packing=True,
+    formatting_func=formatting_func
+)
+
+model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+
+start_time = time.time()  # Record the start time
+trainer.train()
+end_time = time.time()  # Record the end time
+
+training_time = end_time - start_time  # Calculate total training time
 
-for doc in docs['documents']:
-    print(doc)
+trainer.save_model("./results")
+print(f"Training completed in {training_time} seconds.")