eibeel committed
Commit eb7fcd4
1 Parent(s): 09e20c2

Update LLAMA_Fine-Tuning.py

Files changed (1)
  1. LLAMA_Fine-Tuning.py +60 -2
LLAMA_Fine-Tuning.py CHANGED
@@ -6,7 +6,7 @@ Automatically generated by Colab.
 Original file is located at
     https://colab.research.google.com/drive/1C-kNPOgPiCC9ybxVKhOkWB9ts53APbOb
 
-# Fine-tune Llama 2 in Google Colab
+# Fine-tune Llama 3 in Google Colab
 """
 
 !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
@@ -38,7 +38,7 @@ token_name = "XXXX"
 
 
 # Fine-tuned model name
-new_model = "llama-2-7b-minipython"
+new_model = "llama3_python_TFG"
 
 ################################################################################
 # QLoRA parameters
@@ -194,6 +194,64 @@ trainer.train()
 
 model.save_pretrained(new_model)
 
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
+from datasets import Dataset
+
+def load_text_file(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return [line.strip() for line in f if line.strip()]
+
+train_texts = load_text_file('LLAMA_DatosEntrenamiento.txt')
+val_texts = load_text_file('LLAMA_DatosValidacion.txt')
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, token=token_name)
+
+def tokenize_and_encode(texts):
+    encodings = tokenizer(texts, truncation=True, padding="longest", max_length=512, return_tensors="pt")
+    encodings['labels'] = encodings['input_ids'].clone()  # Duplicate input_ids to use as labels
+    return encodings
+
+train_encodings = tokenize_and_encode(train_texts)
+val_encodings = tokenize_and_encode(val_texts)
+
+train_dataset = Dataset.from_dict({key: val.numpy() for key, val in train_encodings.items()})
+val_dataset = Dataset.from_dict({key: val.numpy() for key, val in val_encodings.items()})
+
+training_arguments = TrainingArguments(
+    output_dir=output_dir,
+    evaluation_strategy="steps",  # Evaluate based on the number of steps
+    eval_steps=500,  # Evaluate every 500 steps
+    num_train_epochs=1,
+    per_device_train_batch_size=2,
+    logging_steps=logging_steps,
+    save_steps=1000,  # Save the model every 1000 steps to reduce how often it is written to disk
+    learning_rate=2e-4,
+    weight_decay=0.001,
+    lr_scheduler_type="cosine",
+    warmup_ratio=0.03,
+    report_to="tensorboard",
+    fp16=False  # Disable mixed precision to simplify training
+)
+
+model = AutoModelForCausalLM.from_pretrained(model_name, token=token_name)
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+trainer = Trainer(
+    model=model,
+    args=training_arguments,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    data_collator=data_collator
+)
+
+trainer.train()
+
+model.save_pretrained(new_model)
+
+model.push_to_hub("eibeel/llama3-python-TFG")
+
+
 # Commented out IPython magic to ensure Python compatibility.
 # %load_ext tensorboard
 # %tensorboard --logdir results/runs
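
After the push above, the fine-tuned checkpoint should be available on the Hub as eibeel/llama3-python-TFG. As a minimal sanity-check sketch (not part of the commit), it could be reloaded for generation roughly as below, reusing the notebook's model_name and token_name variables; the assumption is that the repo is reachable with that token and that the tokenizer is still the base model's (only the model is pushed), and the prompt string is purely an illustrative placeholder.

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "eibeel/llama3-python-TFG"  # repo pushed by the commit above
# Tokenizer comes from the base model, since only the model weights were pushed (assumption)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token_name)
model = AutoModelForCausalLM.from_pretrained(repo_id, token=token_name)

prompt = "Write a Python function that reverses a string."  # illustrative placeholder prompt
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))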