eibeel committed on
Commit 466026d
1 Parent(s): ad2b115

Update LLAMA_Fine-Tuning.py

Files changed (1)
  1. LLAMA_Fine-Tuning.py +3 -93
LLAMA_Fine-Tuning.py CHANGED
@@ -90,7 +90,7 @@ per_device_train_batch_size = 4
 per_device_eval_batch_size = 4
 
 # Number of update steps to accumulate the gradients for
- gradient_accumulation_steps = 1
+ gradient_accumulation_steps = 2
 
 # Enable gradient checkpointing
 gradient_checkpointing = True
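
Note on the change above: with the Hugging Face Trainer, the number of samples that contribute to each optimizer update is per_device_train_batch_size x gradient_accumulation_steps x number of devices, so raising accumulation from 1 to 2 doubles the effective batch size without raising per-step memory. A minimal sketch of that arithmetic (the single-device count is an assumption, not something the script pins down):

# Sketch only: effective batch size under gradient accumulation.
per_device_train_batch_size = 4   # value set earlier in the script
gradient_accumulation_steps = 2   # value introduced by this commit
num_devices = 1                   # assumption; the script does not fix the device count

effective_batch_size = (
    per_device_train_batch_size * gradient_accumulation_steps * num_devices
)
print(effective_batch_size)  # -> 8 samples per optimizer update
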
@@ -167,62 +167,7 @@ training_arguments = TrainingArguments(
     evaluation_strategy="steps",  # Evaluate based on the number of steps
     eval_steps=500,  # Evaluate every 500 steps
     num_train_epochs=1,
-     per_device_train_batch_size=2,
-     logging_steps=logging_steps,
-     save_steps=1000,  # Save the model every 1000 steps to reduce the frequency of disk writes
-     learning_rate=2e-4,
-     weight_decay=0.001,
-     lr_scheduler_type="cosine",
-     warmup_ratio=0.03,
-     report_to="tensorboard",
-     fp16=False  # Disable mixed precision to simplify training
- )
- 
- model = AutoModelForCausalLM.from_pretrained(model_name, token=token_name)
- 
- data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
- 
- trainer = Trainer(
-     model=model,
-     args=training_arguments,
-     train_dataset=train_dataset,
-     eval_dataset=val_dataset,
-     data_collator=data_collator
- )
- 
- trainer.train()
- 
- model.save_pretrained(new_model)
- 
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
- from datasets import Dataset
- 
- def load_text_file(file_path):
-     with open(file_path, 'r', encoding='utf-8') as f:
-         return [line.strip() for line in f if line.strip()]
- 
- train_texts = load_text_file('LLAMA_DatosEntrenamiento.txt')
- val_texts = load_text_file('LLAMA_DatosValidacion.txt')
- 
- tokenizer = AutoTokenizer.from_pretrained(model_name, token=token_name)
- 
- def tokenize_and_encode(texts):
-     encodings = tokenizer(texts, truncation=True, padding="longest", max_length=512, return_tensors="pt")
-     encodings['labels'] = encodings['input_ids'].clone()  # Clone input_ids to use as labels
-     return encodings
- 
- train_encodings = tokenize_and_encode(train_texts)
- val_encodings = tokenize_and_encode(val_texts)
- 
- train_dataset = Dataset.from_dict({key: val.numpy() for key, val in train_encodings.items()})
- val_dataset = Dataset.from_dict({key: val.numpy() for key, val in val_encodings.items()})
- 
- training_arguments = TrainingArguments(
-     output_dir=output_dir,
-     evaluation_strategy="steps",  # Evaluate based on the number of steps
-     eval_steps=500,  # Evaluate every 500 steps
-     num_train_epochs=1,
-     per_device_train_batch_size=2,
+     per_device_train_batch_size=4,
     logging_steps=logging_steps,
     save_steps=1000,  # Save the model every 1000 steps to reduce the frequency of disk writes
     learning_rate=2e-4,
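
Most of this hunk deletes a verbatim repeat of the data-preparation and Trainer setup that already appears earlier in the script; only the batch size change (2 to 4) survives. For context, a minimal sketch of the labelling pattern that setup relies on, where labels are a clone of input_ids so the causal-LM loss is computed against the inputs themselves (the small public checkpoint below is a stand-in for the script's model_name):

from transformers import AutoTokenizer
from datasets import Dataset

# Stand-in checkpoint; the script loads model_name with an auth token instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

texts = ["print('hola')", "def suma(a, b):\n    return a + b"]
encodings = tokenizer(texts, truncation=True, padding="longest", max_length=512, return_tensors="pt")
encodings["labels"] = encodings["input_ids"].clone()  # the model shifts labels internally for next-token loss

dataset = Dataset.from_dict({key: val.numpy() for key, val in encodings.items()})
print(dataset)  # features: input_ids, attention_mask, labels

One caveat worth noting: with this pattern, padded positions also count toward the loss unless they are masked to -100, which the setup shown here does not do.
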
@@ -254,39 +199,4 @@ model.push_to_hub("eibeel/llama3-python-TFG")
 
 # Commented out IPython magic to ensure Python compatibility.
 # %load_ext tensorboard
- # %tensorboard --logdir results/runs
- 
- train_texts[2]
- 
- # Ignore warnings
- logging.set_verbosity(logging.CRITICAL)
- 
- # Run text generation pipeline with our new model
- prompt = "Como puedo encontrar trabajo de ingeniero?"
- pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
- result = pipe(f"<s>[INST] {prompt} [/INST]")
- print(result[0]['generated_text'])
- 
- # Empty VRAM
- del model
- del pipe
- del trainer
- import gc
- gc.collect()
- gc.collect()
- 
- # Reload model in FP16 and merge it with LoRA weights
- base_model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     low_cpu_mem_usage=True,
-     return_dict=True,
-     torch_dtype=torch.float16,
-     device_map=device_map,
- )
- model = PeftModel.from_pretrained(base_model, new_model)
- model = model.merge_and_unload()
- 
- # Reload tokenizer to save it
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- tokenizer.pad_token = tokenizer.eos_token
- tokenizer.padding_side = "right"
+ # %tensorboard --logdir results/runs
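
The tail removed in the last hunk covered test generation, VRAM cleanup, and reloading the base model in FP16 to merge the LoRA adapter. For reference, a minimal sketch of that merge step with peft (the checkpoint and adapter names are placeholders for the script's model_name and new_model):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_name = "meta-llama/Meta-Llama-3-8B"  # placeholder for model_name
adapter_dir = "llama3-python-TFG"         # placeholder for new_model (the saved LoRA adapter)

# Reload the base weights in half precision, attach the adapter, then fold it in.
base_model = AutoModelForCausalLM.from_pretrained(
    base_name,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, adapter_dir)
model = model.merge_and_unload()  # plain transformers model with the LoRA weights merged in

# Reload the tokenizer so both pieces can be saved or pushed together.
tokenizer = AutoTokenizer.from_pretrained(base_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
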