from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Load the pretrained model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Build a dataset from the writer's texts
def load_dataset(file_path, tokenizer, block_size=128):
    # TextDataset splits the raw text file into fixed-size blocks of token ids
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset
file_path = "path/to/the/writers/text.txt"
dataset = load_dataset(file_path, tokenizer)
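# Hedged sketch: TextDataset is deprecated in recent transformers releases in
# favor of the Hugging Face `datasets` library. The helper below (the name
# `load_dataset_hf` is mine, not part of the original script) shows an
# equivalent setup; it is illustrative and not called anywhere below. Note
# that GPT-2 ships without a pad token, so variable-length examples would
# additionally need `tokenizer.pad_token = tokenizer.eos_token` before
# collation.
def load_dataset_hf(file_path, tokenizer, block_size=128):
    from datasets import load_dataset as hf_load_dataset  # requires `datasets`

    raw = hf_load_dataset("text", data_files=file_path, split="train")
    # Tokenize each line, truncating to the same block size used above
    return raw.map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=block_size),
        batched=True,
        remove_columns=["text"],
    )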
# Create a data collator for the dataset
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # mlm=False -> causal language modeling objective (GPT-2 style), not masked LM
)
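# Optional sanity check (illustrative, commented out so the script's behavior
# is unchanged): with mlm=False the collator copies input_ids into labels,
# which is what the causal-LM loss expects.
# batch = data_collator([dataset[i] for i in range(2)])
# print(batch["input_ids"].shape, batch["labels"].shape)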
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)
# Create the Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
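
# Persist the fine-tuned weights and tokenizer so they can be reloaded later
# (the output directory name here is illustrative, not from the original):
trainer.save_model("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

# Quick generation check with the fine-tuned model (the prompt is illustrative):
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token by default
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))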