Spaces:
Sleeping
Sleeping
File size: 2,266 Bytes
c610360 10f17ec b2dcf42 362af59 c6b60a2 10f17ec b2dcf42 10f17ec b2dcf42 c610360 10f17ec c610360 b2dcf42 cf51c86 d9f2bb5 c610360 cf51c86 b2dcf42 c610360 b2dcf42 10f17ec c610360 10f17ec b2dcf42 c610360 10f17ec c610360 c6b60a2 cf51c86 10f17ec c610360 10f17ec b2dcf42 10f17ec c6b60a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
# Read training.txt through the generic "text" dataset loader.
dataset = load_dataset(path="text", data_files="training.txt")

# Build the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as
# the padding token, so that padding="max_length" during tokenization has a
# token to pad with.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Define a function to tokenize the dataset and prepare labels
def tokenize_function(examples):
    """Tokenize text and attach causal language-modeling labels.

    Args:
        examples: A mapping whose ``"text"`` entry is handed to the
            module-level ``tokenizer``. With ``dataset.map(..., batched=True)``
            this is a list of strings; a single string is also accepted.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask`` padded or
        truncated to 256 tokens) with an added ``labels`` entry. Labels equal
        ``input_ids`` at real-token positions and ``-100`` at padded positions.
    """
    # -100 is the label value the Transformers language-modeling loss is
    # documented to ignore.
    ignore_index = -100

    def mask_padding(ids, mask):
        # Keep the token id where the attention mask is 1, ignore it where 0.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,  # Reduced from 512 to 256
    )

    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs["attention_mask"]

    # The pad token is the EOS token, so padding cannot be told apart from a
    # genuine EOS by token id; the attention mask is the reliable signal.
    # Copying input_ids verbatim would make the loss score every padded
    # position as an EOS target.
    # NOTE: a row that is entirely padding (e.g. a blank line) ends up with no
    # supervised positions; consider filtering such rows before training.
    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one list of ids per example.
        labels = [
            mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example (flat list of ids) or an empty batch.
        labels = mask_padding(input_ids, attention_mask)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
# Run the tokenizer over the dataset in batches, then drop the raw 'text'
# column, which is not needed once the token ids exist.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Return the three model-input columns as PyTorch tensors.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Pretrained GPT-2 with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Training configuration: 2 sequences per device per step, accumulated over
# 16 steps; checkpoints are written to ./output every 10,000 steps, with a
# limit of 2 kept.
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    save_total_limit=2,
    save_steps=10_000,
)

# Wire the model, the configuration and the tokenized 'train' split together.
trainer = Trainer(
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    model=model,
)

# Fine-tune.
trainer.train()

# Write the fine-tuned model and its tokenizer side by side in one directory.
# NOTE: this is an absolute path inside one Windows user profile; it has to be
# adjusted before the script is run on any other machine.
_FINE_TUNED_DIR = 'C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model'
model.save_pretrained(_FINE_TUNED_DIR)
tokenizer.save_pretrained(_FINE_TUNED_DIR)
|