from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load the text dataset from the specified file
dataset = load_dataset("text", data_files="training.txt")


# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no dedicated padding token, so reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize the dataset and prepare labels
def tokenize_function(examples):
    # Tokenize the text into input_ids and attention_mask, padding/truncating to a fixed length
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256  # Reduced from 512 to 256
    )
    # Prepare labels: for causal language modelling the labels are the input_ids themselves
    # (the model shifts them internally when computing the loss). Note that because padding
    # reuses the eos token, padded positions are also included in the loss with this setup.
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

# Tokenize the entire dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Remove the 'text' column as it's no longer needed after tokenization
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# Set the format of the dataset to PyTorch tensors
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training arguments with a reduced batch size and gradient accumulation
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=2,  # Reduced for quicker training runs
    per_device_train_batch_size=2,  # Reduced from 4 to 2
    gradient_accumulation_steps=16,  # Added to compensate for smaller batch size
    save_steps=10_000,
    save_total_limit=2,
)
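# With per_device_train_batch_size=2 and gradient_accumulation_steps=16, the effective
# batch size is 2 x 16 = 32 sequences per device per optimizer step.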

# Initialize the Trainer with the training dataset including labels
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

# Start the training process
trainer.train()

# Save the fine-tuned model and tokenizer so they can be reloaded later with from_pretrained()
model.save_pretrained('C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model')
tokenizer.save_pretrained('C:\\Users\\Mike Corrigan\\Documents\\DMD Year 4 Sem 1\\FYP INFO\\fyp hugging face\\fypmc20277423\\output\\fine_tuned_gpt2_model')
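
# Optional: a minimal sanity-check sketch that generates a short sample from the
# fine-tuned model while it is still in memory. The prompt below is purely
# illustrative (an assumed example, not taken from the training data).
sample_prompt = "Once upon a time"
sample_inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
sample_output = model.generate(
    **sample_inputs,
    max_new_tokens=50,                     # generate up to 50 new tokens after the prompt
    do_sample=True,                        # sample rather than greedy decoding
    top_p=0.95,                            # nucleus sampling
    pad_token_id=tokenizer.eos_token_id,   # avoid the missing-pad-token warning
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))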