Corrigan123 committed on
Commit b2dcf42
1 Parent(s): cdecb4d

Update app.py with optimized training settings

Files changed (1)
  1. app.py +20 -9
app.py CHANGED
@@ -2,32 +2,40 @@ from transformers import (GPT2Tokenizer, GPT2LMHeadModel, Trainer,
                          TrainingArguments, DataCollatorWithPadding)
from datasets import load_dataset

- # Load the GPT-2 model
- model = GPT2LMHeadModel.from_pretrained("gpt2")
+ # Load the text dataset from the specified file
+ dataset = load_dataset("text", data_files="training.txt")

- # Initialize the GPT-2 tokenizer with a reduced max_length
+ # Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
- max_length = 256 # Reduced max_length
+
+ # Adjusted max_length for potentially reduced memory usage
+ max_length = 256

def tokenize_function(examples):
-     return tokenizer(
+     # Tokenize the text to input_ids, attention_mask, and ensure labels are set
+     tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
+     # Prepare labels: labels are the same as input_ids for language modeling
+     tokenized_inputs["labels"] = tokenized_inputs["input_ids"].detach().clone()
+     return tokenized_inputs

- # Load and preprocess the dataset
- dataset = load_dataset("text", data_files="training.txt")
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
+ # Apply tokenization to the entire dataset
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Use a DataCollator that dynamically pads the batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

+ # Load the GPT-2 model
+ model = GPT2LMHeadModel.from_pretrained("gpt2")
+
# Define training arguments with optimized settings
training_args = TrainingArguments(
    output_dir="./output",
@@ -37,7 +45,7 @@ training_args = TrainingArguments(
    gradient_accumulation_steps=8, # Adjusted for gradient accumulation
    save_steps=10_000,
    save_total_limit=2,
-     no_cuda=False,
+     no_cuda=False, # Set based on your hardware
    learning_rate=3e-5, # Adjusted learning rate
    weight_decay=0.01,
    warmup_steps=100,
@@ -46,6 +54,7 @@ training_args = TrainingArguments(
    fp16=True, # Enable fp16 for memory and speed improvement if your hardware supports it
)

+ # Initialize the Trainer with the training dataset including labels and data collator for dynamic padding
trainer = Trainer(
    model=model,
    args=training_args,
@@ -53,7 +62,9 @@ trainer = Trainer(
    train_dataset=tokenized_datasets["train"],
)

+ # Start the training process
trainer.train()

+ # Save the fine-tuned model and tokenizer
model.save_pretrained("fine_tuned_gpt2_model")
tokenizer.save_pretrained("fine_tuned_gpt2_model")
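For reference, a minimal sketch of loading the artifacts this script saves to fine_tuned_gpt2_model and generating a sample; the prompt and generation settings below are illustrative assumptions, not part of the commit.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned weights and tokenizer written by app.py
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2_model")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2_model")

# Illustrative prompt; replace with text matching the domain of training.txt
inputs = tokenizer("Once upon a time", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.95,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))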