Corrigan123 committed on
Commit c610360
1 parent: f387efe

Update app.py with optimized training settings

Files changed (1)
app.py  +16 -27
app.py CHANGED

@@ -1,5 +1,4 @@
-from transformers import (GPT2Tokenizer, GPT2LMHeadModel, Trainer,
-                          TrainingArguments, DataCollatorWithPadding)
+from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
 from datasets import load_dataset

 # Load the text dataset from the specified file
@@ -9,54 +8,44 @@ dataset = load_dataset("text", data_files="training.txt")
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 tokenizer.pad_token = tokenizer.eos_token

-# Adjusted max_length for potentially reduced memory usage
-max_length = 256
-
+# Define a function to tokenize the dataset and prepare labels
 def tokenize_function(examples):
-    # Tokenize the text to input_ids, attention_mask, and ensure labels are set
+    # Tokenize the text to input_ids, attention_mask, with reduced max_length
     tokenized_inputs = tokenizer(
         examples["text"],
         padding="max_length",
         truncation=True,
-        max_length=max_length,
+        max_length=256  # Reduced from 512 to 256
     )
     # Prepare labels: labels are the same as input_ids for language modeling
-    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].detach().clone()
+    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
     return tokenized_inputs

-# Apply tokenization to the entire dataset and remove 'text' column during mapping
-tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
-# No need to call remove_columns on tokenized_datasets again as 'text' is already removed
-
-# Use a DataCollator that dynamically pads the batches
-data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
+# Tokenize the entire dataset
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+# Remove the 'text' column as it's no longer needed after tokenization
+tokenized_datasets = tokenized_datasets.remove_columns(["text"])
+# Set the format of the dataset to PyTorch tensors
+tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

 # Load the GPT-2 model
 model = GPT2LMHeadModel.from_pretrained("gpt2")

-# Define training arguments with optimized settings
+# Define training arguments with adjusted settings
 training_args = TrainingArguments(
     output_dir="./output",
     overwrite_output_dir=True,
-    num_train_epochs=3,
-    per_device_train_batch_size=2,  # Decreased batch size
-    gradient_accumulation_steps=8,  # Adjusted for gradient accumulation
+    num_train_epochs=2,  # Optionally reduced for quicker iteration
+    per_device_train_batch_size=2,  # Reduced from 4 to 2
+    gradient_accumulation_steps=8,  # Added to compensate for smaller batch size
     save_steps=10_000,
     save_total_limit=2,
-    no_cuda=False,  # Set based on your hardware
-    learning_rate=3e-5,  # Adjusted learning rate
-    weight_decay=0.01,
-    warmup_steps=100,
-    logging_dir='./logs',
-    logging_steps=100,
-    fp16=True,  # Enable fp16 for memory and speed improvement if your hardware supports it
 )

-# Initialize the Trainer
+# Initialize the Trainer with the training dataset including labels
 trainer = Trainer(
     model=model,
     args=training_args,
-    data_collator=data_collator,
     train_dataset=tokenized_datasets["train"],
 )
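
Note: the diff above builds the Trainer but the lines shown never launch a run. Below is a minimal sketch of how the rest of app.py might start training and persist the result; the trainer.train() call and the "./output" save path are assumptions for illustration, not part of this commit, and the sketch simply continues from the script above (trainer and tokenizer are the objects defined there).

# Sketch only (assumption): continues app.py after the Trainer is constructed.
# With per_device_train_batch_size=2 and gradient_accumulation_steps=8,
# each optimizer step accumulates 2 * 8 = 16 examples per device.
trainer.train()

# Save the fine-tuned weights and the tokenizer so they can be reloaded later,
# e.g. with GPT2LMHeadModel.from_pretrained("./output").
trainer.save_model("./output")          # path assumed to match output_dir
tokenizer.save_pretrained("./output")

Dropping DataCollatorWithPadding appears safe here: with padding="max_length" every example is already a fixed 256 tokens, so the Trainer's default collator can stack the input_ids, attention_mask, and labels tensors without any dynamic padding.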