# Installing Dependencies

In [1]:
!pip install transformers datasets torch accelerate huggingface_hub

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Import Libraries

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, load_metric

# Import Model

In [34]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("./gpt2-evy")



('./gpt2-evy/tokenizer_config.json',
 './gpt2-evy/special_tokens_map.json',
 './gpt2-evy/vocab.json',
 './gpt2-evy/merges.txt',
 './gpt2-evy/added_tokens.json')

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("joshcarp/evy-dataset")

train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

def tokenize_function(examples):
    inputs = tokenizer(examples['contents'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': inputs['input_ids']}

train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

In [17]:
device

device(type='cuda')

# Train Model

In [36]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./gpt2-evy',
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=100,
    logging_dir='./logs',
    push_to_hub=True,
    # resume_from_checkpoint="./gpt2-evy/checkpoint-1900"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.553022
2,No log,1.460105
3,No log,1.395301
4,No log,1.355699
5,No log,1.330109
6,No log,1.311717
7,No log,1.296761
8,No log,1.283209
9,No log,1.276882
10,No log,1.28068


TrainOutput(global_step=140, training_loss=0.9502299581255231, metrics={'train_runtime': 38.6687, 'train_samples_per_second': 27.93, 'train_steps_per_second': 3.621, 'total_flos': 282195394560000.0, 'train_loss': 0.9502299581255231, 'epoch': 20.0})

# Save Model

In [37]:
trainer.save_model()
trainer.push_to_hub()
tokenizer.push_to_hub("gpt2-evy")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/joshcarp/gpt2-evy/commit/c95869e260cd381e108a4f2ad1c6f3745a001a68', commit_message='Upload tokenizer', commit_description='', oid='c95869e260cd381e108a4f2ad1c6f3745a001a68', pr_url=None, pr_revision=None, pr_num=None)