In [94]:
from datasets import load_dataset
# load dataset from data.jsonl file:
eli5 = load_dataset("json", data_files="data3.jsonl", split="train[:80%]")

Generating train split: 0 examples [00:00, ? examples/s]

In [95]:
eli5 = eli5.train_test_split(test_size=0.2)

In [96]:
eli5["train"][0]

{'text': "Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Board meeting next Tuesday at 10 AM.\nThe Details are as follows: {'datetime': '2024-03-19T10:00:00', 'description': 'Board meeting', 'location': ''}\n"}

In [97]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [98]:
eli5 = eli5.flatten()

In [99]:
eli5["train"][0]

{'text': "Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Board meeting next Tuesday at 10 AM.\nThe Details are as follows: {'datetime': '2024-03-19T10:00:00', 'description': 'Board meeting', 'location': ''}\n"}

In [100]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples])

In [101]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/49 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/13 [00:00<?, ? examples/s]

In [102]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [103]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

In [104]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [105]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [106]:
import torch

# Check that MPS is available
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")
    model.to(mps_device)
    print("Model moved to MPS device")

Model moved to MPS device


In [107]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.667893886566162, 'eval_runtime': 0.0262, 'eval_samples_per_second': 152.548, 'eval_steps_per_second': 38.137, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.2145514488220215, 'eval_runtime': 0.1232, 'eval_samples_per_second': 32.47, 'eval_steps_per_second': 8.118, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.993268966674805, 'eval_runtime': 0.0204, 'eval_samples_per_second': 196.346, 'eval_steps_per_second': 49.087, 'epoch': 3.0}
{'train_runtime': 1.588, 'train_samples_per_second': 7.556, 'train_steps_per_second': 1.889, 'train_loss': 6.412024815877278, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=6.412024815877278, metrics={'train_runtime': 1.588, 'train_samples_per_second': 7.556, 'train_steps_per_second': 1.889, 'train_loss': 6.412024815877278, 'epoch': 3.0})

In [110]:
prompt =  "Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Let's meet for lunch tomorrow at 12 PM at the Italian restaurant on Main Street.\nThe Details are as follows: {"
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=200)
generator(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Let's meet for lunch tomorrow at 12 PM at the Italian restaurant on Main Street.\nThe Details are as follows: { if (is_empty(the_time()) : return next_day_long(_.length(this); } } }\nThe Time is: 12 PM on Sunday 12th at the Italian restaurant on Main Street.\nTaste: 12 PM on Sunday 8th at the Italian restaurant on Main Street.\nThe Time is: 11 PM on Monday 9th at the Italian restaurant on Main Street.\nThe Time is: 11 AM on Monday 9th at the Italian restaurant on Main Street.\nTaste: 11 AM on Sunday 8th at the Italian restaurant on Main Street.\nThe Time is: 11 AM on Monday 9th at the Italian restaurant on Main Street.\nThe Time is: 11 AM"}]

In [None]:
from transformers import AutoTokenizer