In [52]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="data-flattened.json", split="train")

labels = ["datetime", "description", "location"]
dataset = dataset.train_test_split(test_size=0.1)


In [60]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TextClassificationPipeline,
    TrainingArguments,
)

# Model and tokenizer selection
checkpoint = "google-t5/t5-small"  # Ensure correct model name


# Configure model for multi-label classification
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=len(labels)
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_function(examples):
    inputs = [doc for doc in examples["message"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data_set = dataset.map(preprocess_function, batched=True)

# Training setup (assuming you have data in optimal JSON format)
training_args = TrainingArguments(
    output_dir="calendar_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    use_mps_device=True,
    # fp16=True,
    # push_to_hub=True,
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()

# Create pipeline for multi-label prediction
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, labels=labels)

# Example usage for multi-label prediction
text = "Meeting with John at 2 pm tomorrow in the conference room"
calendar_entry = pipe(text)

print(calendar_entry)  # Output will be a list of dictionaries, one per label

# Example: Accessing scores for the "datetime" label
datetime_predictions = calendar_entry[0]
print(datetime_predictions["score"])  # List of prediction scores for "datetime"


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/69 [00:00<?, ? examples/s]


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



KeyError: 'summary'

In [6]:
tokenized_data_set = data_set.map(preprocess_function, batched=True)

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [4]:
from transformers import DataCollatorForSeq2Seq

In [5]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [7]:
import evaluate

In [7]:
rouge = evaluate.load("rouge")

In [8]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [9]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [10]:
import torch

# Check that MPS is available
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")
    model.to(mps_device)
    print("Model moved to MPS device")

Model moved to MPS device


In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="calendar_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    use_mps_device=True,
    # fp16=True,
    # push_to_hub=True,
)



In [12]:
print(data_set)

DatasetDict({
    train: Dataset({
        features: ['details', 'message'],
        num_rows: 69
    })
    test: Dataset({
        features: ['details', 'message'],
        num_rows: 8
    })
})


In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_set["train"],
    eval_dataset=tokenized_data_set["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 14.770042419433594, 'eval_rouge1': 0.2492, 'eval_rouge2': 0.132, 'eval_rougeL': 0.2098, 'eval_rougeLsum': 0.2078, 'eval_gen_len': 18.5, 'eval_runtime': 3.1599, 'eval_samples_per_second': 2.532, 'eval_steps_per_second': 0.316, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 13.279829978942871, 'eval_rouge1': 0.191, 'eval_rouge2': 0.0841, 'eval_rougeL': 0.171, 'eval_rougeLsum': 0.1669, 'eval_gen_len': 18.5, 'eval_runtime': 0.6868, 'eval_samples_per_second': 11.648, 'eval_steps_per_second': 1.456, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 12.672184944152832, 'eval_rouge1': 0.1767, 'eval_rouge2': 0.0792, 'eval_rougeL': 0.1555, 'eval_rougeLsum': 0.1518, 'eval_gen_len': 19.0, 'eval_runtime': 0.6063, 'eval_samples_per_second': 13.195, 'eval_steps_per_second': 1.649, 'epoch': 3.0}
{'train_runtime': 12.159, 'train_samples_per_second': 17.024, 'train_steps_per_second': 1.234, 'train_loss': 12.712192789713542, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=12.712192789713542, metrics={'train_runtime': 12.159, 'train_samples_per_second': 17.024, 'train_steps_per_second': 1.234, 'train_loss': 12.712192789713542, 'epoch': 3.0})

In [15]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/joshcarp/calendar_model/commit/ef13304ccc7e109ab97007e944f01405ce9b1409', commit_message='End of training', commit_description='', oid='ef13304ccc7e109ab97007e944f01405ce9b1409', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
from transformers import pipeline

hub_model_id = "joshcarp/calendar_model"
summarizer = pipeline("textclassificationpipeline", model=hub_model_id)
text = "convert to summary: Doctor's appointment on Friday at 9:00 AM."
summary = summarizer(text, max_length=50, min_length=6)
print(text)
print(summary)

convert to summary: Doctor's appointment on Friday at 9:00 AM.
[{'generated_text': "Umgekehrt: Doctor's appointment on Friday at 9:00 AM."}]
