In [1]:
import numpy as np
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

In [2]:
# Load the DailyDialog dataset
dataset = load_dataset('daily_dialog')

# Concatenate all utterances within a dialogue and map to 'dialog' key
def concatenate_utterances(example):
    example['dialog'] = " ".join(example['dialog'])
    return example

# Apply the function to all examples in the dataset
dataset = dataset.map(concatenate_utterances)

In [3]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')

In [4]:
# Encode the dataset
def encode(examples):
    encoded = tokenizer(examples['dialog'], truncation=True, padding='max_length', max_length=128)
    encoded['labels'] = encoded['input_ids'][:]
    return encoded

encoded_dataset = dataset.map(encode, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='model',              # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation']
)

In [6]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['validation'])

# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['validation'].select(range(10)))

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Fine-tune the model
trainer.train()

  0%|          | 0/348 [00:00<?, ?it/s]

{'train_runtime': 25354.0984, 'train_samples_per_second': 0.877, 'train_steps_per_second': 0.014, 'train_loss': 2.2603482651984557, 'epoch': 2.0}


TrainOutput(global_step=348, training_loss=2.2603482651984557, metrics={'train_runtime': 25354.0984, 'train_samples_per_second': 0.877, 'train_steps_per_second': 0.014, 'train_loss': 2.2603482651984557, 'epoch': 2.0})

In [8]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['validation'].select(range(10)))

# Evaluate after fine-tuning
post_eval_results = trainer.evaluate(encoded_dataset['validation'])

# Print the evaluation losses before and after fine-tuning
print('Evaluation Results before fine-tuning :', pre_eval_results['eval_loss'])
print('Evaluation Results after fine-tuning  :', post_eval_results['eval_loss'])

# Get predictions for validation set before fine tuning for 10 samples
post_val_predictions = trainer.predict(encoded_dataset['validation'].select(range(10)))

# Zip the pre and post tuning predictions
predictions = zip(pre_val_predictions.predictions, post_val_predictions.predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Evaluation Results before fine-tuning : 4.766543388366699
Evaluation Results after fine-tuning  : 1.8690917491912842


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
for idx, (pre, post) in enumerate(predictions):
    pre_pred = tokenizer.decode(np.argmax(pre, axis=-1), skip_special_tokens=True)
    post_pred = tokenizer.decode(np.argmax(post, axis=-1), skip_special_tokens=True)
    ground_truth = encoded_dataset['validation'][idx]["dialog"]
    
    print('Ground truth \n' + ground_truth + '\n')
    print('Pre-prediction \n' + pre_pred + '\n')
    print('Post-prediction \n'+ post_pred + '\n')
    print('----------------------------------------------------------------------------------------------------------------------\n')

Ground truth 
Good morning , sir . Is there a bank near here ?   There is one . 5 blocks away from here ?   Well , that's too far.Can you change some money for me ?   Surely , of course . What kind of currency have you got ?   RIB .   How much would you like to change ?   1000 Yuan.Here you are . 

Pre-prediction 
 and, sir.  there anything problem here here?   Yes is. in, away. here.   Yes, I's a far.How you tell the money for me?   Sure. sir course. Here's of money do you got?   IIB.   Here much is you like to exchange?   I R.How you are.   

Post-prediction 
 and, sir.  there anything problem here here?   Yes is. in, away. here.   Yes, I's a far.How you tell the money for me?   Sure. sir course. Here's of money do you got?   IIB.   Here much is you like to exchange?   I R.How you are.   

----------------------------------------------------------------------------------------------------------------------

Ground truth 
Good afternoon . This is Michelle Li speaking , calling on beha

In [13]:
tokenizer.save_pretrained("saved_model")
model.save_pretrained("saved_model")