In [None]:
!pip install transformers




In [None]:
!pip install sentencepiece




In [None]:
!pip install pytorch_lightning




In [None]:
from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import AdamW
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.utils.rnn import pad_sequence
# from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

import warnings

# Seed the random number generators through Lightning so runs are repeatable.
pl.seed_everything(100)

# Install a catch-all "ignore" filter: no warning is shown for the rest of the notebook.
warnings.filterwarnings("ignore")

INFO:lightning_fabric.utilities.seed:Seed set to 100


In [None]:
# Read the conversation pairs and discard the 'Unnamed: 0' column
# (presumably a row index that was written into the CSV - verify against the file).
data = pd.read_csv("Conversation.csv").drop(columns=['Unnamed: 0'])

In [None]:
# Report how many conversation pairs were loaded.
print("No of rows:", len(data.index))

No of rows: 3725


In [None]:
# Run configuration shared by the cells below.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
INPUT_MAX_LEN = 128     # max_length passed to the tokenizer for questions
OUTPUT_MAX_LEN = 128    # max_length passed to the tokenizer for answers
TRAIN_BATCH_SIZE = 8    # batch size of the training DataLoader
VAL_BATCH_SIZE = 2      # batch size of the validation DataLoader
EPOCHS = 5              # number of training epochs

In [None]:
# Checkpoint identifier; the same name is used later to load the model weights.
MODEL_NAME = "t5-base"

# SentencePiece tokenizer belonging to that checkpoint.
tokenizer = T5Tokenizer.from_pretrained(
    MODEL_NAME,
    model_max_length=512,
)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
text = "Hello, how are you today?"    # sample sentence used to inspect the tokenizer output

input_tokenize = tokenizer(
    text,
    add_special_tokens=True,        # T5 appends its end-of-sequence token </s> (id 1); it has no [CLS]/[SEP]
    max_length=INPUT_MAX_LEN,       # same cap the dataset applies to questions (was a hard-coded 128)
    padding='max_length',           # pad with id 0 up to max_length so every sequence has equal length
    truncation=True,                # cut the text if it is longer than max_length
    return_attention_mask=True,     # also return the 1/0 mask marking real tokens vs padding
    return_tensors="pt"             # return PyTorch tensors
)

In [None]:
# Display the sample's token ids and its attention mask as 1-D tensors.
print("input_ids: ", input_tokenize["input_ids"].reshape(-1))
print("-----------------------------------------------------------------------------")
print("Attention Mask: ", input_tokenize["attention_mask"].reshape(-1))

input_ids:  tensor([8774,    6,  149,   33,   25,  469,   58,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
-----------------------------------------------------------------------------
Attention Mask:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0

In [None]:
class T5Dataset:
  """Map-style dataset that tokenizes question/answer pairs for T5 fine-tuning.

  The notebook-level ``tokenizer``, ``INPUT_MAX_LEN`` and ``OUTPUT_MAX_LEN`` are
  captured when the dataset is constructed.

  Args:
    question: indexable sequence of question texts.
    answer: indexable sequence of answer texts, aligned with ``question``.
  """

  def __init__(self,question,answer):

    self.question = question
    self.answer = answer
    self.tokenizer = tokenizer
    self.input_max_len = INPUT_MAX_LEN
    self.output_max_len = OUTPUT_MAX_LEN

  def __len__(self):
    """Return the number of question/answer pairs."""
    return len(self.question)

  def _encode(self, text, max_len):
    """Tokenize ``text`` into tensors padded/truncated to exactly ``max_len`` ids."""
    return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
        )

  def __getitem__(self,item):
    """Return the normalized texts and model tensors for the pair at index ``item``.

    Returns:
      dict with keys ``question``, ``answer`` (whitespace-normalized strings),
      ``input_ids``, ``attention_mask`` (1-D tensors for the question) and
      ``target`` (1-D label tensor for the answer, padding replaced by -100).
    """
    # Collapse every run of whitespace to a single space. Joining on '' (as before)
    # removed the spaces altogether and glued all words into one token stream,
    # so training text did not look like the text passed in at inference time.
    question = ' '.join(str(self.question[item]).split())
    answer = ' '.join(str(self.answer[item]).split())

    input_tokenize = self._encode(question, self.input_max_len)
    output_tokenize = self._encode(answer, self.output_max_len)

    # The tokenizer returns a batch of one; drop the batch axis.
    input_ids = input_tokenize["input_ids"].flatten()
    attention_mask = input_tokenize["attention_mask"].flatten()
    labels = output_tokenize['input_ids'].flatten()

    # Padding positions must not contribute to the loss: label -100 is the
    # index the cross-entropy loss ignores.
    labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

    return {
            'question':question,
            'answer':answer,
            'input_ids': input_ids,
            'attention_mask':attention_mask,
            'target':labels
        }

In [None]:
class T5DataLoad(pl.LightningDataModule):
    """Lightning data module serving the training and validation question/answer frames."""

    def __init__(self,df_train,df_test):
        super().__init__()
        self.df_train, self.df_test = df_train, df_test
        self.tokenizer = tokenizer
        self.input_max_len, self.out_max_len = INPUT_MAX_LEN, OUTPUT_MAX_LEN

    @staticmethod
    def _as_dataset(frame):
        """Wrap a frame's ``question`` and ``answer`` columns in a T5Dataset."""
        return T5Dataset(
            question = frame.question.values,
            answer = frame.answer.values
        )

    @staticmethod
    def _loader(dataset, batch_size, shuffle):
        """Build a DataLoader over ``dataset`` with two worker processes."""
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=2
        )

    def setup(self, stage=None):
        """Create the training and validation datasets from the stored frames."""
        self.train_data = self._as_dataset(self.df_train)
        self.valid_data = self._as_dataset(self.df_test)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._loader(self.train_data, TRAIN_BATCH_SIZE, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._loader(self.valid_data, VAL_BATCH_SIZE, shuffle=False)

In [None]:
class T5Model(pl.LightningModule):
    """LightningModule that fine-tunes a pretrained T5ForConditionalGeneration."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``labels`` is forwarded unchanged to the wrapped model.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _batch_loss(self, batch):
        """Compute the loss for one batch produced by the data module."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["target"])
        return loss

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for ``batch``."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss for ``batch``."""
        loss = self._batch_loss(batch)
        # "val_loss" is the metric name the ModelCheckpoint callback monitors.
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return {'val_loss': loss}

    def configure_optimizers(self):
        """Return the AdamW optimizer used for fine-tuning."""
        # transformers.AdamW is deprecated in favour of the PyTorch implementation.
        # eps and weight_decay are set explicitly to the values transformers.AdamW
        # used by default, because torch.optim.AdamW defaults differ.
        return torch.optim.AdamW(
            self.parameters(),
            lr=0.0001,
            eps=1e-6,
            weight_decay=0.0,
        )

In [None]:
# The version pins that used to be installed here are removed:
#   * torch==1.9.0+cu102 is not available from the wheel index any more, so the
#     install command failed ("No matching distribution found").
#   * pytorch-lightning==1.4.9 replaced the package on disk after a newer release had
#     already been imported above, leaving the kernel and the installed files out of sync.
# Report the versions that are actually loaded in this kernel instead.
print("torch:", torch.__version__)
print("pytorch_lightning:", pl.__version__)


Looking in links: https://download.pytorch.org/whl/cu102/torch_stable.html
[31mERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 1.11.0, 1.11.0+cu102, 1.12.0, 1.12.0+cu102, 1.12.1, 1.12.1+cu102, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.9.0+cu102[0m[31m
[0mCollecting pytorch-lightning==1.4.9
  Downloading pytorch_lightning-1.4.9-py3-none-any.whl (925 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.8/925.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyDeprecate==0.3.1 (from pytorch-lightning==1.4.9)
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Installing collected packages: pyDeprecate, pytorch-lightning
  Attempting uninstall: pytorch-lightning
    Found existing installation: pytorch-lightning 2.1.1
    Uninstalling pytorch-lightning-2.1.1:
      Successfully uninstalled pytorch-lightning-2.1.1
Successfully installed pyDep

In [None]:

def run():
    """Split the data, fine-tune T5 and keep the checkpoint with the lowest val_loss.

    Reads the notebook-level ``data`` frame and ``EPOCHS``; writes the checkpoint to
    ``/kaggle/working/best-model.ckpt``.
    """
    df_train, df_test = train_test_split(data, test_size=0.2, random_state=100)
    dataload = T5DataLoad(df_train, df_test)
    model = T5Model()

    checkpoint = ModelCheckpoint(
        dirpath="/kaggle/working",
        filename='best-model',
        # Keep only the single best checkpoint so that 'best-model.ckpt' is always
        # the best one (a second kept file would be written as 'best-model-v1.ckpt').
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )
    trainer = pl.Trainer(
        callbacks=[checkpoint],
        max_epochs=EPOCHS,      # use the configured epoch count instead of a hard-coded 1
        # `gpus=` was removed in Lightning 2.0 and raised a TypeError here; the
        # accelerator/devices pair selects a GPU when present and falls back to CPU.
        accelerator="auto",
        devices=1,
    )
    # The Trainer calls dataload.setup() and moves the model to the selected device itself.
    trainer.fit(model, datamodule=dataload)

run()


TypeError: ignored

In [None]:
# Restore the fine-tuned weights written by the training run and switch the module
# to inference use (freeze() disables gradients and puts it in eval mode).
train_model = T5Model.load_from_checkpoint('/kaggle/working/best-model.ckpt')
train_model.freeze()

def generate_question(question):
    """Generate the model's reply to ``question`` with beam search.

    Args:
        question: input text; tokenized and truncated/padded to INPUT_MAX_LEN ids.

    Returns:
        The decoded reply as a single string (special tokens removed).
    """
    inputs_encoding = tokenizer(
        question,
        add_special_tokens=True,
        max_length=INPUT_MAX_LEN,
        padding='max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
    ).to(train_model.device)    # tensors must live on the same device as the model

    generate_ids = train_model.model.generate(
        input_ids=inputs_encoding["input_ids"],
        attention_mask=inputs_encoding["attention_mask"],
        max_length=OUTPUT_MAX_LEN,    # limit on the generated answer, not on the input
        num_beams=4,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    preds = [
        tokenizer.decode(
            gen_id,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for gen_id in generate_ids
    ]

    # num_return_sequences is 1, so this joins a single decoded sequence.
    return "".join(preds)

In [None]:
# Try the model on a greeting.
ques = "hi, how are you doing?"
print("Ques: ", ques)
reply = generate_question(ques)
print("BOT: ", reply)

In [None]:
# Try the model on a short small-talk question.
ques = "how's it going?"
print("Ques: ", ques)
reply = generate_question(ques)
print("BOT: ", reply)

In [None]:
# Try the model on a statement about the weather.
ques = "i heard that it's going to be warm this weekend."
print("Ques: ", ques)
reply = generate_question(ques)
print("BOT: ", reply)