import os
import re

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2TokenizerFast

os.environ["TOKENIZERS_PARALLELISM"] = "false"


class CNNDataset(Dataset):
    # CNN/DailyMail summarization dataset tokenized for a GPT-2 style LM.
    # Training sequences are laid out as
    #     <|endoftext|> article [START] summary <|endoftext|>
    # and the labels are the inputs shifted left by one, with every position
    # outside the summary masked with the [PAD] id.
    def __init__(self, df, max_length=1000, max_len=21000, test_ds=False):
        super().__init__()
        self.max_len = max_len
        self.max_length = max_length
        self.test_ds = test_ds
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        # GPT-2 has 50257 base ids (0..50256, with 50256 = <|endoftext|>);
        # the added specials get ids 50257 ([PAD]) and 50258 ([START]).
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})
        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        # Pre-tokenize the first max_len rows. The tensors are kept in plain
        # lists: assigning a tensor into a single DataFrame cell via .loc
        # raises a ValueError in pandas.
        self.articles, self.highlights = [], []
        for index in range(max_len):
            x, y = df['article'][index], df['highlights'][index]
            x = re.sub(r'[\t\n\r]', ' ', x)
            y = re.sub(r'[\t\n\r]', ' ', y)
            # Summaries are capped at 256 tokens; the article is truncated so
            # the full sequence stays within max_length, reserving at least
            # 256 + 24 positions for the summary and special tokens.
            y = self.tokenizer(y, return_tensors="pt", max_length=256,
                               truncation=True).input_ids[0]
            x = self.tokenizer(x, return_tensors="pt",
                               max_length=self.max_length - max(y.shape[0], 256 + 24),
                               truncation=True).input_ids[0]
            self.articles.append(x)
            self.highlights.append(y)

    def __len__(self):
        return self.max_len

    def __getitem__(self, index):
        x, y = self.articles[index], self.highlights[index]
        if self.test_ds:
            # At test time the model only receives the prompt; the reference
            # summary is returned separately.
            return torch.cat([self.eot, x, self.start]), torch.cat([y, self.eot])
        # TODO: check whether a middle <|endoftext|> separator is needed.
        x = torch.cat([self.eot, x, self.start, y, self.eot])
        y = torch.cat([y, self.eot])
        # Labels: inputs shifted left by one. Everything outside the summary
        # (and the final position, which has no next token) is set to the
        # [PAD] id so the loss can ignore it.
        y_final = torch.full((x.shape[0],), self.pad.item(), dtype=torch.long)
        y_final[-y.shape[0] - 1:-1] = y
        return x, y_final


def properly_pad(seqs):
    # Right-pad a list of 1-D token tensors into a (batch, max_len) tensor.
    # 50257 is the id of the added [PAD] token (see CNNDataset.__init__).
    return pad_sequence(sequences=seqs, batch_first=True, padding_value=50257)


def custom_collate(batch):
    # Sort the (context, target) pairs jointly by descending context length,
    # keeping each context aligned with its own target, then pad both sides.
    # Sorting the two lists independently could mis-pair contexts and targets.
    batch = sorted(batch, key=lambda pair: pair[0].shape[0], reverse=True)
    context = [a for a, _ in batch]
    target = [b for _, b in batch]
    return properly_pad(context), properly_pad(target)


def import_data(bs=4, fraction=0.1):
    df_train = pd.read_csv('./cnn_dailymail/train.csv')
    df_val = pd.read_csv('./cnn_dailymail/validation.csv')
    df_test = pd.read_csv('./cnn_dailymail/test.csv')
    print('Loaded data')
    # Use only a `fraction` of each split (21000 / 6000 / 300 examples at 1.0).
    ds_train = CNNDataset(df_train, max_len=int(21000 * fraction))
    ds_val = CNNDataset(df_val, max_len=int(6000 * fraction))
    ds_test = CNNDataset(df_test, max_len=int(300 * fraction), test_ds=True)
    dl_train = DataLoader(ds_train, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    dl_val = DataLoader(ds_val, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    dl_test = DataLoader(ds_test, batch_size=1, num_workers=7, collate_fn=custom_collate)
    return dl_train, dl_val, dl_test
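
# --- Illustrative addition, not part of the original pipeline ----------------
# A minimal smoke test for custom_collate, assuming only torch and the
# functions above: two dummy (context, target) pairs of unequal length should
# come back stacked and right-padded with the [PAD] id (50257). The function
# name is hypothetical; it exists only to document the expected collate shape.
def _collate_smoke_test():
    fake_batch = [
        (torch.arange(5), torch.arange(5)),   # short pair
        (torch.arange(8), torch.arange(8)),   # long pair
    ]
    ctx, tgt = custom_collate(fake_batch)
    assert ctx.shape == tgt.shape == (2, 8)   # padded to the longest item
    assert int((ctx == 50257).sum()) == 3     # three pad slots in the contexts
    assert int((tgt == 50257).sum()) == 3     # and three in the targets
    print("custom_collate smoke test passed:", tuple(ctx.shape))
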
if __name__ == '__main__':
    # Quick tokenizer sanity check. The specials are added in the same order
    # as in CNNDataset so the ids line up: [PAD] -> 50257, [START] -> 50258.
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.add_special_tokens({'cls_token': '[START]'})
    eot = tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
    pad = tokenizer("[PAD]", return_tensors="pt").input_ids[0]
    start = tokenizer("[START]", return_tensors="pt").input_ids[0]
    print(tokenizer.decode([1, 2, 50256]))  # 50256 = <|endoftext|>
    print(tokenizer.decode([1, 2, 50257]))  # 50257 = [PAD]
    print(tokenizer('[START]'))
    # dl_train, dl_val, dl_test = import_data()
    # for x, y in dl_train:
    #     print(x.shape, y.shape)
    #     break
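    # Added for illustration: run the data-free collate smoke test defined
    # above; unlike import_data(), it needs no CSV files.
    _collate_smoke_test()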