Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +42 -0
- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/main1.cpython-310.pyc +0 -0
- __pycache__/main2.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- cnn_dailymail/test.csv +3 -0
- cnn_dailymail/train.csv +3 -0
- cnn_dailymail/validation.csv +3 -0
- last_layer.py +399 -0
- main.py +370 -0
- main1.py +425 -0
- main2.py +511 -0
- model0.bin +3 -0
- model1.bin +3 -0
- model2.bin +3 -0
- newspaper-text-summarization-cnn-dailymail.zip +3 -0
- utils.py +124 -0
- wandb/debug-internal.log +22 -0
- wandb/debug.log +27 -0
- wandb/run-20241028_085547-mga40p7t/files/code/main.py +124 -0
- wandb/run-20241028_085547-mga40p7t/files/config.yaml +69 -0
- wandb/run-20241028_085547-mga40p7t/files/output.log +16 -0
- wandb/run-20241028_085547-mga40p7t/files/requirements.txt +178 -0
- wandb/run-20241028_085547-mga40p7t/files/wandb-metadata.json +60 -0
- wandb/run-20241028_085547-mga40p7t/files/wandb-summary.json +1 -0
- wandb/run-20241028_085547-mga40p7t/logs/debug-core.log +14 -0
- wandb/run-20241028_085547-mga40p7t/logs/debug-internal.log +22 -0
- wandb/run-20241028_085547-mga40p7t/logs/debug.log +27 -0
- wandb/run-20241028_085547-mga40p7t/run-mga40p7t.wandb +0 -0
- wandb/run-20241028_085806-owcrwbil/files/code/main.py +124 -0
- wandb/run-20241028_085806-owcrwbil/files/config.yaml +69 -0
- wandb/run-20241028_085806-owcrwbil/files/output.log +16 -0
- wandb/run-20241028_085806-owcrwbil/files/requirements.txt +178 -0
- wandb/run-20241028_085806-owcrwbil/files/wandb-metadata.json +60 -0
- wandb/run-20241028_085806-owcrwbil/files/wandb-summary.json +1 -0
- wandb/run-20241028_085806-owcrwbil/logs/debug-core.log +13 -0
- wandb/run-20241028_085806-owcrwbil/logs/debug-internal.log +22 -0
- wandb/run-20241028_085806-owcrwbil/logs/debug.log +27 -0
- wandb/run-20241028_085806-owcrwbil/run-owcrwbil.wandb +0 -0
- wandb/run-20241028_090044-f9fzz8iy/files/code/main.py +124 -0
- wandb/run-20241028_090044-f9fzz8iy/files/config.yaml +51 -0
- wandb/run-20241028_090044-f9fzz8iy/files/output.log +15 -0
- wandb/run-20241028_090044-f9fzz8iy/files/requirements.txt +178 -0
- wandb/run-20241028_090044-f9fzz8iy/files/wandb-metadata.json +60 -0
- wandb/run-20241028_090044-f9fzz8iy/files/wandb-summary.json +1 -0
- wandb/run-20241028_090044-f9fzz8iy/logs/debug-core.log +13 -0
- wandb/run-20241028_090044-f9fzz8iy/logs/debug-internal.log +22 -0
- wandb/run-20241028_090044-f9fzz8iy/logs/debug.log +27 -0
- wandb/run-20241028_090044-f9fzz8iy/run-f9fzz8iy.wandb +0 -0
- wandb/run-20241028_090149-4jbvn26d/files/code/main.py +124 -0
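The commit title indicates this folder was pushed with huggingface_hub. As a minimal sketch of such a bulk upload (the repo id and token below are placeholders, not taken from this repository):

import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])  # placeholder: supply your own token
api.upload_folder(
    folder_path=".",                      # local project directory
    repo_id="your-username/your-repo",    # hypothetical repo id
    repo_type="model",
)

upload_folder commits every file in the folder in one go, which is why build artifacts such as __pycache__ and the wandb/ run logs below ended up in the repository as well.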
.gitattributes
CHANGED
@@ -33,3 +33,45 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cnn_dailymail/test.csv filter=lfs diff=lfs merge=lfs -text
+cnn_dailymail/train.csv filter=lfs diff=lfs merge=lfs -text
+cnn_dailymail/validation.csv filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_090149-4jbvn26d/run-4jbvn26d.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_121715-5kjpxsew/run-5kjpxsew.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_121925-nc6un2i3/run-nc6un2i3.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_123711-98pjnrqo/run-98pjnrqo.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_130604-p34k5wnh/run-p34k5wnh.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_164041-fs40r39w/run-fs40r39w.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_165506-q960l24p/run-q960l24p.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_173226-2kgb0f9e/run-2kgb0f9e.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_183140-jbr9b02q/run-jbr9b02q.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_184435-8g9y4qy1/run-8g9y4qy1.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_191117-1eqkbgu2/run-1eqkbgu2.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_193646-r4qgqj1u/run-r4qgqj1u.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_195038-eakkycgw/run-eakkycgw.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_201220-6xkcnm4u/run-6xkcnm4u.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_202354-zr7kt8eh/run-zr7kt8eh.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_210645-j0itro1g/run-j0itro1g.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_210725-ho9p1f0s/run-ho9p1f0s.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_212229-ovcu6nj3/run-ovcu6nj3.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_212304-uq1xozlm/run-uq1xozlm.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_213435-qow8er7m/run-qow8er7m.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_213754-m1kaqlt1/run-m1kaqlt1.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_221407-dv4g6q0z/run-dv4g6q0z.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_221423-pxyf2xri/run-pxyf2xri.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_222813-k1xslrgl/run-k1xslrgl.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_223604-ucawfmok/run-ucawfmok.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_224822-nlehsykg/run-nlehsykg.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241028_225822-xuv6uhuc/run-xuv6uhuc.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_130621-0asef00f/run-0asef00f.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_134007-8a5jhu4s/run-8a5jhu4s.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_141057-uk43a4xl/run-uk43a4xl.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_151508-ya0e0d5g/run-ya0e0d5g.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_155952-fb5ojuk9/run-fb5ojuk9.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_182402-3dknsv44/run-3dknsv44.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_182613-mibkz7zt/run-mibkz7zt.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_183624-haor84lw/run-haor84lw.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_190201-8uotupup/run-8uotupup.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_190305-legb7y4v/run-legb7y4v.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_192824-grmmhjzz/run-grmmhjzz.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241029_193507-ujpie5pz/run-ujpie5pz.wandb filter=lfs diff=lfs merge=lfs -text
__pycache__/main.cpython-310.pyc
ADDED
Binary file (8.91 kB)
__pycache__/main1.cpython-310.pyc
ADDED
Binary file (9.74 kB)
__pycache__/main2.cpython-310.pyc
ADDED
Binary file (10.8 kB)
__pycache__/utils.cpython-310.pyc
ADDED
Binary file (4.03 kB)
cnn_dailymail/test.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69e091606539b415f768de75dd026bcee37b35b5b50b6088d0a6d0e017559d29
+size 49890690
cnn_dailymail/train.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd4ba100d4da4c5fe5414a590a5eca9cb47494e044c179a3aadb96cead676ab7
+size 1262015264
cnn_dailymail/validation.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97dedb1d6fd51f94f74e1e9dfd3d7e175fc6b4f4417d9c1cdc33abd0b5fe54a1
+size 57691847
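The three cnn_dailymail entries above are Git LFS pointer files (version/oid/size), not the data itself; per the size fields, the real CSVs are roughly 1.26 GB (train), 58 MB (validation) and 50 MB (test) and are fetched on checkout. A minimal loading sketch, assuming git lfs pull has already replaced the pointers with the actual files:

import pandas as pd

# Assumes `git lfs pull` has materialised the real CSVs in cnn_dailymail/.
train = pd.read_csv("cnn_dailymail/train.csv")        # ~1.26 GB on disk
val = pd.read_csv("cnn_dailymail/validation.csv")     # ~58 MB
test = pd.read_csv("cnn_dailymail/test.csv")          # ~50 MB
print(train.shape, train.columns.tolist())            # column names depend on the export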
last_layer.py
ADDED
@@ -0,0 +1,399 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils
from torchmetrics.text.rouge import ROUGEScore


def top_p_sampling(logits, p=0.9, temperature=0.5):
    # Apply temperature scaling
    logits = logits / temperature

    # Sort logits and compute cumulative probabilities
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # Mask tokens whose cumulative probability exceeds the threshold
    sorted_indices_to_remove = cumulative_probs > p
    # Shift the mask right so the first token above the threshold is kept
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Scatter the sorted mask back to the original vocabulary order
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
    logits[indices_to_remove] = float('-inf')  # exclude masked logits from sampling

    # Sample from the remaining distribution
    probs = F.softmax(logits, dim=-1)
    sampled_indices = torch.multinomial(probs, num_samples=1)
    sampled_indices = sampled_indices.squeeze(1)

    return sampled_indices


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        # Initialise the soft prompt by tiling the embedding of 'summarise'
        # (the word tokenizes to three ids, so num_prompts should be a multiple of 3)
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

    # @torch.compile
    def forward(self, X, y):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        # Prepend the learnable prompt to every sequence in the batch
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)
        out = self.model(inputs_embeds=embeddings)
        logits = out.logits[:, self.num_prompts:]  # drop the soft-prompt positions
        return logits

    def generate_new(self, X):
        batch_size = X.shape[0]
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(batch_size, 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        generated_ids = torch.tensor([], dtype=torch.long, device=X.device).view(batch_size, 0)  # all generated tokens

        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits

            logits[:, :, 50257:] = -1e4  # never sample the added special tokens

            next_token_ids = top_p_sampling(logits[:, -1, :])  # shape (batch_size,)
            generated_ids = torch.cat([generated_ids, next_token_ids.unsqueeze(-1)], dim=-1)

            # Embed only the newly sampled token; shape (batch_size, 1, d)
            embeddings = self.model.transformer.wte(next_token_ids)[:, None, :]

            cnt += 1

            # Stop once every sequence contains the end-of-text token
            if torch.all((generated_ids == self.eot.item()).any(dim=-1)):
                break

        return generated_ids

    def generate(self, X):
        # Sampling loop with a KV cache; self.temperature is set by the Lightning wrapper
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        final_prediction = torch.empty(X.shape[0], 0, dtype=torch.long, device=X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            # On the first step the logits still include the soft-prompt positions
            logits = out.logits[:, self.num_prompts:] if cnt == 0 else out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


class LMModel(nn.Module):
    def __init__(self, num_prompts=0):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]

        # Only the output head is trained ("last layer" fine-tuning)
        self.model.lm_head.requires_grad_(True)

    # @torch.compile
    def forward(self, X, y):
        embeddings = self.model.transformer.wte(X)  # b s d
        logits = self.model(inputs_embeds=embeddings).logits
        return logits

    def generate(self, X):
        # Same sampling loop as PromptTuningModel.generate, without the soft prompt
        embeddings = self.model.transformer.wte(X)  # b s d

        cnt = 0
        past_key_values = None
        final_prediction = torch.empty(X.shape[0], 0, dtype=torch.long, device=X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            logits = out.logits[:, self.num_prompts:] if cnt == 0 else out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


def zero_after_x(tensor, x):
    """
    Fills each row of a 2D tensor with x from the first occurrence of x
    onwards (used to blank everything after the end-of-text token).
    """
    mask = (tensor == x).cumsum(dim=1) > 0  # True from the first x onwards
    result = tensor.where(~mask, torch.ones_like(tensor, dtype=torch.long) * x)
    return result


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, temperature, epoch=10, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr
        self.model.temperature = temperature  # read by the wrapped model's generate()
        self.epoch = epoch
        tokenize_to_strings = lambda text: self.model.tokenizer.convert_ids_to_tokens(self.model.tokenizer(text)["input_ids"])
        self.rouge = ROUGEScore(tokenizer=tokenize_to_strings)

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)

        logits[:, :, 50257:] = -1e4  # suppress the added special tokens
        # y is assumed to be pre-aligned with X by the dataloader; 50257 is the [PAD] id
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]),
                               target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)
        logits[:, :, 50257:] = -1e4
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]),
                               target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def on_test_epoch_start(self):
        self.all_text = []
        self.predicted_text = []

    def test_step(self, batch, batch_idx):
        if batch_idx == 0:  # the first test batch is skipped
            return
        X, y = batch
        out = self.model.generate(X)
        pred = self.model.tokenizer.batch_decode(out, skip_special_tokens=True)
        gt = self.model.tokenizer.batch_decode(y, skip_special_tokens=True)

        print(pred)
        print('GAP')
        print(gt)

        for p, g in zip(pred, gt):
            score = self.rouge(p, g)
            print(score)

        # Note: only the scores of the last (pred, gt) pair in the batch are logged
        self.log_dict(score, on_step=True, on_epoch=True, logger=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':
    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=25, fraction=0.1)
    # gpt_model = PromptTuningModel(num_prompts=12)
    gpt_model = LMModel(num_prompts=0)

    # gpt_model = torch.compile(gpt_model)
    model = LitModelPromptTuning(
        model=gpt_model,
        lr=1e-4,
        temperature=0.9,
        epoch=10,
    )
    print('Training')

    logger = WandbLogger(project='Anlp-3')
    trainer = L.Trainer(
        accelerator='gpu',
        devices=[2],
        default_root_dir='./logs/',  # TensorBoard can be used to visualize
        num_nodes=1,
        num_sanity_val_steps=1,      # runs a validation step before starting training
        precision='bf16-mixed',      # half precision to reduce memory usage
        max_epochs=5,
        check_val_every_n_epoch=1,   # run validation every epoch
        log_every_n_steps=20,
        logger=logger,
    )

    # trainer.test(model, dataloaders=dl_test)
    trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    trainer.test(model, dataloaders=dl_test)
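A quick usage sketch for the top_p_sampling helper defined above (the logits here are random stand-ins for GPT-2 outputs, shown only to illustrate shapes):

import torch
# assumes top_p_sampling from last_layer.py is importable
torch.manual_seed(0)
fake_logits = torch.randn(4, 50257)              # (batch, vocab): one logit row per sequence
ids = top_p_sampling(fake_logits, p=0.9, temperature=0.5)
print(ids.shape)                                 # torch.Size([4]): one sampled token id per row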
main.py
ADDED
@@ -0,0 +1,370 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils
from torchmetrics.text.rouge import ROUGEScore


def top_p_sampling(logits, p=0.9, temperature=0.5):
    # Apply temperature scaling
    logits = logits / temperature

    # Sort logits and compute cumulative probabilities
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # Mask tokens whose cumulative probability exceeds the threshold
    sorted_indices_to_remove = cumulative_probs > p
    # Shift the mask right so the first token above the threshold is kept
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Scatter the sorted mask back to the original vocabulary order
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
    logits[indices_to_remove] = float('-inf')  # exclude masked logits from sampling

    # Sample from the remaining distribution
    probs = F.softmax(logits, dim=-1)
    sampled_indices = torch.multinomial(probs, num_samples=1)
    sampled_indices = sampled_indices.squeeze(1)

    return sampled_indices


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        # self.model.generation_config.cache_implementation = "static"
        # self.model.generation_config.max_new_tokens = 256
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        # Initialise the soft prompt by tiling the embedding of 'summarise'
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

        # self.model.transformer.wte.weight[self.start].requires_grad = True

    # @torch.compile
    def forward(self, X, y):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        # Prepend the learnable prompt to every sequence in the batch
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)
        out = self.model(inputs_embeds=embeddings)
        logits = out.logits[:, self.num_prompts:]  # drop the soft-prompt positions
        return logits

    def generate_new(self, X):
        batch_size = X.shape[0]
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(batch_size, 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        generated_ids = torch.tensor([], dtype=torch.long, device=X.device).view(batch_size, 0)

        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits

            logits[:, :, 50257:] = -1e4  # never sample the added special tokens

            next_token_ids = top_p_sampling(logits[:, -1, :])  # shape (batch_size,)
            generated_ids = torch.cat([generated_ids, next_token_ids.unsqueeze(-1)], dim=-1)

            embeddings = self.model.transformer.wte(next_token_ids)[:, None, :]  # (batch, 1, d)

            cnt += 1

            # Stop once every sequence contains the end-of-text token
            if torch.all((generated_ids == self.eot.item()).any(dim=-1)):
                break

        return generated_ids

    def generate(self, X):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        final_prediction = torch.empty(X.shape[0], 0, dtype=torch.long, device=X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            # On the first step the logits still include the soft-prompt positions
            logits = out.logits[:, self.num_prompts:] if cnt == 0 else out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


class LMModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]

        # Only the output head is trained
        self.model.lm_head.requires_grad_(True)

    # @torch.compile
    def forward(self, X):
        embeddings = self.model.transformer.wte(X)  # b s d
        logits = self.model(inputs_embeds=embeddings).logits
        return logits


def zero_after_x(arr, x):
    """
    Fills each row of a 2D tensor with x from the first occurrence of x
    onwards (used to blank everything after the end-of-text token).
    """
    mask = (arr == x).cumsum(dim=1) > 0  # True from the first x onwards
    result = torch.where(mask, x, arr)
    return result


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, temperature, epoch, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr
        self.model.temperature = temperature  # read by the wrapped model's generate()
        self.epoch = epoch
        self.temperature = temperature

        tokenize_to_strings = lambda text: self.model.tokenizer.convert_ids_to_tokens(self.model.tokenizer(text)["input_ids"])
        self.rouge = ROUGEScore(tokenizer=tokenize_to_strings)

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)

        logits[:, :, 50257:] = -1e4  # suppress the added special tokens
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]),
                               target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)
        logits[:, :, 50257:] = -1e4
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]),
                               target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def on_test_epoch_start(self):
        self.all_text = []
        self.predicted_text = []

    def test_step(self, batch, batch_idx):
        if batch_idx == 0:  # the first test batch is skipped
            return
        X, y = batch
        out = self.model.generate(X)
        pred = self.model.tokenizer.batch_decode(out, skip_special_tokens=False)
        gt = self.model.tokenizer.batch_decode(y, skip_special_tokens=False)

        print(pred)
        print('GAP')
        print(gt)

        for p, g in zip(pred, gt):
            score = self.rouge(p, g)
            print(score)

        self.log_dict(score, on_step=True, on_epoch=True, logger=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':

    train = False

    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=24, fraction=1)
    if train:
        gpt_model = PromptTuningModel(num_prompts=24)
        gpt_model = torch.compile(gpt_model)
    else:
        gpt_model = torch.load('./model0.bin')
    # gpt_model = LMModel(num_prompts=12)

    model = LitModelPromptTuning(
        model=gpt_model,
        lr=1e-3,
        temperature=0.9,
        epoch=5,
    )
    print('Training')

    logger = WandbLogger(project='Anlp-3')
    trainer = L.Trainer(
        accelerator='gpu',
        devices=1,
        default_root_dir='./logs/',  # TensorBoard can be used to visualize
        num_nodes=1,
        num_sanity_val_steps=1,      # runs a validation step before starting training
        precision='bf16-mixed',      # half precision to reduce memory usage
        max_epochs=5,
        check_val_every_n_epoch=1,   # run validation every epoch
        log_every_n_steps=20,
        logger=logger,
    )

    # trainer.test(model, dataloaders=dl_test)
    if train:
        trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
        trainer.test(model, dataloaders=dl_test)
        torch.save(model.model, './model0.bin')
    else:
        trainer.test(model, dataloaders=dl_test)
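main.py persists the whole wrapped module with torch.save(model.model, './model0.bin') and restores it with torch.load. One caveat, sketched below: a full-module pickle is restored by re-importing its class, so PromptTuningModel must be defined or importable in the loading process exactly as at save time, and recent PyTorch versions need weights_only=False for anything beyond bare tensors:

import torch

# Sketch: the class saved in model0.bin (here PromptTuningModel) must be
# importable under the same module path it had when torch.save was called.
gpt_model = torch.load("./model0.bin", map_location="cpu", weights_only=False)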
main1.py
ADDED
@@ -0,0 +1,425 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils
from torchmetrics.text.rouge import ROUGEScore


def top_p_sampling(logits, p=0.9, temperature=0.5):
    # Apply temperature scaling
    logits = logits / temperature

    # Sort logits and compute cumulative probabilities
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # Mask tokens whose cumulative probability exceeds the threshold
    sorted_indices_to_remove = cumulative_probs > p
    # Shift the mask right so the first token above the threshold is kept
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Scatter the sorted mask back to the original vocabulary order
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
    logits[indices_to_remove] = float('-inf')  # exclude masked logits from sampling

    # Sample from the remaining distribution
    probs = F.softmax(logits, dim=-1)
    sampled_indices = torch.multinomial(probs, num_samples=1)
    sampled_indices = sampled_indices.squeeze(1)

    return sampled_indices


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        # self.model.generation_config.cache_implementation = "static"
        # self.model.generation_config.max_new_tokens = 256
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        # Initialise the soft prompt by tiling the embedding of 'summarise'
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

        # self.model.transformer.wte.weight[self.start].requires_grad = True

    # @torch.compile
    def forward(self, X, y):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        # Prepend the learnable prompt to every sequence in the batch
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)
        out = self.model(inputs_embeds=embeddings)
        logits = out.logits[:, self.num_prompts:]  # drop the soft-prompt positions
        return logits

    def generate_new(self, X):
        batch_size = X.shape[0]
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(batch_size, 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        generated_ids = torch.tensor([], dtype=torch.long, device=X.device).view(batch_size, 0)

        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits

            logits[:, :, 50257:] = -1e4  # never sample the added special tokens

            next_token_ids = top_p_sampling(logits[:, -1, :])  # shape (batch_size,)
            generated_ids = torch.cat([generated_ids, next_token_ids.unsqueeze(-1)], dim=-1)

            embeddings = self.model.transformer.wte(next_token_ids)[:, None, :]  # (batch, 1, d)

            cnt += 1

            # Stop once every sequence contains the end-of-text token
            if torch.all((generated_ids == self.eot.item()).any(dim=-1)):
                break

        return generated_ids

    def generate(self, X):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        final_prediction = torch.empty(X.shape[0], 0, dtype=torch.long, device=X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            # On the first step the logits still include the soft-prompt positions
            logits = out.logits[:, self.num_prompts:] if cnt == 0 else out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


class LMModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        # self.model.generation_config.cache_implementation = "static"
        # self.model.generation_config.max_new_tokens = 256
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        # Only the output head is trained
        self.model.lm_head.requires_grad_(True)

    # @torch.compile
    def forward(self, X, y):
        embeddings = self.model.transformer.wte(X)  # b s d
        logits = self.model(inputs_embeds=embeddings).logits
        return logits

    def generate(self, X):
        # Same sampling loop as PromptTuningModel.generate, without the soft prompt
        embeddings = self.model.transformer.wte(X)  # b s d

        cnt = 0
        past_key_values = None
        final_prediction = torch.empty(X.shape[0], 0, dtype=torch.long, device=X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            logits = out.logits[:, self.num_prompts:] if cnt == 0 else out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


def zero_after_x(arr, x):
    """
    Fills each row of a 2D tensor with x from the first occurrence of x
    onwards (used to blank everything after the end-of-text token).
    """
    mask = (arr == x).cumsum(dim=1) > 0  # True from the first x onwards
    result = torch.where(mask, x, arr)
    return result


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, temperature, epoch, lr=1e-4, **kwargs):
        super().__init__()
        self.model = model
        self.lr = lr
        self.model.temperature = temperature  # read by the wrapped model's generate()
        self.epoch = epoch
        self.temperature = temperature

        # Any extra keyword arguments become attributes
        for key, value in kwargs.items():
            setattr(self, key, value)

        tokenize_to_strings = lambda text: self.model.tokenizer.convert_ids_to_tokens(self.model.tokenizer(text)["input_ids"])
        self.rouge = ROUGEScore(tokenizer=tokenize_to_strings)

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)

        logits[:, :, 50257:] = -1e4  # suppress the added special tokens
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]),
                               target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)
        logits[:, :, 50257:] = -1e4
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]),
                               target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def on_test_epoch_start(self):
        self.all_text = []
        self.predicted_text = []

    def test_step(self, batch, batch_idx):
        if batch_idx == 0:  # the first test batch is skipped
            return
        X, y = batch
        out = self.model.generate(X)
        pred = self.model.tokenizer.batch_decode(out, skip_special_tokens=False)
        gt = self.model.tokenizer.batch_decode(y, skip_special_tokens=False)

        print(pred)
        print('GAP')
        print(gt)

        for p, g in zip(pred, gt):
            score = self.rouge(p, g)
            print(score)

        self.log_dict(score, on_step=True, on_epoch=True, logger=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':
    train = False

    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=24, fraction=1)
|
384 |
+
# gpt_model = PromptTuningModel(num_prompts=24)
|
385 |
+
if train:
|
386 |
+
gpt_model = LMModel(num_prompts=12)
|
387 |
+
gpt_model = torch.compile(gpt_model)
|
388 |
+
else:
|
389 |
+
gpt_model = torch.load('./model1.bin')
|
390 |
+
|
391 |
+
|
392 |
+
model = LitModelPromptTuning(
|
393 |
+
model=gpt_model,
|
394 |
+
lr=1e-4,
|
395 |
+
temperature=0.9,
|
396 |
+
epoch = 5,
|
397 |
+
|
398 |
+
type_model = 'lm_head'
|
399 |
+
)
|
400 |
+
print('Training')
|
401 |
+
|
402 |
+
logger = WandbLogger(project='Anlp-3')
|
403 |
+
trainer = L.Trainer(
|
404 |
+
accelerator='gpu',
|
405 |
+
# limit_train_batches=1,
|
406 |
+
# strategy='auto',
|
407 |
+
# strategy=pl.strategies.DDPStrategy(find_unused_parameters=True),
|
408 |
+
devices=1,
|
409 |
+
default_root_dir=f'./logs/', # Tensorflow can be used to viz
|
410 |
+
num_nodes=1,
|
411 |
+
num_sanity_val_steps=1, # runs a validation step before stating training
|
412 |
+
precision='bf16-mixed', # we use half precision to reduce memory usage
|
413 |
+
max_epochs=5,
|
414 |
+
check_val_every_n_epoch=1, # run validation every epoch
|
415 |
+
log_every_n_steps=20,
|
416 |
+
logger=logger,
|
417 |
+
# detect_anomaly=True,
|
418 |
+
)
|
419 |
+
|
420 |
+
if train:
|
421 |
+
trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
|
422 |
+
trainer.test(model, dataloaders=dl_test)
|
423 |
+
torch.save(model.model, './model1.bin')
|
424 |
+
else:
|
425 |
+
trainer.test(model, dataloaders=dl_test)
|
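
A note on zero_after_x above: despite its name, it overwrites (rather than zeroes) every element past the first occurrence of x, which is what the test pipeline needs to discard tokens sampled after <|endoftext|>. A minimal self-contained sketch, assuming x is GPT-2's end-of-text id 50256 and using made-up token ids:

import torch

def zero_after_x(arr, x):
    mask = (arr == x).cumsum(dim=1) > 0  # True from the first x onwards
    return torch.where(mask, x, arr)

preds = torch.tensor([[5, 9, 50256, 7, 8],
                      [3, 4, 6, 50256, 2]])
print(zero_after_x(preds, 50256))
# tensor([[    5,     9, 50256, 50256, 50256],
#         [    3,     4,     6, 50256, 50256]])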
main2.py
ADDED
@@ -0,0 +1,511 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils
from torchmetrics.text.rouge import ROUGEScore


def top_p_sampling(logits, p=0.9, temperature=0.5):
    # Apply temperature scaling
    logits = logits / temperature

    # Sort logits and get cumulative probabilities
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    # Create a mask for tokens outside the nucleus
    sorted_indices_to_remove = cumulative_probs > p
    # Shift the mask right so the first token that crosses p is also kept
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Scatter the sorted mask back to the original token order
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
    logits[indices_to_remove] = float('-inf')  # Set unwanted logits to -inf

    # Sample from the remaining logits
    probs = F.softmax(logits, dim=-1)
    sampled_indices = torch.multinomial(probs, num_samples=1)
    sampled_indices = sampled_indices.squeeze(1)

    return sampled_indices


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        # self.model.generation_config.cache_implementation = "static"
        # self.model.generation_config.max_new_tokens = 256
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        # Initialise the learnable prompt from repeated embeddings of 'summarise'
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

        # self.model.transformer.wte.weight[self.start].requires_grad = True

    # @torch.compile
    def forward(self, X, y):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)
        # mask = torch.cat([torch.ones([X.shape[0], self.num_prompts], dtype=torch.long).to(X), torch.where(X != self.pad.to(X.device), 1, 0)], dim=1)
        # labels = torch.where(y == 50257, -100, y)
        # ignore = torch.ones([X.shape[0], self.num_prompts], dtype=torch.long, device=X.device) * -100
        # labels = torch.cat([ignore, labels], dim=1)
        out = self.model(inputs_embeds=embeddings)
        logits = out.logits[:, self.num_prompts:]
        return logits

    def generate_new(self, X):
        batch_size = X.shape[0]
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(batch_size, 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        generated_ids = torch.tensor([], dtype=torch.long, device=X.device).view(batch_size, 0)  # Store all generated tokens

        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits

            logits[:, :, 50257:] = -1e4  # Apply after slicing for correct dimensions

            next_token_ids = top_p_sampling(logits[:, -1, :])  # shape (batch_size,)
            generated_ids = torch.cat([generated_ids, next_token_ids.unsqueeze(-1)], dim=-1)

            embeddings = self.model.transformer.wte(next_token_ids)  # embeddings for the freshly sampled tokens

            cnt += 1

            # Stop once every sequence has produced the <|endoftext|> token
            if torch.all((generated_ids == self.eot.item()).any(dim=-1)):
                break

        return generated_ids

    def generate(self, X):
        # Only bs = 1
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d
        embeddings = torch.cat([self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1), embeddings], dim=1)

        cnt = 0
        past_key_values = None
        final_prediction = torch.tensor([], dtype=torch.long).to(X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


from peft import PeftModel, LoraConfig, get_peft_model


class LoraModel(nn.Module):
    def __init__(self, dim=8):
        super().__init__()
        self.num_prompts = 0
        self.dim = dim

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        # self.model.generation_config.cache_implementation = "static"
        # self.model.generation_config.max_new_tokens = 256
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        lora_config = LoraConfig(
            r=dim,                      # Rank of the low-rank matrices
            lora_alpha=32,              # Scaling factor
            target_modules=["c_attn"],  # GPT-2's fused attention projection
            lora_dropout=0.05,          # Dropout probability for LoRA layers
            bias="none",                # No bias parameters in the adapters
            task_type="CAUSAL_LM",
        )
        self.model = get_peft_model(self.model, lora_config)

    # @torch.compile
    def forward(self, X, y):
        embeddings = self.model.transformer.wte(X)  # b s d
        logits = self.model(inputs_embeds=embeddings).logits
        return logits

    def generate(self, X):
        # Only bs = 1
        embeddings = self.model.transformer.wte(X)  # b s d

        cnt = 0
        past_key_values = None
        final_prediction = torch.tensor([], dtype=torch.long).to(X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


class LMModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer))

        # Re-enable gradients only for the language-modelling head
        self.model.lm_head.requires_grad_(True)

    # @torch.compile
    def forward(self, X, y):
        embeddings = self.model.transformer.wte(X)  # b s d
        logits = self.model(inputs_embeds=embeddings).logits
        return logits

    def generate(self, X):
        # Only bs = 1
        embeddings = self.model.transformer.wte(X)  # b s d

        cnt = 0
        past_key_values = None
        final_prediction = torch.tensor([], dtype=torch.long).to(X.device)
        while cnt < 196:
            out = self.model(inputs_embeds=embeddings, use_cache=True, past_key_values=past_key_values)
            past_key_values = out.past_key_values
            if cnt == 0:
                logits = out.logits[:, self.num_prompts:]
            else:
                logits = out.logits
            logits[:, :, 50257:] = -1e4

            output = top_p_sampling(logits[:, -1, :], temperature=self.temperature)[:, None]
            final_prediction = torch.cat([final_prediction, output], dim=1)
            embeddings = self.model.transformer.wte(output)

            cnt += 1
            if torch.all((final_prediction == self.eot.item()).any(dim=-1)):
                break

        return final_prediction


def zero_after_x(arr, x):
    """
    Overwrites all elements in each row of a 2D tensor after the first
    occurrence of x with x itself, masking everything generated past the
    first end-of-text token.

    Args:
        arr: The input 2D tensor.
        x: The value after whose first occurrence elements are overwritten.

    Returns:
        A new tensor with elements after the first x replaced by x.
    """
    mask = (arr == x).cumsum(dim=1) > 0  # True from the first occurrence of x onwards
    result = torch.where(mask, x, arr)   # overwrite where the mask is True
    return result


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, temperature, epoch, lr=1e-4, **kwargs):
        super().__init__()
        self.model = model
        self.lr = lr
        self.model.temperature = temperature
        self.epoch = epoch
        self.temperature = temperature

        for key, value in kwargs.items():
            setattr(self, key, value)

        tokenize_to_strings = lambda text: self.model.tokenizer.convert_ids_to_tokens(self.model.tokenizer(text)["input_ids"])
        self.rouge = ROUGEScore(tokenizer=tokenize_to_strings)

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)
        logits[:, :, 50257:] = -1e4
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]), target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X, y)
        logits[:, :, 50257:] = -1e4
        loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.shape[-1]), target=y[:, :-1].reshape(-1), ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def on_test_epoch_start(self):
        self.all_text = []
        self.predicted_text = []

    def test_step(self, batch, batch_idx):
        if batch_idx == 0:
            return
        X, y = batch
        out = self.model.generate(X)
        # out = zero_after_x(out, self.model.eot.to(X.device))
        pred = self.model.tokenizer.batch_decode(out, skip_special_tokens=False)
        gt = self.model.tokenizer.batch_decode(y, skip_special_tokens=False)

        print(pred)
        print('GAP')
        print(gt)

        for p, g in zip(pred, gt):
            score = self.rouge(p, g)
            print(score)

        self.log_dict(score, on_step=True, on_epoch=True, logger=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':
    train = False

    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=24, fraction=1)
    # gpt_model = PromptTuningModel(num_prompts=24)
    if train:
        gpt_model = LoraModel(dim=16)
    else:
        gpt_model = torch.load('./model1.bin')

    # gpt_model = torch.compile(gpt_model)
    model = LitModelPromptTuning(
        model=gpt_model,
        lr=1e-3,
        temperature=0.9,
        epoch=5,
        type_model='lora',
        dimension=16,
    )
    print('Training')

    logger = WandbLogger(project='Anlp-3')
    trainer = L.Trainer(
        accelerator='gpu',
        # limit_train_batches=1,
        # strategy=pl.strategies.DDPStrategy(find_unused_parameters=True),
        devices=1,
        default_root_dir='./logs/',  # TensorBoard can be used to visualise the logs
        num_nodes=1,
        num_sanity_val_steps=1,      # runs a validation step before starting training
        precision='bf16-mixed',      # half precision to reduce memory usage
        max_epochs=5,
        check_val_every_n_epoch=1,   # run validation every epoch
        log_every_n_steps=20,
        logger=logger,
        # detect_anomaly=True,
    )

    if train:
        trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
        trainer.test(model, dataloaders=dl_test)
        torch.save(model.model, './model2.bin')
    else:
        trainer.test(model, dataloaders=dl_test)
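
The LoraConfig in main2.py targets only GPT-2's fused attention projection c_attn, so after get_peft_model only the rank-16 adapter matrices should require gradients. A minimal sketch for verifying that, assuming peft and transformers are installed; print_trainable_parameters is peft's own helper:

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("gpt2")
cfg = LoraConfig(r=16, lora_alpha=32, target_modules=["c_attn"],
                 lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
peft_model = get_peft_model(base, cfg)
peft_model.print_trainable_parameters()  # the adapters are a small fraction of GPT-2's ~124M params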
model0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5411f279c6809268001997e4d5ea65eb050828c703d82a21066eb5f2370f01e3
size 512094683
model1.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38409a7f9c3348446f062d5b616f603770976c94e0c789f7baebf28005c87674
size 511946721
model2.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e74693de89775360e55273ca457bfc44c1396070076530fb272bf5a355aa2a2c
size 514335129
newspaper-text-summarization-cnn-dailymail.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f9e0cad39333d1c9f8902be6d846000ef1ccd5e994ad3c42f53336270ab8611
size 527738644
utils.py
ADDED
@@ -0,0 +1,124 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data

from torch.utils.data import DataLoader, Dataset
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPT2TokenizerFast
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re


class CNNDataset(Dataset):
    def __init__(self, df, max_length=1000, max_len=21000, test_ds=False):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.max_length = max_length
        self.test_ds = test_ds
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]
        print(len(self.tokenizer), self.start, "Pad")
        for index in range(max_len):
            x, y = self.df['article'][index], self.df['highlights'][index]
            x, y = re.sub(r'[\t\n\r]', ' ', x), re.sub(r'[\t\n\r]', ' ', y)
            y = self.tokenizer(y, return_tensors="pt", max_length=256, truncation=True).input_ids[0]
            x = self.tokenizer(x, return_tensors="pt", max_length=self.max_length - max(y.shape[0], 256 + 24), truncation=True).input_ids[0]
            self.df.loc[index, 'article'], self.df.loc[index, 'highlights'] = x, y

    def __len__(self):
        return self.max_len

    def __getitem__(self, index):
        x, y = self.df['article'][index], self.df['highlights'][index]

        # Check whether a middle self.eot is needed
        if self.test_ds:
            return torch.cat([self.eot, x, self.start]), torch.cat([y, self.eot])
        x = torch.cat([self.eot, x, self.start, y, self.eot])
        y = torch.cat([y, self.eot])

        y_final = torch.ones(x.shape[0], dtype=torch.long)
        y_final[-y.shape[0] - 1:-1] = y
        y_final[:-y.shape[0] - 1] = self.pad
        return x, y_final


def properly_pad(context):
    lengths = []
    for i in context:
        lengths.append(i.shape[0])
    lengths = torch.tensor(lengths)

    ind = torch.argsort(lengths, descending=True)
    lengths = lengths[ind]

    sorted_tensors = [context[i] for i in ind]

    context = sorted_tensors
    context = pad_sequence(sequences=context, batch_first=True, padding_value=50257)

    return context


def custom_collate(batch):
    context, target = [], []
    for a, b in batch:
        context.append(a)
        target.append(b)

    context, target = properly_pad(context), properly_pad(target)

    return context, target


def import_data(bs=4, fraction=0.1):
    df_train = pd.read_csv('./cnn_dailymail/train.csv')
    df_val = pd.read_csv('./cnn_dailymail/validation.csv')
    df_test = pd.read_csv('./cnn_dailymail/test.csv')

    print('Loaded data')

    df_train, df_val, df_test = CNNDataset(df_train, max_len=int(21000 * fraction)), CNNDataset(df_val, max_len=int(fraction * 6000)), CNNDataset(df_test, max_len=int(fraction * 300), test_ds=True)

    df_train = DataLoader(df_train, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    df_test = DataLoader(df_test, batch_size=1, num_workers=7, collate_fn=custom_collate)
    df_val = DataLoader(df_val, batch_size=bs, num_workers=7, collate_fn=custom_collate)

    return df_train, df_val, df_test


if __name__ == '__main__':
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")

    tokenizer.add_special_tokens({'cls_token': '[START]'})

    eot = tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
    pad = tokenizer("[PAD]", return_tensors="pt").input_ids[0]
    start = tokenizer("[START]", return_tensors="pt").input_ids[0]

    print(tokenizer.decode([1, 2, 50256]))
    print(tokenizer.decode([1, 2, 50257]))
    print(tokenizer('[START]'))
    # dl_train, dl_val, dl_test = import_data()
    # for x, y in dl_train:
    #     print(x.shape, y.shape)
    #     break
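
properly_pad in utils.py sorts a batch by descending length and right-pads every sequence to the longest one with id 50257, the index of the added [PAD] token; the same index is later passed as ignore_index to the loss. A minimal sketch of the padding step with made-up token ids:

import torch
from torch.nn.utils.rnn import pad_sequence

batch = [torch.tensor([11, 22, 33]), torch.tensor([44, 55])]
print(pad_sequence(batch, batch_first=True, padding_value=50257))
# tensor([[   11,    22,    33],
#         [   44,    55, 50257]])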
wandb/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
{"time":"2024-10-29T19:35:07.322742046+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-29T19:35:07.322754356+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241029_193507-ujpie5pz/logs/debug-core.log"}
{"time":"2024-10-29T19:35:07.323074973+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-29T19:35:07.323078902+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241029_193507-ujpie5pz/logs/debug-core.log"}
{"time":"2024-10-29T19:35:07.327151125+04:00","level":"INFO","msg":"created new stream","id":"ujpie5pz"}
{"time":"2024-10-29T19:35:07.327170485+04:00","level":"INFO","msg":"stream: started","id":"ujpie5pz"}
{"time":"2024-10-29T19:35:07.327189255+04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"ujpie5pz"}}
{"time":"2024-10-29T19:35:07.327206894+04:00","level":"INFO","msg":"sender: started","stream_id":{"value":"ujpie5pz"}}
{"time":"2024-10-29T19:35:07.327237104+04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"ujpie5pz"}}
{"time":"2024-10-29T19:35:07.98503491+04:00","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-29T19:35:07.98717852+04:00","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-29T19:35:07.988807925+04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
{"time":"2024-10-29T19:37:46.533074708+04:00","level":"INFO","msg":"stream: closing","id":"ujpie5pz"}
{"time":"2024-10-29T19:37:46.533114998+04:00","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-29T19:37:46.534237927+04:00","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-29T19:37:46.799687608+04:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"}
{"time":"2024-10-29T19:37:46.799694428+04:00","level":"WARN","msg":"No source type found, not creating job artifact"}
{"time":"2024-10-29T19:37:46.799698199+04:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-29T19:37:48.809218785+04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"ujpie5pz"}}
{"time":"2024-10-29T19:37:48.809270345+04:00","level":"INFO","msg":"sender: closed","stream_id":{"value":"ujpie5pz"}}
{"time":"2024-10-29T19:37:48.809270115+04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"ujpie5pz"}}
{"time":"2024-10-29T19:37:48.809871399+04:00","level":"INFO","msg":"stream: closed","id":"ujpie5pz"}
wandb/debug.log
ADDED
@@ -0,0 +1,27 @@
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Configure stats pid to 1599827
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/.config/wandb/settings
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/Kyrylo/nlp/wandb/settings
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'main2.py', 'program_abspath': '/home/siddharth.tourani/Kyrylo/nlp/main2.py', 'program': '/home/siddharth.tourani/Kyrylo/nlp/main2.py'}
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_setup.py:_flush():77] Applying login settings: {}
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_init.py:_log_setup():532] Logging user logs to ./wandb/run-20241029_193507-ujpie5pz/logs/debug.log
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_init.py:_log_setup():533] Logging internal logs to ./wandb/run-20241029_193507-ujpie5pz/logs/debug-internal.log
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_init.py:init():616] calling init triggers
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
config: {}
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_init.py:init():666] starting backend
2024-10-29 19:35:07,318 INFO MainThread:1599827 [wandb_init.py:init():670] setting up manager
2024-10-29 19:35:07,319 INFO MainThread:1599827 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-29 19:35:07,320 INFO MainThread:1599827 [wandb_init.py:init():678] backend started and connected
2024-10-29 19:35:07,322 INFO MainThread:1599827 [wandb_init.py:init():773] updated telemetry
2024-10-29 19:35:07,323 INFO MainThread:1599827 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
2024-10-29 19:35:07,981 INFO MainThread:1599827 [wandb_init.py:init():857] starting run threads in backend
2024-10-29 19:35:08,154 INFO MainThread:1599827 [wandb_run.py:_console_start():2459] atexit reg
2024-10-29 19:35:08,154 INFO MainThread:1599827 [wandb_run.py:_redirect():2307] redirect: wrap_raw
2024-10-29 19:35:08,154 INFO MainThread:1599827 [wandb_run.py:_redirect():2372] Wrapping output streams.
2024-10-29 19:35:08,154 INFO MainThread:1599827 [wandb_run.py:_redirect():2397] Redirects installed.
2024-10-29 19:35:08,156 INFO MainThread:1599827 [wandb_init.py:init():900] run started, returning control to user process
2024-10-29 19:35:08,447 INFO MainThread:1599827 [wandb_run.py:_config_callback():1388] config_cb None None {'temperature': 0.9, 'epoch': 5, 'lr': 0.001, 'type_model': 'lora', 'dimension': 16}
2024-10-29 19:37:46,533 WARNING MsgRouterThr:1599827 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241028_085547-mga40p7t/files/code/main.py
ADDED
@@ -0,0 +1,124 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer), pad_to_multiple_of=128)

        # Initialise the learnable prompt from repeated embeddings of 'summarise'
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

    # @torch.compile
    def forward(self, X):
        self.learnable_prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        embeddings = torch.cat([embeddings, self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1)], dim=1)
        mask = torch.cat([torch.ones([X.shape[0], self.num_prompts], dtype=torch.long).to(X), torch.where(X != self.pad.to(X.device), 1, 0)], dim=1)
        logits = self.model(inputs_embeds=embeddings, attention_mask=mask).logits[:, self.num_prompts:].swapaxes(1, 2)
        return logits


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X)
        loss = F.cross_entropy(logits, target=y, ignore_index=50257)

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X)
        loss = F.cross_entropy(logits, target=y, ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':
    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=5, fraction=0.1)
    gpt_model = PromptTuningModel()
    # gpt_model = torch.compile(gpt_model)
    model = LitModelPromptTuning(model=gpt_model)
    print('Training')

    logger = WandbLogger(project='Anlp-3')
    trainer = L.Trainer(
        accelerator='gpu',
        # strategy='auto',
        # strategy=pl.strategies.DDPStrategy(find_unused_parameters=True),
        devices=[3],
        default_root_dir='./logs/',  # TensorBoard can be used to visualise the logs
        num_nodes=1,
        num_sanity_val_steps=1,      # runs a validation step before starting training
        precision='16-mixed',        # half precision to reduce memory usage
        max_epochs=10,
        check_val_every_n_epoch=1,   # run validation every epoch
        log_every_n_steps=20,
        logger=logger,
        # detect_anomaly=True,
    )

    trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
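
This early snapshot appends the learnable prompt after the token embeddings along the sequence axis. A minimal shape check of that concatenation, using a random stand-in for GPT-2's transformer.wte (the sizes here are illustrative only):

import torch
import torch.nn as nn

num_prompts, d_model, batch, seq = 6, 768, 2, 10
wte = nn.Embedding(50258, d_model)  # stand-in for model.transformer.wte
prompt = nn.Parameter(torch.zeros(num_prompts, d_model))

X = torch.randint(0, 50258, (batch, seq))
emb = torch.cat([wte(X), prompt[None].repeat(batch, 1, 1)], dim=1)
print(emb.shape)  # torch.Size([2, 16, 768])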
wandb/run-20241028_085547-mga40p7t/files/config.yaml
ADDED
@@ -0,0 +1,69 @@
_wandb:
  value:
    cli_version: 0.18.1
    code_path: code/main.py
    m:
    - "1": trainer/global_step
      "6":
      - 3
      "7": []
    - "1": Training loss_step
      "5": 1
      "6":
      - 1
      - 3
      "7": []
    - "1": epoch
      "5": 1
      "6":
      - 1
      - 3
      "7": []
    - "1": Validation loss_step
      "5": 1
      "6":
      - 1
      - 3
      "7": []
    - "1": Validation loss_epoch
      "5": 1
      "6":
      - 1
      - 3
      "7": []
    - "1": Training loss_epoch
      "5": 1
      "6":
      - 1
      - 3
      "7": []
    python_version: 3.10.13
    t:
      "1":
      - 1
      - 11
      - 49
      - 55
      - 71
      - 106
      "2":
      - 1
      - 11
      - 49
      - 55
      - 71
      - 106
      "3":
      - 7
      - 23
      - 55
      - 66
      "4": 3.10.13
      "5": 0.18.1
      "6": 4.44.2
      "8":
      - 5
      "12": 0.18.1
      "13": linux-x86_64
lr:
  value: 0.0001
wandb/run-20241028_085547-mga40p7t/files/output.log
ADDED
@@ -0,0 +1,16 @@
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type              | Params | Mode
----------------------------------------------------
0 | model | PromptTuningModel | 124 M  | train
----------------------------------------------------
4.6 K     Trainable params
124 M     Non-trainable params
124 M     Total params
497.922   Total estimated model params size (MB)
1         Modules in train mode
164       Modules in eval mode
Epoch 1:   3%|██▌       | 11/420 [00:02<01:47, 3.79it/s, v_num=0p7t]


Detected KeyboardInterrupt, attempting graceful shutdown ...
wandb/run-20241028_085547-mga40p7t/files/requirements.txt
ADDED
@@ -0,0 +1,178 @@
jiter==0.5.0
anyio==4.6.0
interegular==0.3.3
jaxlib==0.4.34
jsonschema==4.23.0
typing_extensions==4.12.2
httpcore==1.0.5
prometheus_client==0.21.0
openai==1.51.0
multidict==6.1.0
six==1.16.0
nvidia-nccl-cu12==2.20.5
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
watchfiles==0.24.0
tqdm==4.66.5
yarl==1.11.1
cffi==1.17.1
vllm==0.6.1.post2
bleach==6.1.0
kaggle==1.6.17
pydantic_core==2.23.4
lightning-utilities==0.11.7
sentry-sdk==2.14.0
torch==2.4.0
aiohappyeyeballs==2.4.0
diffusers==0.15.0
GitPython==3.1.43
attrs==24.2.0
importlib_metadata==8.5.0
transformers==4.44.2
pillow==10.4.0
sounddevice==0.5.1
gguf==0.9.1
python-dotenv==1.0.1
async-timeout==4.0.3
dspy-ai==2.5.3
numpy==1.26.4
nvidia-nvjitlink-cu12==12.6.68
uvicorn==0.30.6
kiwisolver==1.4.7
partial-json-parser==0.2.1.1.post4
pyparsing==3.1.4
lightning==2.4.0
structlog==24.4.0
nvidia-curand-cu12==10.3.2.106
setuptools==65.5.0
webencodings==0.5.1
nvidia-nvtx-cu12==12.1.105
sniffio==1.3.1
MarkupSafe==2.1.5
vllm-flash-attn==2.6.1
urllib3==2.2.3
requests==2.32.3
pycountry==24.6.1
ujson==5.10.0
matplotlib==3.9.2
pydantic==2.9.2
torchvision==0.19.0
numba==0.60.0
optuna==4.0.0
opt_einsum==3.4.0
joblib==1.4.2
msgpack==1.1.0
smmap==5.0.1
filelock==3.16.1
opencv-contrib-python==4.10.0.84
faiss-gpu==1.7.2
prometheus-fastapi-instrumentator==7.0.0
rpds-py==0.20.0
psutil==6.0.0
colorlog==6.8.2
nvidia-cufft-cu12==11.0.2.54
SQLAlchemy==2.0.35
llvmlite==0.43.0
packaging==24.1
exceptiongroup==1.2.2
dill==0.3.8
ml_dtypes==0.5.0
pyairports==2.1.1
scikit-learn==1.5.2
prettytable==3.11.0
protobuf==4.25.5
charset-normalizer==3.3.2
torchmetrics==1.4.2
text-unidecode==1.3
httpx==0.27.2
sympy==1.13.3
msgspec==0.18.6
wandb==0.18.1
backoff==2.2.1
sentencepiece==0.2.0
aiohttp==3.10.5
distro==1.9.0
lark==1.2.2
pyarrow==17.0.0
Mako==1.3.5
regex==2024.9.11
safetensors==0.4.5
aiosignal==1.3.1
jsonschema-specifications==2023.12.1
cloudpickle==3.0.0
einops==0.8.0
ray==2.36.1
fire==0.7.0
pyzmq==26.2.0
pycparser==2.22
platformdirs==4.3.6
click==8.1.7
fastapi==0.115.0
ftfy==6.3.0
torchtext==0.18.0
lm-format-enforcer==0.10.6
fsspec==2024.6.1
tzdata==2024.2
starlette==0.38.6
cycler==0.12.1
py-cpuinfo==9.0.0
h11==0.14.0
huggingface-hub==0.25.1
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
certifi==2024.8.30
httptools==0.6.1
jax==0.4.34
PyYAML==6.0.2
xxhash==3.5.0
idna==3.10
xformers==0.0.27.post2
mistral_common==1.4.3
fonttools==4.54.0
pip==23.0.1
accelerate==0.34.2
mediapipe==0.10.15
pytorch-lightning==2.4.0
ollama==0.3.3
Jinja2==3.1.4
multiprocess==0.70.16
opencv-python==4.10.0.84
termcolor==2.5.0
python-dateutil==2.9.0.post0
contourpy==1.3.0
websockets==13.1
frozenlist==1.4.1
pandas==2.2.3
networkx==3.3
diskcache==5.6.3
nvidia-cusolver-cu12==11.4.5.107
flatbuffers==24.3.25
mpmath==1.3.0
setproctitle==1.3.3
tokenizers==0.19.1
scipy==1.14.1
outlines==0.0.46
annotated-types==0.7.0
docker-pycreds==0.4.0
magicattr==0.1.6
wcwidth==0.2.13
pytorch-metric-learning==2.6.0
datasets==3.0.0
gitdb==4.0.11
lora-diffusion==0.1.7
referencing==0.35.1
python-slugify==8.0.4
zipp==3.20.2
triton==3.0.0
absl-py==2.1.0
threadpoolctl==3.5.0
uvloop==0.20.0
tiktoken==0.7.0
pytz==2024.2
nest-asyncio==1.6.0
nvidia-cublas-cu12==12.1.3.1
litellm==1.48.12
nvidia-cuda-nvrtc-cu12==12.1.105
greenlet==3.1.1
alembic==1.13.3
wandb/run-20241028_085547-mga40p7t/files/wandb-metadata.json
ADDED
@@ -0,0 +1,60 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.15.161-ql-generic-13.0-14-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.13",
|
4 |
+
"startedAt": "2024-10-28T04:55:47.033649Z",
|
5 |
+
"program": "/home/siddharth.tourani/Kyrylo/nlp/main.py",
|
6 |
+
"codePath": "main.py",
|
7 |
+
"email": "kyrylo.shyvam@students.iiit.ac.in",
|
8 |
+
"root": ".",
|
9 |
+
"host": "gpu-08",
|
10 |
+
"username": "siddharth.tourani",
|
11 |
+
"executable": "/home/siddharth.tourani/Minimal/bin/python3",
|
12 |
+
"codePathLocal": "main.py",
|
13 |
+
"cpu_count": 128,
|
14 |
+
"cpu_count_logical": 256,
|
15 |
+
"gpu": "[NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB]",
|
16 |
+
"gpu_count": 4,
|
17 |
+
"disk": {
|
18 |
+
"/": {
|
19 |
+
"total": "1073741824",
|
20 |
+
"used": "21049344"
|
21 |
+
}
|
22 |
+
},
|
23 |
+
"memory": {
|
24 |
+
"total": "270256893952"
|
25 |
+
},
|
26 |
+
"cpu": {
|
27 |
+
"count": 128,
|
28 |
+
"countLogical": 256
|
29 |
+
},
|
30 |
+
"gpu_nvidia": [
|
31 |
+
{
|
32 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
33 |
+
"memoryTotal": "42949672960",
|
34 |
+
"cudaCores": 6912,
|
35 |
+
"architecture": "Ampere"
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
39 |
+
"memoryTotal": "42949672960",
|
40 |
+
"cudaCores": 6912,
|
41 |
+
"architecture": "Ampere"
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
45 |
+
"memoryTotal": "42949672960",
|
46 |
+
"cudaCores": 6912,
|
47 |
+
"architecture": "Ampere"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"name": "NVIDIA A100-SXM4-40GB",
|
51 |
+
"memoryTotal": "42949672960",
|
52 |
+
"cudaCores": 6912,
|
53 |
+
"architecture": "Ampere"
|
54 |
+
}
|
55 |
+
],
|
56 |
+
"slurm": {
|
57 |
+
"job_id": "68153"
|
58 |
+
},
|
59 |
+
"cudaVersion": "12.4"
|
60 |
+
}
|
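The metadata above records the run host: four 40 GB A100s, 128 physical cores, CUDA 12.4, under Slurm job 68153. As an illustration only (this is not code from the repository, and it assumes a CUDA-enabled PyTorch build), the same GPU fields can be read from Python like so:

import torch

# Prints one line per visible GPU, mirroring the "gpu_nvidia" entries above.
# On a CPU-only host the loop body never runs.
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(props.name, props.total_memory)  # e.g. NVIDIA A100-SXM4-40GB 42949672960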
wandb/run-20241028_085547-mga40p7t/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"epoch":0,"Training loss_epoch":107.80526733398438,"_step":142,"_runtime":102.656591521,"Validation loss_epoch":107.9881820678711,"Training loss_step":108.02771759033203,"_timestamp":1.730091449689934e+09,"trainer/global_step":419,"Validation loss_step":108.47615051269531,"_wandb":{"runtime":105}}
wandb/run-20241028_085547-mga40p7t/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
{"time":"2024-10-28T08:55:46.356024757+04:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmptnuwuzf_/port-1239188.txt","pid":1239188,"debug":false,"disable-analytics":false}
{"time":"2024-10-28T08:55:46.356050947+04:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
{"time":"2024-10-28T08:55:46.35809675+04:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1239188}
{"time":"2024-10-28T08:55:46.35809731+04:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36381,"Zone":""}}
{"time":"2024-10-28T08:55:46.482841457+04:00","level":"INFO","msg":"created new connection","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:55:47.034493884+04:00","level":"INFO","msg":"connection init received","streamId":"mga40p7t","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:55:47.035343787+04:00","level":"ERROR","msg":"error creating symlink","error":"symlink /home/siddharth.tourani/.cache/wandb/logs/core-debug-20241028_085546.log wandb/run-20241028_085547-mga40p7t/logs/debug-core.log: file exists"}
{"time":"2024-10-28T08:55:47.044292544+04:00","level":"INFO","msg":"connection init completed","streamId":"mga40p7t","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:57:32.990052624+04:00","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:57:32.990356191+04:00","level":"INFO","msg":"server is shutting down"}
{"time":"2024-10-28T08:57:32.990407101+04:00","level":"INFO","msg":"closed connection","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:57:33.264727032+04:00","level":"ERROR","msg":"error flushing writer","err":"write tcp 127.0.0.1:36381->127.0.0.1:59774: use of closed network connection","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:57:34.392709437+04:00","level":"INFO","msg":"connection closed","id":"127.0.0.1:59774"}
{"time":"2024-10-28T08:57:34.392722266+04:00","level":"INFO","msg":"server is closed"}
wandb/run-20241028_085547-mga40p7t/logs/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
{"time":"2024-10-28T08:55:47.035188748+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-28T08:55:47.035200088+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241028_085547-mga40p7t/logs/debug-core.log"}
{"time":"2024-10-28T08:55:47.035571516+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-28T08:55:47.035578935+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241028_085547-mga40p7t/logs/debug-core.log"}
{"time":"2024-10-28T08:55:47.044272764+04:00","level":"INFO","msg":"created new stream","id":"mga40p7t"}
{"time":"2024-10-28T08:55:47.044289244+04:00","level":"INFO","msg":"stream: started","id":"mga40p7t"}
{"time":"2024-10-28T08:55:47.044306184+04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"mga40p7t"}}
{"time":"2024-10-28T08:55:47.044329513+04:00","level":"INFO","msg":"sender: started","stream_id":{"value":"mga40p7t"}}
{"time":"2024-10-28T08:55:47.044370663+04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"mga40p7t"}}
{"time":"2024-10-28T08:55:47.744002406+04:00","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-28T08:55:47.747442758+04:00","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-28T08:55:47.748927455+04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
{"time":"2024-10-28T08:57:32.990331552+04:00","level":"INFO","msg":"stream: closing","id":"mga40p7t"}
{"time":"2024-10-28T08:57:32.990407231+04:00","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-28T08:57:32.992264416+04:00","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-28T08:57:33.519506024+04:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"}
{"time":"2024-10-28T08:57:33.519518634+04:00","level":"WARN","msg":"No source type found, not creating job artifact"}
{"time":"2024-10-28T08:57:33.519521734+04:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-28T08:57:34.391853104+04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"mga40p7t"}}
{"time":"2024-10-28T08:57:34.391906283+04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"mga40p7t"}}
{"time":"2024-10-28T08:57:34.391911083+04:00","level":"INFO","msg":"sender: closed","stream_id":{"value":"mga40p7t"}}
{"time":"2024-10-28T08:57:34.392637247+04:00","level":"INFO","msg":"stream: closed","id":"mga40p7t"}
wandb/run-20241028_085547-mga40p7t/logs/debug.log
ADDED
@@ -0,0 +1,27 @@
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Configure stats pid to 1239188
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/.config/wandb/settings
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/Kyrylo/nlp/wandb/settings
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'main.py', 'program_abspath': '/home/siddharth.tourani/Kyrylo/nlp/main.py', 'program': '/home/siddharth.tourani/Kyrylo/nlp/main.py'}
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_setup.py:_flush():77] Applying login settings: {}
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_init.py:_log_setup():532] Logging user logs to ./wandb/run-20241028_085547-mga40p7t/logs/debug.log
2024-10-28 08:55:47,029 INFO MainThread:1239188 [wandb_init.py:_log_setup():533] Logging internal logs to ./wandb/run-20241028_085547-mga40p7t/logs/debug-internal.log
2024-10-28 08:55:47,030 INFO MainThread:1239188 [wandb_init.py:init():616] calling init triggers
2024-10-28 08:55:47,030 INFO MainThread:1239188 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
config: {}
2024-10-28 08:55:47,030 INFO MainThread:1239188 [wandb_init.py:init():666] starting backend
2024-10-28 08:55:47,030 INFO MainThread:1239188 [wandb_init.py:init():670] setting up manager
2024-10-28 08:55:47,030 INFO MainThread:1239188 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-28 08:55:47,033 INFO MainThread:1239188 [wandb_init.py:init():678] backend started and connected
2024-10-28 08:55:47,036 INFO MainThread:1239188 [wandb_init.py:init():773] updated telemetry
2024-10-28 08:55:47,036 INFO MainThread:1239188 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
2024-10-28 08:55:47,740 INFO MainThread:1239188 [wandb_init.py:init():857] starting run threads in backend
2024-10-28 08:55:47,953 INFO MainThread:1239188 [wandb_run.py:_console_start():2459] atexit reg
2024-10-28 08:55:47,953 INFO MainThread:1239188 [wandb_run.py:_redirect():2307] redirect: wrap_raw
2024-10-28 08:55:47,953 INFO MainThread:1239188 [wandb_run.py:_redirect():2372] Wrapping output streams.
2024-10-28 08:55:47,953 INFO MainThread:1239188 [wandb_run.py:_redirect():2397] Redirects installed.
2024-10-28 08:55:47,957 INFO MainThread:1239188 [wandb_init.py:init():900] run started, returning control to user process
2024-10-28 08:55:48,061 INFO MainThread:1239188 [wandb_run.py:_config_callback():1388] config_cb None None {'lr': 0.0001}
2024-10-28 08:57:32,990 WARNING MsgRouterThr:1239188 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241028_085547-mga40p7t/run-mga40p7t.wandb
ADDED
Binary file (500 kB)
wandb/run-20241028_085806-owcrwbil/files/code/main.py
ADDED
@@ -0,0 +1,124 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)  # freeze the backbone; only the soft prompt trains
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer), pad_to_multiple_of=128)

        # Initialise the soft prompt from the embedding of 'summarise',
        # tiled until it covers num_prompts positions.
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

    # @torch.compile
    def forward(self, X):
        # Use a local alias instead of reassigning the Parameter: assigning the
        # plain tensor returned by .to() back to a registered Parameter attribute
        # raises a TypeError whenever the move actually makes a copy.
        prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        embeddings = torch.cat([embeddings, prompt[None, :, :].repeat(X.shape[0], 1, 1)], dim=1)
        mask = torch.cat([torch.ones([X.shape[0], self.num_prompts], dtype=torch.long).to(X), torch.where(X != self.pad.to(X.device), 1, 0)], dim=1)
        logits = self.model(inputs_embeds=embeddings, attention_mask=mask).logits[:, self.num_prompts:].swapaxes(1, 2)
        return logits


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X)
        loss = F.cross_entropy(logits, target=y, ignore_index=50257)  # 50257 = id of the added [PAD] token

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch

        logits = self.model(X)
        loss = F.cross_entropy(logits, target=y, ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':
    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=20, fraction=0.1)
    gpt_model = PromptTuningModel()
    # gpt_model = torch.compile(gpt_model)
    model = LitModelPromptTuning(model=gpt_model)
    print('Training')

    logger = WandbLogger(project='Anlp-3')
    trainer = L.Trainer(
        accelerator='gpu',
        # strategy='auto',
        # strategy=pl.strategies.DDPStrategy(find_unused_parameters=True),
        devices=[3],
        default_root_dir='./logs/',  # TensorBoard can be used to visualize these logs
        num_nodes=1,
        num_sanity_val_steps=1,  # runs a validation step before starting training
        precision='16-mixed',  # we use half precision to reduce memory usage
        max_epochs=10,
        check_val_every_n_epoch=1,  # run validation every epoch
        log_every_n_steps=20,
        logger=logger,
        # detect_anomaly=True,
    )

    trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
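This main.py freezes GPT-2 and trains only a 6-vector soft prompt: the prompt embeddings are concatenated onto the token embeddings, the attention mask is extended to cover them, and num_prompts positions are cropped from the logits before the loss. A self-contained toy sketch of those mechanics, with made-up sizes so it runs without downloading GPT-2 (all names and dimensions here are illustrative, not from the repository):

import torch
import torch.nn as nn

vocab, d_model, num_prompts, pad_id = 100, 16, 6, 99
wte = nn.Embedding(vocab, d_model)                        # stands in for model.transformer.wte
prompt = nn.Parameter(torch.zeros(num_prompts, d_model))  # the only trainable tensor

X = torch.tensor([[5, 7, pad_id], [3, pad_id, pad_id]])   # (batch, seq) with padding
emb = torch.cat([wte(X), prompt[None].repeat(X.shape[0], 1, 1)], dim=1)
mask = torch.cat([torch.ones(X.shape[0], num_prompts, dtype=torch.long),
                  (X != pad_id).long()], dim=1)
print(emb.shape, mask.shape)  # torch.Size([2, 9, 16]) torch.Size([2, 9])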
wandb/run-20241028_085806-owcrwbil/files/config.yaml
ADDED
@@ -0,0 +1,69 @@
_wandb:
  value:
    cli_version: 0.18.1
    code_path: code/main.py
    m:
      - "1": epoch
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": trainer/global_step
        "6":
          - 3
        "7": []
      - "1": Training loss_step
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": Validation loss_step
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": Validation loss_epoch
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": Training loss_epoch
        "5": 2
        "6":
          - 1
          - 3
        "7": []
    python_version: 3.10.13
    t:
      "1":
        - 1
        - 11
        - 49
        - 55
        - 71
        - 106
      "2":
        - 1
        - 11
        - 49
        - 55
        - 71
        - 106
      "3":
        - 7
        - 23
        - 55
        - 66
      "4": 3.10.13
      "5": 0.18.1
      "6": 4.44.2
      "8":
        - 5
      "12": 0.18.1
      "13": linux-x86_64
lr:
  value: 0.0001
wandb/run-20241028_085806-owcrwbil/files/output.log
ADDED
@@ -0,0 +1,16 @@
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type              | Params | Mode
----------------------------------------------------
0 | model | PromptTuningModel | 124 M  | train
----------------------------------------------------
4.6 K     Trainable params
124 M     Non-trainable params
124 M     Total params
497.922   Total estimated model params size (MB)
1         Modules in train mode
164       Modules in eval mode
Epoch 1:  56%|███████████▋        | 59/105 [00:24<00:19, 2.36it/s, v_num=wbil]

Detected KeyboardInterrupt, attempting graceful shutdown ...
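The summary above shows only 4.6 K of the 124 M parameters training, which matches the soft prompt exactly: 6 positions x 768 dimensions = 4,608 values. A short sketch of how such a split can be computed for any module (a generic helper written for illustration, not code from the repository):

import torch.nn as nn

def count_params(model: nn.Module):
    # Split parameter counts the way the Lightning model summary does.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return trainable, frozen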
wandb/run-20241028_085806-owcrwbil/files/requirements.txt
ADDED
@@ -0,0 +1,178 @@
jiter==0.5.0
anyio==4.6.0
interegular==0.3.3
jaxlib==0.4.34
jsonschema==4.23.0
typing_extensions==4.12.2
httpcore==1.0.5
prometheus_client==0.21.0
openai==1.51.0
multidict==6.1.0
six==1.16.0
nvidia-nccl-cu12==2.20.5
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
watchfiles==0.24.0
tqdm==4.66.5
yarl==1.11.1
cffi==1.17.1
vllm==0.6.1.post2
bleach==6.1.0
kaggle==1.6.17
pydantic_core==2.23.4
lightning-utilities==0.11.7
sentry-sdk==2.14.0
torch==2.4.0
aiohappyeyeballs==2.4.0
diffusers==0.15.0
GitPython==3.1.43
attrs==24.2.0
importlib_metadata==8.5.0
transformers==4.44.2
pillow==10.4.0
sounddevice==0.5.1
gguf==0.9.1
python-dotenv==1.0.1
async-timeout==4.0.3
dspy-ai==2.5.3
numpy==1.26.4
nvidia-nvjitlink-cu12==12.6.68
uvicorn==0.30.6
kiwisolver==1.4.7
partial-json-parser==0.2.1.1.post4
pyparsing==3.1.4
lightning==2.4.0
structlog==24.4.0
nvidia-curand-cu12==10.3.2.106
setuptools==65.5.0
webencodings==0.5.1
nvidia-nvtx-cu12==12.1.105
sniffio==1.3.1
MarkupSafe==2.1.5
vllm-flash-attn==2.6.1
urllib3==2.2.3
requests==2.32.3
pycountry==24.6.1
ujson==5.10.0
matplotlib==3.9.2
pydantic==2.9.2
torchvision==0.19.0
numba==0.60.0
optuna==4.0.0
opt_einsum==3.4.0
joblib==1.4.2
msgpack==1.1.0
smmap==5.0.1
filelock==3.16.1
opencv-contrib-python==4.10.0.84
faiss-gpu==1.7.2
prometheus-fastapi-instrumentator==7.0.0
rpds-py==0.20.0
psutil==6.0.0
colorlog==6.8.2
nvidia-cufft-cu12==11.0.2.54
SQLAlchemy==2.0.35
llvmlite==0.43.0
packaging==24.1
exceptiongroup==1.2.2
dill==0.3.8
ml_dtypes==0.5.0
pyairports==2.1.1
scikit-learn==1.5.2
prettytable==3.11.0
protobuf==4.25.5
charset-normalizer==3.3.2
torchmetrics==1.4.2
text-unidecode==1.3
httpx==0.27.2
sympy==1.13.3
msgspec==0.18.6
wandb==0.18.1
backoff==2.2.1
sentencepiece==0.2.0
aiohttp==3.10.5
distro==1.9.0
lark==1.2.2
pyarrow==17.0.0
Mako==1.3.5
regex==2024.9.11
safetensors==0.4.5
aiosignal==1.3.1
jsonschema-specifications==2023.12.1
cloudpickle==3.0.0
einops==0.8.0
ray==2.36.1
fire==0.7.0
pyzmq==26.2.0
pycparser==2.22
platformdirs==4.3.6
click==8.1.7
fastapi==0.115.0
ftfy==6.3.0
torchtext==0.18.0
lm-format-enforcer==0.10.6
fsspec==2024.6.1
tzdata==2024.2
starlette==0.38.6
cycler==0.12.1
py-cpuinfo==9.0.0
h11==0.14.0
huggingface-hub==0.25.1
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
certifi==2024.8.30
httptools==0.6.1
jax==0.4.34
PyYAML==6.0.2
xxhash==3.5.0
idna==3.10
xformers==0.0.27.post2
mistral_common==1.4.3
fonttools==4.54.0
pip==23.0.1
accelerate==0.34.2
mediapipe==0.10.15
pytorch-lightning==2.4.0
ollama==0.3.3
Jinja2==3.1.4
multiprocess==0.70.16
opencv-python==4.10.0.84
termcolor==2.5.0
python-dateutil==2.9.0.post0
contourpy==1.3.0
websockets==13.1
frozenlist==1.4.1
pandas==2.2.3
networkx==3.3
diskcache==5.6.3
nvidia-cusolver-cu12==11.4.5.107
flatbuffers==24.3.25
mpmath==1.3.0
setproctitle==1.3.3
tokenizers==0.19.1
scipy==1.14.1
outlines==0.0.46
annotated-types==0.7.0
docker-pycreds==0.4.0
magicattr==0.1.6
wcwidth==0.2.13
pytorch-metric-learning==2.6.0
datasets==3.0.0
gitdb==4.0.11
lora-diffusion==0.1.7
referencing==0.35.1
python-slugify==8.0.4
zipp==3.20.2
triton==3.0.0
absl-py==2.1.0
threadpoolctl==3.5.0
uvloop==0.20.0
tiktoken==0.7.0
pytz==2024.2
nest-asyncio==1.6.0
nvidia-cublas-cu12==12.1.3.1
litellm==1.48.12
nvidia-cuda-nvrtc-cu12==12.1.105
greenlet==3.1.1
alembic==1.13.3
wandb/run-20241028_085806-owcrwbil/files/wandb-metadata.json
ADDED
@@ -0,0 +1,60 @@
{
  "os": "Linux-5.15.161-ql-generic-13.0-14-x86_64-with-glibc2.35",
  "python": "3.10.13",
  "startedAt": "2024-10-28T04:58:06.613389Z",
  "program": "/home/siddharth.tourani/Kyrylo/nlp/main.py",
  "codePath": "main.py",
  "email": "kyrylo.shyvam@students.iiit.ac.in",
  "root": ".",
  "host": "gpu-08",
  "username": "siddharth.tourani",
  "executable": "/home/siddharth.tourani/Minimal/bin/python3",
  "codePathLocal": "main.py",
  "cpu_count": 128,
  "cpu_count_logical": 256,
  "gpu": "[NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB]",
  "gpu_count": 4,
  "disk": {
    "/": {
      "total": "1073741824",
      "used": "21049344"
    }
  },
  "memory": {
    "total": "270256893952"
  },
  "cpu": {
    "count": 128,
    "countLogical": 256
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    }
  ],
  "slurm": {
    "job_id": "68153"
  },
  "cudaVersion": "12.4"
}
wandb/run-20241028_085806-owcrwbil/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"_runtime":77.016630791,"Validation loss_step":107.9970703125,"Validation loss_epoch":108.69440460205078,"_timestamp":1.7300915636298473e+09,"_wandb":{"runtime":79},"epoch":1,"Training loss_step":107.52567291259766,"trainer/global_step":159,"_step":39,"Training loss_epoch":108.31751251220703}
wandb/run-20241028_085806-owcrwbil/logs/debug-core.log
ADDED
@@ -0,0 +1,13 @@
{"time":"2024-10-28T08:58:05.852186282+04:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpbt47w5hb/port-1241278.txt","pid":1241278,"debug":false,"disable-analytics":false}
{"time":"2024-10-28T08:58:05.852211372+04:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
{"time":"2024-10-28T08:58:05.853103688+04:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1241278}
{"time":"2024-10-28T08:58:05.853096439+04:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39955,"Zone":""}}
{"time":"2024-10-28T08:58:06.04806319+04:00","level":"INFO","msg":"created new connection","id":"127.0.0.1:41868"}
{"time":"2024-10-28T08:58:06.614533728+04:00","level":"INFO","msg":"connection init received","streamId":"owcrwbil","id":"127.0.0.1:41868"}
{"time":"2024-10-28T08:58:06.615450814+04:00","level":"ERROR","msg":"error creating symlink","error":"symlink /home/siddharth.tourani/.cache/wandb/logs/core-debug-20241028_085805.log wandb/run-20241028_085806-owcrwbil/logs/debug-core.log: file exists"}
{"time":"2024-10-28T08:58:06.620019897+04:00","level":"INFO","msg":"connection init completed","streamId":"owcrwbil","id":"127.0.0.1:41868"}
{"time":"2024-10-28T08:59:25.917611765+04:00","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:41868"}
{"time":"2024-10-28T08:59:25.917909894+04:00","level":"INFO","msg":"server is shutting down"}
{"time":"2024-10-28T08:59:25.917970554+04:00","level":"INFO","msg":"closed connection","id":"127.0.0.1:41868"}
{"time":"2024-10-28T08:59:32.101339343+04:00","level":"INFO","msg":"connection closed","id":"127.0.0.1:41868"}
{"time":"2024-10-28T08:59:32.101350683+04:00","level":"INFO","msg":"server is closed"}
wandb/run-20241028_085806-owcrwbil/logs/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
{"time":"2024-10-28T08:58:06.615297525+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-28T08:58:06.615309735+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241028_085806-owcrwbil/logs/debug-core.log"}
{"time":"2024-10-28T08:58:06.615703263+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-28T08:58:06.615710223+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241028_085806-owcrwbil/logs/debug-core.log"}
{"time":"2024-10-28T08:58:06.619998427+04:00","level":"INFO","msg":"created new stream","id":"owcrwbil"}
{"time":"2024-10-28T08:58:06.620015787+04:00","level":"INFO","msg":"stream: started","id":"owcrwbil"}
{"time":"2024-10-28T08:58:06.620034997+04:00","level":"INFO","msg":"sender: started","stream_id":{"value":"owcrwbil"}}
{"time":"2024-10-28T08:58:06.620043877+04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"owcrwbil"}}
{"time":"2024-10-28T08:58:06.620038217+04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"owcrwbil"}}
{"time":"2024-10-28T08:58:07.277024244+04:00","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-28T08:58:07.279017506+04:00","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-28T08:58:07.28068419+04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
{"time":"2024-10-28T08:59:25.917904554+04:00","level":"INFO","msg":"stream: closing","id":"owcrwbil"}
{"time":"2024-10-28T08:59:25.917952654+04:00","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-28T08:59:25.919306298+04:00","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-28T08:59:26.173567721+04:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"}
{"time":"2024-10-28T08:59:26.173578511+04:00","level":"WARN","msg":"No source type found, not creating job artifact"}
{"time":"2024-10-28T08:59:26.173581711+04:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-28T08:59:32.100682276+04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"owcrwbil"}}
{"time":"2024-10-28T08:59:32.100731535+04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"owcrwbil"}}
{"time":"2024-10-28T08:59:32.100765025+04:00","level":"INFO","msg":"sender: closed","stream_id":{"value":"owcrwbil"}}
{"time":"2024-10-28T08:59:32.101266313+04:00","level":"INFO","msg":"stream: closed","id":"owcrwbil"}
wandb/run-20241028_085806-owcrwbil/logs/debug.log
ADDED
@@ -0,0 +1,27 @@
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Configure stats pid to 1241278
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/.config/wandb/settings
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/Kyrylo/nlp/wandb/settings
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'main.py', 'program_abspath': '/home/siddharth.tourani/Kyrylo/nlp/main.py', 'program': '/home/siddharth.tourani/Kyrylo/nlp/main.py'}
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_setup.py:_flush():77] Applying login settings: {}
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_init.py:_log_setup():532] Logging user logs to ./wandb/run-20241028_085806-owcrwbil/logs/debug.log
2024-10-28 08:58:06,610 INFO MainThread:1241278 [wandb_init.py:_log_setup():533] Logging internal logs to ./wandb/run-20241028_085806-owcrwbil/logs/debug-internal.log
2024-10-28 08:58:06,611 INFO MainThread:1241278 [wandb_init.py:init():616] calling init triggers
2024-10-28 08:58:06,611 INFO MainThread:1241278 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
config: {}
2024-10-28 08:58:06,611 INFO MainThread:1241278 [wandb_init.py:init():666] starting backend
2024-10-28 08:58:06,611 INFO MainThread:1241278 [wandb_init.py:init():670] setting up manager
2024-10-28 08:58:06,611 INFO MainThread:1241278 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-28 08:58:06,613 INFO MainThread:1241278 [wandb_init.py:init():678] backend started and connected
2024-10-28 08:58:06,615 INFO MainThread:1241278 [wandb_init.py:init():773] updated telemetry
2024-10-28 08:58:06,616 INFO MainThread:1241278 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
2024-10-28 08:58:07,272 INFO MainThread:1241278 [wandb_init.py:init():857] starting run threads in backend
2024-10-28 08:58:07,450 INFO MainThread:1241278 [wandb_run.py:_console_start():2459] atexit reg
2024-10-28 08:58:07,450 INFO MainThread:1241278 [wandb_run.py:_redirect():2307] redirect: wrap_raw
2024-10-28 08:58:07,450 INFO MainThread:1241278 [wandb_run.py:_redirect():2372] Wrapping output streams.
2024-10-28 08:58:07,450 INFO MainThread:1241278 [wandb_run.py:_redirect():2397] Redirects installed.
2024-10-28 08:58:07,452 INFO MainThread:1241278 [wandb_init.py:init():900] run started, returning control to user process
2024-10-28 08:58:07,549 INFO MainThread:1241278 [wandb_run.py:_config_callback():1388] config_cb None None {'lr': 0.0001}
2024-10-28 08:59:25,918 WARNING MsgRouterThr:1241278 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241028_085806-owcrwbil/run-owcrwbil.wandb
ADDED
Binary file (211 kB)
wandb/run-20241028_090044-f9fzz8iy/files/code/main.py
ADDED
@@ -0,0 +1,124 @@
from math import inf
# from utils import *
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import wandb
import torch.nn.functional as F

import einops
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast

np.random.seed(123)
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

import lightning as L
import utils


class PromptTuningModel(nn.Module):
    def __init__(self, num_prompts=6):
        super().__init__()
        self.num_prompts = num_prompts

        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
        self.model.requires_grad_(False)  # freeze the backbone; only the soft prompt trains
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]

        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer), pad_to_multiple_of=128)

        # Initialise the soft prompt from the embedding of 'summarise',
        # tiled until it covers num_prompts positions.
        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
        token_embedding = self.model.transformer.wte(tmp[0])
        self.token_embedding = token_embedding
        for _ in range(num_prompts // 3 - 1):
            self.token_embedding = torch.cat([self.token_embedding, token_embedding])

        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
        self.learnable_prompt = nn.Parameter(data, requires_grad=True)

    # @torch.compile
    def forward(self, X):
        # Use a local alias instead of reassigning the Parameter: assigning the
        # plain tensor returned by .to() back to a registered Parameter attribute
        # raises a TypeError whenever the move actually makes a copy.
        prompt = self.learnable_prompt.to(X.device)
        embeddings = self.model.transformer.wte(X)  # b s d

        embeddings = torch.cat([embeddings, prompt[None, :, :].repeat(X.shape[0], 1, 1)], dim=1)
        mask = torch.cat([torch.ones([X.shape[0], self.num_prompts], dtype=torch.long).to(X), torch.where(X != self.pad.to(X.device), 1, 0)], dim=1)
        logits = self.model(inputs_embeds=embeddings, attention_mask=mask).logits[:, self.num_prompts:].swapaxes(1, 2)
        return logits


class LitModelPromptTuning(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.model(X)
        loss = F.cross_entropy(logits, target=y, ignore_index=50257)  # 50257 = id of the added [PAD] token

        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch

        logits = self.model(X)
        loss = F.cross_entropy(logits, target=y, ignore_index=50257)

        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


from lightning.pytorch.loggers import WandbLogger

if __name__ == '__main__':
    torch.set_float32_matmul_precision('medium')
    dl_train, dl_val, dl_test = utils.import_data(bs=25, fraction=0.1)
    gpt_model = PromptTuningModel()
    gpt_model = torch.compile(gpt_model)
    model = LitModelPromptTuning(model=gpt_model)
    print('Training')

    logger = WandbLogger(project='Anlp-3')
    trainer = L.Trainer(
        accelerator='gpu',
        # strategy='auto',
        # strategy=pl.strategies.DDPStrategy(find_unused_parameters=True),
        devices=[3],
        default_root_dir='./logs/',  # TensorBoard can be used to visualize these logs
        num_nodes=1,
        num_sanity_val_steps=1,  # runs a validation step before starting training
        precision='16-mixed',  # we use half precision to reduce memory usage
        max_epochs=10,
        check_val_every_n_epoch=1,  # run validation every epoch
        log_every_n_steps=20,
        logger=logger,
        # detect_anomaly=True,
    )

    trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
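Relative to the previous run's main.py, this version raises the batch size to 25 and enables torch.compile. A minimal sketch of what that wrapper does (a toy example, not code from the repository; torch.compile requires PyTorch >= 2.0, and torch==2.4.0 is pinned in the requirements above):

import torch
import torch.nn as nn

net = nn.Linear(8, 2)
compiled = torch.compile(net)       # returns an optimized callable with the same interface
out = compiled(torch.randn(4, 8))   # drop-in replacement for net(...)
print(out.shape)                    # torch.Size([4, 2])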
wandb/run-20241028_090044-f9fzz8iy/files/config.yaml
ADDED
@@ -0,0 +1,51 @@
_wandb:
  value:
    cli_version: 0.18.1
    code_path: code/main.py
    m:
      - "1": trainer/global_step
        "6":
          - 3
        "7": []
      - "1": Training loss_step
        "5": 1
        "6":
          - 1
          - 3
        "7": []
      - "1": epoch
        "5": 1
        "6":
          - 1
          - 3
        "7": []
    python_version: 3.10.13
    t:
      "1":
        - 1
        - 11
        - 49
        - 55
        - 71
        - 106
      "2":
        - 1
        - 11
        - 49
        - 55
        - 71
        - 106
      "3":
        - 7
        - 23
        - 55
        - 66
      "4": 3.10.13
      "5": 0.18.1
      "6": 4.44.2
      "8":
        - 5
      "12": 0.18.1
      "13": linux-x86_64
lr:
  value: 0.0001
wandb/run-20241028_090044-f9fzz8iy/files/output.log
ADDED
@@ -0,0 +1,15 @@
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type              | Params | Mode
----------------------------------------------------
0 | model | PromptTuningModel | 124 M  | train
----------------------------------------------------
4.6 K     Trainable params
124 M     Non-trainable params
124 M     Total params
497.922   Total estimated model params size (MB)
1         Modules in train mode
164       Modules in eval mode
Epoch 0:  67%|█████████████▋      | 56/84 [00:27<00:13, 2.07it/s, v_num=z8iy]

Detected KeyboardInterrupt, attempting graceful shutdown ...
wandb/run-20241028_090044-f9fzz8iy/files/requirements.txt
ADDED
@@ -0,0 +1,178 @@
jiter==0.5.0
anyio==4.6.0
interegular==0.3.3
jaxlib==0.4.34
jsonschema==4.23.0
typing_extensions==4.12.2
httpcore==1.0.5
prometheus_client==0.21.0
openai==1.51.0
multidict==6.1.0
six==1.16.0
nvidia-nccl-cu12==2.20.5
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
watchfiles==0.24.0
tqdm==4.66.5
yarl==1.11.1
cffi==1.17.1
vllm==0.6.1.post2
bleach==6.1.0
kaggle==1.6.17
pydantic_core==2.23.4
lightning-utilities==0.11.7
sentry-sdk==2.14.0
torch==2.4.0
aiohappyeyeballs==2.4.0
diffusers==0.15.0
GitPython==3.1.43
attrs==24.2.0
importlib_metadata==8.5.0
transformers==4.44.2
pillow==10.4.0
sounddevice==0.5.1
gguf==0.9.1
python-dotenv==1.0.1
async-timeout==4.0.3
dspy-ai==2.5.3
numpy==1.26.4
nvidia-nvjitlink-cu12==12.6.68
uvicorn==0.30.6
kiwisolver==1.4.7
partial-json-parser==0.2.1.1.post4
pyparsing==3.1.4
lightning==2.4.0
structlog==24.4.0
nvidia-curand-cu12==10.3.2.106
setuptools==65.5.0
webencodings==0.5.1
nvidia-nvtx-cu12==12.1.105
sniffio==1.3.1
MarkupSafe==2.1.5
vllm-flash-attn==2.6.1
urllib3==2.2.3
requests==2.32.3
pycountry==24.6.1
ujson==5.10.0
matplotlib==3.9.2
pydantic==2.9.2
torchvision==0.19.0
numba==0.60.0
optuna==4.0.0
opt_einsum==3.4.0
joblib==1.4.2
msgpack==1.1.0
smmap==5.0.1
filelock==3.16.1
opencv-contrib-python==4.10.0.84
faiss-gpu==1.7.2
prometheus-fastapi-instrumentator==7.0.0
rpds-py==0.20.0
psutil==6.0.0
colorlog==6.8.2
nvidia-cufft-cu12==11.0.2.54
SQLAlchemy==2.0.35
llvmlite==0.43.0
packaging==24.1
exceptiongroup==1.2.2
dill==0.3.8
ml_dtypes==0.5.0
pyairports==2.1.1
scikit-learn==1.5.2
prettytable==3.11.0
protobuf==4.25.5
charset-normalizer==3.3.2
torchmetrics==1.4.2
text-unidecode==1.3
httpx==0.27.2
sympy==1.13.3
msgspec==0.18.6
wandb==0.18.1
backoff==2.2.1
sentencepiece==0.2.0
aiohttp==3.10.5
distro==1.9.0
lark==1.2.2
pyarrow==17.0.0
Mako==1.3.5
regex==2024.9.11
safetensors==0.4.5
aiosignal==1.3.1
jsonschema-specifications==2023.12.1
cloudpickle==3.0.0
einops==0.8.0
ray==2.36.1
fire==0.7.0
pyzmq==26.2.0
pycparser==2.22
platformdirs==4.3.6
click==8.1.7
fastapi==0.115.0
ftfy==6.3.0
torchtext==0.18.0
lm-format-enforcer==0.10.6
fsspec==2024.6.1
tzdata==2024.2
starlette==0.38.6
cycler==0.12.1
py-cpuinfo==9.0.0
h11==0.14.0
huggingface-hub==0.25.1
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
certifi==2024.8.30
httptools==0.6.1
jax==0.4.34
PyYAML==6.0.2
xxhash==3.5.0
idna==3.10
xformers==0.0.27.post2
mistral_common==1.4.3
fonttools==4.54.0
pip==23.0.1
accelerate==0.34.2
mediapipe==0.10.15
pytorch-lightning==2.4.0
ollama==0.3.3
Jinja2==3.1.4
multiprocess==0.70.16
opencv-python==4.10.0.84
termcolor==2.5.0
python-dateutil==2.9.0.post0
contourpy==1.3.0
websockets==13.1
frozenlist==1.4.1
pandas==2.2.3
networkx==3.3
diskcache==5.6.3
nvidia-cusolver-cu12==11.4.5.107
flatbuffers==24.3.25
mpmath==1.3.0
setproctitle==1.3.3
tokenizers==0.19.1
scipy==1.14.1
outlines==0.0.46
annotated-types==0.7.0
docker-pycreds==0.4.0
magicattr==0.1.6
wcwidth==0.2.13
pytorch-metric-learning==2.6.0
datasets==3.0.0
gitdb==4.0.11
lora-diffusion==0.1.7
referencing==0.35.1
python-slugify==8.0.4
zipp==3.20.2
triton==3.0.0
absl-py==2.1.0
threadpoolctl==3.5.0
uvloop==0.20.0
tiktoken==0.7.0
pytz==2024.2
nest-asyncio==1.6.0
nvidia-cublas-cu12==12.1.3.1
litellm==1.48.12
nvidia-cuda-nvrtc-cu12==12.1.105
greenlet==3.1.1
alembic==1.13.3
wandb/run-20241028_090044-f9fzz8iy/files/wandb-metadata.json
ADDED
@@ -0,0 +1,60 @@
{
  "os": "Linux-5.15.161-ql-generic-13.0-14-x86_64-with-glibc2.35",
  "python": "3.10.13",
  "startedAt": "2024-10-28T05:00:44.986886Z",
  "program": "/home/siddharth.tourani/Kyrylo/nlp/main.py",
  "codePath": "main.py",
  "email": "kyrylo.shyvam@students.iiit.ac.in",
  "root": ".",
  "host": "gpu-08",
  "username": "siddharth.tourani",
  "executable": "/home/siddharth.tourani/Minimal/bin/python3",
  "codePathLocal": "main.py",
  "cpu_count": 128,
  "cpu_count_logical": 256,
  "gpu": "[NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB]",
  "gpu_count": 4,
  "disk": {
    "/": {
      "total": "1073741824",
      "used": "21049344"
    }
  },
  "memory": {
    "total": "270256893952"
  },
  "cpu": {
    "count": 128,
    "countLogical": 256
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-40GB",
      "memoryTotal": "42949672960",
      "cudaCores": 6912,
      "architecture": "Ampere"
    }
  ],
  "slurm": {
    "job_id": "68153"
  },
  "cudaVersion": "12.4"
}
wandb/run-20241028_090044-f9fzz8iy/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"_wandb":{"runtime":30},"_timestamp":1.7300916668905182e+09,"_runtime":21.903801157,"_step":1,"Training loss_step":104.03557586669922,"epoch":0,"trainer/global_step":39}
wandb/run-20241028_090044-f9fzz8iy/logs/debug-core.log
ADDED
@@ -0,0 +1,13 @@
{"time":"2024-10-28T09:00:44.271373064+04:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpr43jmo4e/port-1243414.txt","pid":1243414,"debug":false,"disable-analytics":false}
{"time":"2024-10-28T09:00:44.271400724+04:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
{"time":"2024-10-28T09:00:44.272333799+04:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1243414}
{"time":"2024-10-28T09:00:44.27232289+04:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36087,"Zone":""}}
{"time":"2024-10-28T09:00:44.467138216+04:00","level":"INFO","msg":"created new connection","id":"127.0.0.1:52740"}
{"time":"2024-10-28T09:00:44.988394668+04:00","level":"INFO","msg":"connection init received","streamId":"f9fzz8iy","id":"127.0.0.1:52740"}
{"time":"2024-10-28T09:00:44.989450074+04:00","level":"ERROR","msg":"error creating symlink","error":"symlink /home/siddharth.tourani/.cache/wandb/logs/core-debug-20241028_090044.log wandb/run-20241028_090044-f9fzz8iy/logs/debug-core.log: file exists"}
{"time":"2024-10-28T09:00:44.994677699+04:00","level":"INFO","msg":"connection init completed","streamId":"f9fzz8iy","id":"127.0.0.1:52740"}
{"time":"2024-10-28T09:01:15.297081483+04:00","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:52740"}
{"time":"2024-10-28T09:01:15.297356992+04:00","level":"INFO","msg":"server is shutting down"}
{"time":"2024-10-28T09:01:15.297394512+04:00","level":"INFO","msg":"closed connection","id":"127.0.0.1:52740"}
{"time":"2024-10-28T09:01:16.667813027+04:00","level":"INFO","msg":"connection closed","id":"127.0.0.1:52740"}
{"time":"2024-10-28T09:01:16.667830647+04:00","level":"INFO","msg":"server is closed"}
wandb/run-20241028_090044-f9fzz8iy/logs/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
{"time":"2024-10-28T09:00:44.989289054+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-28T09:00:44.989302624+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241028_090044-f9fzz8iy/logs/debug-core.log"}
{"time":"2024-10-28T09:00:44.989755452+04:00","level":"INFO","msg":"using version","core version":"0.18.1"}
{"time":"2024-10-28T09:00:44.989762312+04:00","level":"INFO","msg":"created symlink","path":"wandb/run-20241028_090044-f9fzz8iy/logs/debug-core.log"}
{"time":"2024-10-28T09:00:44.994658889+04:00","level":"INFO","msg":"created new stream","id":"f9fzz8iy"}
{"time":"2024-10-28T09:00:44.994674479+04:00","level":"INFO","msg":"stream: started","id":"f9fzz8iy"}
{"time":"2024-10-28T09:00:44.994693389+04:00","level":"INFO","msg":"sender: started","stream_id":{"value":"f9fzz8iy"}}
{"time":"2024-10-28T09:00:44.994713418+04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"f9fzz8iy"}}
{"time":"2024-10-28T09:00:44.994695909+04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"f9fzz8iy"}}
{"time":"2024-10-28T09:00:45.712012987+04:00","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-28T09:00:45.713870838+04:00","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-28T09:00:45.716597735+04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
{"time":"2024-10-28T09:01:15.297352912+04:00","level":"INFO","msg":"stream: closing","id":"f9fzz8iy"}
{"time":"2024-10-28T09:01:15.297391541+04:00","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-28T09:01:15.2976609+04:00","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-28T09:01:15.570782955+04:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"}
{"time":"2024-10-28T09:01:15.570791505+04:00","level":"WARN","msg":"No source type found, not creating job artifact"}
{"time":"2024-10-28T09:01:15.570794545+04:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-28T09:01:16.667000401+04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"f9fzz8iy"}}
{"time":"2024-10-28T09:01:16.667029211+04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"f9fzz8iy"}}
{"time":"2024-10-28T09:01:16.667090251+04:00","level":"INFO","msg":"sender: closed","stream_id":{"value":"f9fzz8iy"}}
{"time":"2024-10-28T09:01:16.667475389+04:00","level":"INFO","msg":"stream: closed","id":"f9fzz8iy"}
wandb/run-20241028_090044-f9fzz8iy/logs/debug.log
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Configure stats pid to 1243414
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/.config/wandb/settings
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Loading settings from /home/siddharth.tourani/Kyrylo/nlp/wandb/settings
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'main.py', 'program_abspath': '/home/siddharth.tourani/Kyrylo/nlp/main.py', 'program': '/home/siddharth.tourani/Kyrylo/nlp/main.py'}
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_init.py:_log_setup():532] Logging user logs to ./wandb/run-20241028_090044-f9fzz8iy/logs/debug.log
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_init.py:_log_setup():533] Logging internal logs to ./wandb/run-20241028_090044-f9fzz8iy/logs/debug-internal.log
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_init.py:init():616] calling init triggers
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
+config: {}
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_init.py:init():666] starting backend
+2024-10-28 09:00:44,984 INFO MainThread:1243414 [wandb_init.py:init():670] setting up manager
+2024-10-28 09:00:44,985 INFO MainThread:1243414 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-28 09:00:44,986 INFO MainThread:1243414 [wandb_init.py:init():678] backend started and connected
+2024-10-28 09:00:44,989 INFO MainThread:1243414 [wandb_init.py:init():773] updated telemetry
+2024-10-28 09:00:44,989 INFO MainThread:1243414 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
+2024-10-28 09:00:45,707 INFO MainThread:1243414 [wandb_init.py:init():857] starting run threads in backend
+2024-10-28 09:00:45,884 INFO MainThread:1243414 [wandb_run.py:_console_start():2459] atexit reg
+2024-10-28 09:00:45,884 INFO MainThread:1243414 [wandb_run.py:_redirect():2307] redirect: wrap_raw
+2024-10-28 09:00:45,884 INFO MainThread:1243414 [wandb_run.py:_redirect():2372] Wrapping output streams.
+2024-10-28 09:00:45,884 INFO MainThread:1243414 [wandb_run.py:_redirect():2397] Redirects installed.
+2024-10-28 09:00:45,885 INFO MainThread:1243414 [wandb_init.py:init():900] run started, returning control to user process
+2024-10-28 09:00:45,983 INFO MainThread:1243414 [wandb_run.py:_config_callback():1388] config_cb None None {'lr': 0.0001}
+2024-10-28 09:01:15,297 WARNING MsgRouterThr:1243414 [router.py:message_loop():77] message_loop has been closed
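The `config_cb None None {'lr': 0.0001}` entry above records the run config being pushed to the backend; in this repo it originates from `save_hyperparameters()` in the Lightning module, forwarded through `WandbLogger`. As a minimal sketch (using only the public `wandb` API; the project name is taken from this repo's training script, everything else is illustrative), the lifecycle recorded in this debug.log can be reproduced directly:

    import wandb

    # wandb.init starts the backend and writes debug.log / debug-internal.log
    # under ./wandb/run-<timestamp>-<id>/logs/, as in the entries above.
    run = wandb.init(project='Anlp-3')

    # Updating the config fires the config callback logged as
    # "config_cb None None {'lr': 0.0001}".
    run.config.update({'lr': 1e-4})

    # Finishing the run produces the "stream: closing" / "Stopped system
    # monitor" lines seen in debug-internal.log.
    run.finish()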
wandb/run-20241028_090044-f9fzz8iy/run-f9fzz8iy.wandb
ADDED
Binary file (60.2 kB)
wandb/run-20241028_090149-4jbvn26d/files/code/main.py
ADDED
@@ -0,0 +1,124 @@
+from math import inf
+# from utils import *
+import torch
+import torch.nn as nn
+import numpy as np
+import torch.utils
+import torch.utils.data
+# from utils import MyDataset, custom_collate
+from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
+import wandb
+import torch.nn.functional as F
+
+import einops
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2TokenizerFast
+
+# Fix RNG seeds for reproducibility.
+np.random.seed(123)
+torch.manual_seed(123)
+torch.cuda.random.manual_seed(123)
+
+import lightning as L
+import utils
+
+
+class PromptTuningModel(nn.Module):
+    """Frozen GPT-2 backbone with a small set of learnable soft-prompt embeddings."""
+
+    def __init__(self, num_prompts=6):
+        super().__init__()
+        self.num_prompts = num_prompts
+
+        # Freeze the backbone: only the soft prompt below receives gradients.
+        self.model = AutoModelForCausalLM.from_pretrained("gpt2")
+        self.model.requires_grad_(False)
+        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
+        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
+
+        self.model.resize_token_embeddings(new_num_tokens=len(self.tokenizer), pad_to_multiple_of=128)
+
+        # Initialise the soft prompt from the embeddings of 'summarise' (3 BPE
+        # tokens), tiled to fill num_prompts rows; num_prompts is assumed to be
+        # a multiple of 3.
+        tmp = self.tokenizer('summarise', return_tensors="pt").input_ids
+        token_embedding = self.model.transformer.wte(tmp[0])
+        self.token_embedding = token_embedding
+        for _ in range(num_prompts // 3 - 1):
+            self.token_embedding = torch.cat([self.token_embedding, token_embedding])
+
+        data = torch.zeros(num_prompts, 768) + self.token_embedding[:]
+        self.learnable_prompt = nn.Parameter(data, requires_grad=True)
+
+    # @torch.compile
+    def forward(self, X):
+        embeddings = self.model.transformer.wte(X)  # (batch, seq, dim)
+
+        # Prepend the soft prompt to every sequence in the batch; prepending keeps
+        # the embeddings aligned with the attention mask and the logits slice below.
+        prompt = self.learnable_prompt[None, :, :].repeat(X.shape[0], 1, 1)
+        embeddings = torch.cat([prompt, embeddings], dim=1)
+        # Attend to all prompt positions; mask out [PAD] positions in the input.
+        mask = torch.cat([torch.ones([X.shape[0], self.num_prompts], dtype=torch.long).to(X),
+                          torch.where(X != self.pad.to(X.device), 1, 0)], dim=1)
+        # Drop the prompt positions, then move the vocab axis to dim 1 for F.cross_entropy.
+        logits = self.model(inputs_embeds=embeddings, attention_mask=mask).logits[:, self.num_prompts:].swapaxes(1, 2)
+        return logits
+
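A hypothetical CPU smoke test for the class above (not part of the original file; batch size and sequence length are arbitrary) makes the tensor shapes concrete:

    # With num_prompts=6 and a (2, 10) batch of token ids, embeddings become
    # (2, 16, 768) after the prompt is prepended, and the returned logits are
    # (2, 50304, 10): the prompt positions are dropped and the vocab axis
    # (50257 tokens padded to a multiple of 128) is moved to dim 1 for
    # F.cross_entropy.
    m = PromptTuningModel(num_prompts=6)
    X = torch.randint(0, 50256, (2, 10))
    print(m(X).shape)  # torch.Size([2, 50304, 10])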
+class LitModelPromptTuning(L.LightningModule):
+    def __init__(self, model, lr=1e-4):
+        super().__init__()
+        self.model = model
+        self.lr = lr
+
+        # Log hyperparameters (here just lr) to the logger; the model itself is skipped.
+        self.save_hyperparameters(ignore=['model'])
+
+    def training_step(self, batch, batch_idx):
+        X, y = batch
+        logits = self.model(X)
+        # 50257 is the [PAD] id added in PromptTuningModel; padded targets are ignored.
+        loss = F.cross_entropy(logits, target=y, ignore_index=50257)
+
+        self.log('Training loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        X, y = batch
+        logits = self.model(X)
+        loss = F.cross_entropy(logits, target=y, ignore_index=50257)
+
+        self.log('Validation loss', loss, on_step=True, on_epoch=True, logger=True, sync_dist=True)
+        return loss
+
+    def configure_optimizers(self):
+        # Only the soft prompt has requires_grad=True, so AdamW updates just that tensor.
+        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
+        return optimizer
+
+
+from lightning.pytorch.loggers import WandbLogger
+
+if __name__ == '__main__':
+    torch.set_float32_matmul_precision('medium')
+    dl_train, dl_val, dl_test = utils.import_data(bs=25, fraction=0.1)
+    gpt_model = PromptTuningModel()
+    gpt_model = torch.compile(gpt_model)
+    model = LitModelPromptTuning(model=gpt_model)
+    print('Training')
+
+    logger = WandbLogger(project='Anlp-3')
+    trainer = L.Trainer(
+        accelerator='gpu',
+        # strategy=pl.strategies.DDPStrategy(find_unused_parameters=True),
+        devices=[3],
+        default_root_dir='./logs/',  # TensorBoard can be pointed here to visualise runs
+        num_nodes=1,
+        num_sanity_val_steps=1,  # run one validation step before starting training
+        precision='16-mixed',  # half precision to reduce memory usage
+        max_epochs=10,
+        check_val_every_n_epoch=1,  # run validation every epoch
+        log_every_n_steps=20,
+        logger=logger,
+        # detect_anomaly=True,
+    )
+
+    trainer.fit(model, train_dataloaders=dl_train, val_dataloaders=dl_val)
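`utils.import_data` is defined in this repo's utils.py, which is not shown in this part of the diff. A hypothetical stand-in (names and shapes are illustrative, not the repo's implementation) shows the contract the training loop assumes: three dataloaders yielding `(input_ids, target_ids)` LongTensor pairs, with padded target positions carrying the [PAD] id 50257 so they are skipped by the loss:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    def import_data_stub(bs=25, fraction=0.1):
        # Illustrative only: random token ids in place of the CNN/DailyMail data.
        X = torch.randint(0, 50257, (100, 32))  # input ids
        y = torch.randint(0, 50257, (100, 32))  # target ids; padded with 50257 in practice
        dl = DataLoader(TensorDataset(X, y), batch_size=bs, shuffle=True)
        return dl, dl, dl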