Niksa Praljak committed
Commit c865888 · 1 Parent(s): 03b411e

Add scripts for ProteoScribe Sampling
Stage3_source/DSEma.py ADDED
@@ -0,0 +1,43 @@
+ from torch import nn
+ import torch
+ from deepspeed.runtime.zero import GatheredParameters
+ import deepspeed
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+
+ def _z3_params_to_fetch(param_list):
+     return [
+         p for p in param_list
+         if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE
+     ]
+
+
+ def moving_average(model, model_ema, beta=0.9999, device=None, zero_stage=3):
+     zero_stage_3 = (zero_stage == 3)
+     with torch.no_grad():
+         for param, param_ema in zip(model.parameters(),
+                                     model_ema.parameters()):
+             # TODO: use prefiltering for efficiency
+             params_to_fetch = _z3_params_to_fetch([param, param_ema
+                                                    ]) if zero_stage_3 else []
+             should_gather_param = len(params_to_fetch) > 0
+             with deepspeed.zero.GatheredParameters(
+                     params_to_fetch, enabled=should_gather_param):
+                 data = param.data
+                 if device is not None:
+                     data = data.to(device)
+                 #print('real model', data.shape, data)
+                 #print('ema model', param_ema.shape, param_ema.data)
+                 param_ema.data.copy_(torch.lerp(data, param_ema.data, beta))
+                 #print('after ema copy', param_ema.shape, param_ema.data)
+
+
+ def clone_zero_model(src_model, dst_model, zero_stage=0):
+     zero_stage_3 = (zero_stage == 3)
+     with torch.no_grad():
+         for src_param, dst_param in zip(src_model.parameters(), dst_model.parameters()):
+             # TODO: use prefiltering for efficiency
+             params_to_fetch = _z3_params_to_fetch([src_param, dst_param
+                                                    ]) if zero_stage_3 else []
+             should_gather_param = len(params_to_fetch) > 0
+             with deepspeed.zero.GatheredParameters(params_to_fetch, enabled=should_gather_param):
+                 dst_param.data.copy_(src_param.data)
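These two helpers are meant to be driven from a training loop: clone_zero_model initializes a frozen copy of the network once, and moving_average then blends that copy toward the live weights after each optimizer step (with ZeRO stage 3 the partitioned parameters are gathered first so every rank sees the full tensors). A minimal usage sketch, assuming model and ema_model are two instances of the same architecture already wrapped by DeepSpeed, and train_loader / train_one_step are placeholder names:

import torch
from Stage3_source.DSEma import clone_zero_model, moving_average

# illustrative only: start the EMA copy from the current weights
clone_zero_model(model, ema_model, zero_stage=3)

for batch in train_loader:
    loss = train_one_step(model, batch)   # hypothetical: forward + backward + optimizer.step()
    # torch.lerp(data, ema, beta) gives new_ema = beta * old_ema + (1 - beta) * live_weights
    moving_average(model, ema_model, beta=0.9999, zero_stage=3)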
Stage3_source/PL_wrapper.py ADDED
@@ -0,0 +1,433 @@
1
+ import torch
2
+ from torch import nn, optim
3
+ from torch.nn import functional as F
4
+ from torch.distributions import OneHotCategorical
5
+ from transformers.optimization import Adafactor
6
+
7
+ # PL functions
8
+ import pytorch_lightning as pl
9
+ from pytorch_lightning import Trainer, seed_everything
10
+ from pytorch_lightning.callbacks import EarlyStopping
11
+
12
+ import functools
13
+ import math
14
+ #from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
15
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
16
+ from torch.distributed.fsdp.wrap import (
17
+ size_based_auto_wrap_policy,
18
+ enable_wrap,
19
+ wrap
20
+ )
21
+
22
+ import deepspeed
23
+ from deepspeed.ops.adam import DeepSpeedCPUAdam
24
+
25
+ from sklearn.model_selection import train_test_split
26
+
27
+ from Stage3_source.DSEma import moving_average, clone_zero_model
28
+ import Stage3_source.transformer_training_helper as trainer_tools
29
+ import Stage3_source.helper_funcs as helper_tools
30
+ import Stage3_source.eval_metrics as eval_funcs
31
+ import Stage3_source.preprocess as prep
32
+
33
+ import copy
34
+
35
+ from torch.utils.data import DataLoader
36
+ import pandas as pd
37
+
38
+ from transformers import get_cosine_schedule_with_warmup
39
+
40
+ class PL_ProtARDM(pl.LightningModule):
41
+
42
+
43
+ def __init__(
44
+ self,
45
+ args: any,
46
+ model: nn.Module,
47
+ #ema_model: nn.Module,
48
+ ):
49
+
50
+ super().__init__()
51
+ #self.save_hyperparameters()
52
+
53
+ # arguments
54
+ self.script_args = args
55
+
56
+ # the whole model
57
+ self.model = model
58
+ #self.ema_model = ema_model
59
+
60
+ #clone_zero_model(self.model, self.ema_model, zero_stage=3)
61
+ ##self.ema_model = copy.deepcopy(self.model)
62
+
63
+ def forward(
64
+ self,
65
+ x: torch.Tensor,
66
+ t: torch.Tensor,
67
+ y_c: torch.Tensor,
68
+ ema=False,
69
+ ) -> torch.Tensor:
70
+
71
+ if ema:
72
+ logits = self.ema_model(x=x, t=t.view(-1,), y_c=y_c)
73
+ else:
74
+ logits = self.model(x=x, t=t.view(-1,), y_c=y_c)
75
+ return logits
76
+ #return F.softmax(logits, dim=1)
77
+
78
+
79
+ #def on_train_batch_end(self, *args, **kwargs):
80
+ # clone_zero_model(self.model, self.ema_model, zero_stage=3)
81
+ # #moving_average(self.model, self.ema_model, beta=0.0, zero_stage=3)
82
+
83
+
84
+ def configure_optimizers(self, ):
85
+
86
+ if self.script_args.choose_optim == 'AdamW':
87
+
88
+ if isinstance(self, FSDP):
89
+ print("Enter FSDP")
90
+ optimizer = torch.optim.AdamW(self.parameters(), lr=self.script_args.lr, weight_decay=self.script_args.weight_decay)
91
+
92
+ else:
93
+ optimizer = torch.optim.AdamW(self.parameters(), lr=self.script_args.lr, weight_decay=self.script_args.weight_decay)
94
+
95
+ elif self.script_args.choose_optim == 'AdaFactor':
96
+ optimizer = Adafactor(self.parameters(), lr=self.script_args.lr, weight_decay=self.script_args.weight_decay, relative_step=False)
97
+
98
+ elif self.script_args.choose_optim == 'Adam':
99
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.script_args.lr)
100
+
101
+ elif self.script_args.choose_optim == 'DeepSpeedCPUAdam':
102
+ optimizer = DeepSpeedCPUAdam(self.parameters(), lr=self.script_args.lr, weight_decay=self.script_args.weight_decay)
103
+
104
+ if self.script_args.scheduler_gamma is not None:
105
+ if isinstance(self.script_args.scheduler_gamma, str):
106
+ if 'coswarmup' == self.script_args.scheduler_gamma.lower():
107
+ print(f'Using cosine warmup scheduler with decay')
108
+ num_warmup_steps=self.script_args.traindata_len
109
+ num_training_steps=self.script_args.traindata_len*self.script_args.epochs
110
+ print(f'Num_warmup_steps={num_warmup_steps}')
111
+ print(f'Num_training_steps={num_training_steps}')
112
+
113
+ def _get_cosine_schedule_with_warmup_lr_lambda(
114
+ current_step: int, num_warmup_steps: int, num_training_steps: int, num_cycles: float
115
+ ):
116
+ if current_step < num_warmup_steps:
117
+ return float(current_step) / float(max(1, num_warmup_steps))
118
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
119
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
120
+
121
+ lr_lambda = functools.partial(
122
+ _get_cosine_schedule_with_warmup_lr_lambda,
123
+ num_warmup_steps=num_warmup_steps,
124
+ num_training_steps=num_training_steps,
125
+ num_cycles=0.5,
126
+ )
127
+ return {
128
+ "optimizer": optimizer,
129
+ "lr_scheduler": {
130
+ "scheduler": optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1),
131
+ "interval": "step",
132
+ },
133
+ }
134
+
135
+ #return {
136
+ # "optimizer": optimizer,
137
+ # "lr_scheduler": {
138
+ # "scheduler": get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps),
139
+ # "interval": "step",
140
+ # },
141
+ #}
142
+ else:
143
+ print(f'Using Exponential learning rate decay / epoch with factor: {self.script_args.scheduler_gamma}')
144
+ return {
145
+ "optimizer": optimizer,
146
+ "lr_scheduler": {
147
+ "scheduler": optim.lr_scheduler.ExponentialLR(optimizer, gamma=self.script_args.scheduler_gamma, verbose=True),
148
+ "interval": "epoch",
149
+ },
150
+ }
151
+ else:
152
+ return optimizer
153
+
154
+ #else:
155
+ # print("Please make choose_option variable from these options: 'AdamW', 'AdaFactor', 'Adam', 'DeepSpeedCPUAdam'")
156
+
157
+ def common_step(
158
+ self,
159
+ realization: torch.Tensor,
160
+ realization_idx: any,
161
+ stage: str) -> dict:
162
+
163
+ if isinstance(realization, list):
164
+
165
+ # class labels
166
+ y_c = realization[1]#.long()
167
+
168
+ # input samples
169
+ realization = realization[0]
170
+ batch_size, seq_length = realization.size()
171
+
172
+ realization = realization.reshape(batch_size, 1, seq_length).long()
173
+
174
+ train_tuple = self.cond_elbo_objective(
175
+ realization=realization,
176
+ y_c=y_c,
177
+ realization_idx=realization_idx,
178
+ stage=stage,
179
+ ema=True if 'ema' in stage.lower() else False,
180
+ )
181
+
182
+ if len(train_tuple) == 1:
183
+ loss = train_tuple[0]
184
+ else:
185
+ loss = train_tuple[0]
186
+ metrics = train_tuple[1]
187
+
188
+ if realization_idx == 0:
189
+ gpu_memory_usage = helper_tools.print_gpu_initialization()
190
+ self.log(f"{stage}_gpu_memory_usage", gpu_memory_usage, sync_dist=True)
191
+
192
+ sync_dist = True if 'val' in stage else False
193
+ # track loss
194
+ self.log(f"{stage}_loss", loss, prog_bar=True, on_step=True, on_epoch=True, sync_dist=sync_dist)
195
+ # track performance metrics
196
+ if len(train_tuple) > 1:
197
+ self.log(f"{stage}_prev_hard_acc", metrics[0], prog_bar=True, on_step=True, on_epoch=True, sync_dist=sync_dist)
198
+ self.log(f"{stage}_prev_soft_acc", metrics[1], on_step=True, on_epoch=True, sync_dist=sync_dist)
199
+ self.log(f"{stage}_fut_hard_acc", metrics[2], prog_bar=True, on_step=True, on_epoch=True, sync_dist=sync_dist)
200
+ self.log(f"{stage}_fut_soft_acc", metrics[3], on_step=True, on_epoch=True, sync_dist=sync_dist)
201
+ self.log(f"{stage}_current_hard_acc", metrics[4], prog_bar=True, on_step=True, on_epoch=True, sync_dist=sync_dist)
202
+ self.log(f"{stage}_current_soft_acc", metrics[5], on_step=True, on_epoch=True, sync_dist=sync_dist)
203
+ self.log(f"{stage}_current_ppl", metrics[6], on_step=True, on_epoch=True, sync_dist=sync_dist)
204
+ self.log(f"{stage}_prev_ppl", metrics[7], on_step=True, on_epoch=True, sync_dist=sync_dist)
205
+ self.log(f"{stage}_fut_ppl", metrics[8], on_step=True, on_epoch=True, sync_dist=sync_dist)
206
+ self.log(f"{stage}_pos_entropy", metrics[9], on_step=True, on_epoch=True, sync_dist=sync_dist)
207
+
208
+ torch.cuda.empty_cache()
209
+ return {'loss': loss}
210
+
211
+ def training_step(
212
+ self,
213
+ realization: torch.Tensor,
214
+ realization_idx: any):
215
+ return self.common_step(realization, realization_idx, stage='train')
216
+
217
+ def validation_step(
218
+ self,
219
+ realization: torch.Tensor,
220
+ realization_idx: any):
221
+ self.common_step(realization, realization_idx, stage='val')
222
+ #self.common_step(realization, realization_idx, stage='EMA_val')
223
+
224
+ def apply_OneHotCat(self, probs: torch.Tensor) -> any:
225
+ return OneHotCategorical(probs=probs.permute(0,2,1))
226
+ #return OneHotCategorical(probs=F.softmax(probs.permute(0,2,1), dim=-1))
227
+
228
+ def cond_elbo_objective(
229
+ self,
230
+ realization: torch.Tensor,
231
+ y_c: torch.Tensor,
232
+ realization_idx: any,
233
+ stage: str,
234
+ ema=False,
235
+ ):
236
+
237
+ bs, channel, seq_length = realization.size()
238
+
239
+ # get a batch of random sampling paths
240
+ sampled_random_path = trainer_tools.sample_random_path(bs, seq_length, device=self.script_args.device)
241
+ # sample a set of random sampling steps for each individual training sequence in the current batch
242
+ idx = trainer_tools.sample_random_index_for_sampling(bs, seq_length, device=self.script_args.device, option='random')
243
+ # we create a mask that masks the locations where we've already sampled
244
+ random_path_mask = trainer_tools.create_mask_at_random_path_index(sampled_random_path, idx, bs, seq_length)
245
+ # create a mask that masks the location where we are currently sampling
246
+ current_path_mask = trainer_tools.create_sampling_location_mask(sampled_random_path, idx, bs, seq_length)
247
+ # future sampling locations (i.e. >t)
248
+ future_path_mask = trainer_tools.create_mask_at_future_path_index(sampled_random_path, idx, bs, seq_length)
249
+ # tokenize realization
250
+ real_tokens, bs, seq_length = trainer_tools.create_token_labels(self.script_args, realization)
251
+ #real_tokens = realization.clone().squeeze(1)
252
+ # mask realizations
253
+ real_token_masked = trainer_tools.mask_realizations(real_tokens, random_path_mask)
254
+ # conditional probs
255
+ #probs = self(x=real_token_masked, t=idx, y_c=y_c, ema=ema)
256
+ logits = self(x=real_token_masked, t=idx, y_c=y_c, ema=ema)
257
+
258
+ conditional_prob = OneHotCategorical(logits=logits.permute(0,2,1))
259
+ #conditional_prob = self.apply_OneHotCat(probs=probs)
260
+ # evaluate the value of the log prob for the given realization
261
+ log_prob = trainer_tools.log_prob_of_realization(self.script_args, conditional_prob, real_tokens)
262
+
263
+ # compute an average over all the unsampled locations
264
+ #log_prob_unsampled = trainer_tools.log_prob_of_unsampled_locations(log_prob.to(self.script_args.device), real_token_masked.to(self.script_args.device))
265
+ log_prob_unsampled = trainer_tools.log_prob_of_unsampled_locations(log_prob, real_token_masked)
266
+ #log_prob_unsampled = trainer_tools.log_prob_of_unsampled_locations(log_prob, real_token_masked, real_tokens)
267
+
268
+
269
+ # weight the unsampled log-probability by the current sampling step
270
+ log_prob_weighted = trainer_tools.weight_log_prob(log_prob_unsampled, idx, seq_length)
271
+ # compute an average loss i.e. negative average log-likelihood over the batch elements
272
+ loss = trainer_tools.compute_average_loss_for_batch(log_prob_weighted)
273
+
274
+ #if 'val' in stage:
275
+ probs = F.softmax(logits, dim=1)
276
+ metrics = self.performance_step(
277
+ real_tokens=real_tokens.cpu(),
278
+ idx=idx.cpu(),
279
+ sampled_random_path=sampled_random_path.cpu().float(),
280
+ probs=probs.cpu().float(),
281
+ conditional_prob=conditional_prob)
282
+
283
+ return loss, metrics
284
+
285
+
286
+ # return loss,
287
+
288
+ @torch.no_grad()
289
+ def performance_step(
290
+ self,
291
+ real_tokens: torch.Tensor,
292
+ idx: torch.Tensor,
293
+ sampled_random_path: torch.Tensor,
294
+ probs: torch.Tensor,
295
+ conditional_prob: torch.Tensor
296
+ ) -> tuple:
297
+
298
+
299
+ # create numerical token sequence
300
+ sample_seq = torch.argmax(trainer_tools.sample_from_conditional(conditional_prob).cpu(), dim=1)
301
+
302
+ # eval prev positions in terms of time
303
+ prev_B_hard_acc, prev_B_soft_acc, fut_B_hard_acc, fut_B_soft_acc, current_B_hard_acc, current_B_soft_acc = eval_funcs.compute_acc_given_time_pos(
304
+ real_tokens=real_tokens,
305
+ sample_seq=sample_seq,
306
+ sample_path=sampled_random_path,
307
+ idx=idx
308
+ )
309
+
310
+ # compute ppl given time position
311
+ current_ppl, prev_ppl, fut_ppl = eval_funcs.compute_ppl_given_time_pos(
312
+ probs=probs,
313
+ sample_path=sampled_random_path,
314
+ idx=idx
315
+ )
316
+
317
+ # average positional entropy
318
+ pos_entropy = trainer_tools.compute_pos_entropy(probs=probs).mean().item()
319
+
320
+ metric_evals = (
321
+ prev_B_hard_acc,
322
+ prev_B_soft_acc,
323
+ fut_B_hard_acc,
324
+ fut_B_soft_acc,
325
+ current_B_hard_acc,
326
+ current_B_soft_acc,
327
+ current_ppl,
328
+ prev_ppl,
329
+ fut_ppl,
330
+ pos_entropy
331
+ )
332
+
333
+ return metric_evals
334
+
335
+
336
+
337
+ class PFamDataModule(pl.LightningDataModule):
338
+ def __init__(self, args):
339
+ super().__init__()
340
+ self.args = args
341
+
342
+ #df = pd.read_csv(args.data_root)
343
+ #data = torch.load(args.data_root)
344
+ data = self.load_data()
345
+
346
+ num_seq_list, text_emb_list = prep.prepare_protein_data(
347
+ args=args,
348
+ data_dict=data
349
+ )
350
+
351
+ print('Performing 80/20 random train/val split')
352
+ num_seq_list_train, num_seq_list_val, text_emb_train, text_emb_val = train_test_split(num_seq_list,
353
+ text_emb_list,
354
+ test_size=args.valid_size,
355
+ #stratify=class_label_list,
356
+ random_state=args.seed)
357
+ print(f'Number of training samples: {len(num_seq_list_train)}')
358
+ print(f'Number of validation samples: {len(num_seq_list_val)}')
359
+
360
+ self.train_dataset = prep.protein_dataset(
361
+ num_seq_list=num_seq_list_train,
362
+ text_emb=text_emb_train
363
+ )
364
+
365
+ self.val_dataset = prep.protein_dataset(
366
+ num_seq_list=num_seq_list_val,
367
+ text_emb=text_emb_val
368
+ )
369
+
370
+ def load_data(self):
371
+
372
+ try:
373
+
374
+ print(self.args.swissprot_data_root, self.args.pfam_data_root)
375
+
376
+ if self.args.swissprot_data_root != "None":
377
+ swissprot_data = torch.load(self.args.swissprot_data_root)
378
+ else:
379
+ swissprot_data=None
380
+
381
+ if self.args.pfam_data_root != "None":
382
+ pfam_data = torch.load(self.args.pfam_data_root)
383
+ else:
384
+ pfam_data=None
385
+
386
+ if (self.args.swissprot_data_root != "None") and (self.args.pfam_data_root != "None"):
387
+ return self.merge_and_append_values(dict1=swissprot_data, dict2=pfam_data)
388
+ elif self.args.swissprot_data_root == "None":
389
+ return pfam_data
390
+ elif self.args.pfam_data_root == "None":
391
+ return swissprot_data
392
+ else:
393
+ raise ValueError('Both SwissProt and Pfam datasets are unavailable.')
394
+
395
+ except FileNotFoundError as e:
396
+ raise FileNotFoundError(f"Data file not found: {e}")
397
+
398
+
399
+ def merge_and_append_values(self, dict1, dict2):
400
+
401
+ merged_dict = {}
402
+
403
+ # Combine all keys from both dictionaries
404
+ all_keys = set(dict1) | set(dict2)
405
+
406
+ for key in all_keys:
407
+ values = []
408
+ if key in dict1:
409
+ values.append(dict1[key])
410
+ if key in dict2:
411
+ values.append(dict2[key])
412
+
413
+ # Merge values for each key
414
+ # This merges lists or appends non-list values
415
+ merged_dict[key] = [item for sublist in values for item in (sublist if isinstance(sublist, list) else [sublist])]
416
+
417
+ return merged_dict
418
+
419
+ def train_dataloader(self):
420
+ return DataLoader(
421
+ self.train_dataset,
422
+ batch_size=self.args.batch_size,
423
+ num_workers=self.args.num_workers,
424
+ shuffle=True
425
+ )
426
+
427
+ def val_dataloader(self):
428
+ return DataLoader(
429
+ self.val_dataset,
430
+ batch_size=self.args.batch_size,
431
+ num_workers=self.args.num_workers,
432
+ shuffle=False
433
+ )
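A rough sketch of how this LightningModule and DataModule could be wired to a pytorch_lightning Trainer. The attribute names on args follow the ones referenced above, but the launch script itself is not part of this commit, so every value below (paths, sizes, num_classes) is an assumption for illustration only:

from argparse import Namespace
import pytorch_lightning as pl
import Stage3_source.cond_diff_transformer_layer as mod
from Stage3_source.PL_wrapper import PL_ProtARDM, PFamDataModule

# hypothetical configuration; real values come from the training config
args = Namespace(
    lr=1e-4, weight_decay=0.0, choose_optim='AdamW', scheduler_gamma=None,
    batch_size=32, num_workers=4, valid_size=0.2, seed=42, epochs=10,
    traindata_len=1000, device='cuda',
    swissprot_data_root='swissprot_emb.pt', pfam_data_root='None',   # hypothetical paths
    facilitator='Default', sequence_keyname='sequence',
    diffusion_steps=1024, image_size=32, text_emb_dim=512,
    transformer_dim=512, transformer_heads=16, transformer_depth=16,
    transformer_blocks=1, transformer_local_heads=8, transformer_local_size=128,
    transformer_reversible=False, input_dp_rate=0.0,
)

pl.seed_everything(args.seed)
data_module = PFamDataModule(args)     # expects the .pt data files above to exist
# 28 classes = 23 canonical tokens + 5 rare amino-acid codes used in preprocess.create_num_seqs
network = mod.get_model(args, data_shape=(args.image_size, args.image_size), num_classes=28)
pl_model = PL_ProtARDM(args=args, model=network)
trainer = pl.Trainer(max_epochs=args.epochs, accelerator='gpu', devices=1)
trainer.fit(pl_model, datamodule=data_module)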
Stage3_source/__init__.py ADDED
File without changes
Stage3_source/animation_tools.py ADDED
@@ -0,0 +1,65 @@
+ import textwrap
+ from PIL import Image, ImageDraw, ImageFont
+ import imageio
+ import os
+
+ # convert the numerical labels to characters
+ def convert_num_to_char(
+         tokens: list,
+         char_tokens: any
+ ) -> str:
+     return "".join([tokens[num] for num in char_tokens.tolist()])
+
+ # draw text onto white page
+ def draw_text(
+         image: any,
+         text: any,
+         font: any,
+         position: tuple=(0,0),
+         max_width: any=None,
+         fill: tuple=(0,0,0)
+ ) -> None:
+
+     draw = ImageDraw.Draw(image)
+     if max_width:
+         wrapped_text = textwrap.fill(text, width=max_width)
+     else:
+         wrapped_text = text
+     draw.multiline_text(position, wrapped_text, font=font, fill=fill)
+
+
+ # create gif animation
+ def generate_text_animation(
+         text_list: list,
+         text_animation_path: str,
+         output_temp_path: str='./outputs/temp_files'
+ ) -> None:
+
+     # create images with text
+     image_files = []
+     for index, text in enumerate(text_list):
+
+         img = Image.new('RGB', (600, 159), color=(255, 255, 255))  # Create a white image
+         font = ImageFont.load_default()
+         draw_text(img, text, font, position=(10, 10), max_width=80, fill=(0, 0, 0))
+
+         # Save image to a temporary file
+         os.makedirs(output_temp_path, exist_ok=True)
+         # temp_file = f'./outputs/temp_image_{index}.png'
+         temp_file = output_temp_path + f'/temp_image_{index}.png'
+         img.save(temp_file)
+         image_files.append(temp_file)
+
+     # Read saved images and create a GIF
+     images = [imageio.imread(file) for file in image_files]
+     imageio.mimsave(
+         text_animation_path,
+         images,
+         format='GIF',
+         duration=0.2,
+     )
+
+     # clean up temp image files
+     for file in image_files:
+         os.remove(file)
+     return
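For reference, a small sketch of how these utilities fit together (the token vocabulary and sequences below are made up): convert numerical tokens to a string with convert_num_to_char, collect one string per sampling step, then render the trajectory as a GIF.

import torch
import Stage3_source.animation_tools as ani_tools

tokens = ['-', '<START>', 'A', 'C', 'D', '<END>']          # toy vocabulary, not the real one
frames = []
for step_tokens in [torch.tensor([1, 2, 3, 0, 5]), torch.tensor([1, 2, 3, 4, 5])]:
    frames.append(ani_tools.convert_num_to_char(tokens, step_tokens))

ani_tools.generate_text_animation(frames, text_animation_path='./outputs/sampling_trajectory.gif')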
Stage3_source/cond_diff_transformer_layer.py ADDED
@@ -0,0 +1,260 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from axial_positional_embedding import AxialPositionalEmbedding
6
+ from linear_attention_transformer import LinearAttentionTransformer
7
+
8
+ #Adapted from ehoogeboom github repo ...
9
+
10
+ class SinusoidalPosEmb(nn.Module):
11
+
12
+ """
13
+ Time embeddings
14
+ """
15
+
16
+ def __init__(
17
+ self,
18
+ dim,
19
+ num_steps,
20
+ rescale_steps=4000
21
+ ):
22
+
23
+ super().__init__()
24
+
25
+ self.dim = dim
26
+ self.num_steps = float(num_steps)
27
+ self.rescale_steps = float(rescale_steps)
28
+
29
+
30
+ def forward(
31
+ self,
32
+ x
33
+ ):
34
+
35
+ x = x/self.num_steps * self.rescale_steps
36
+ device=x.device
37
+ half_dim = self.dim // 2
38
+ emb = math.log(10000) / (half_dim - 1)
39
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
40
+ emb = x[:,None] * emb[None,:]
41
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
42
+ return emb
43
+
44
+
45
+
46
+
47
+ class LinearAttentionTransformerEmbedding(nn.Module):
48
+
49
+ def __init__(
50
+ self,
51
+ args,
52
+ input_dim,
53
+ output_dim,
54
+ dim,
55
+ depth,
56
+ n_blocks,
57
+ max_seq_len,
58
+ num_timesteps,
59
+ heads=8,
60
+ dim_head=None,
61
+ causal=False,
62
+ reversible=False,
63
+ ff_chunks=1,
64
+ ff_glu=False,
65
+ ff_dropout=0.,
66
+ attn_layer_dropout=0.,
67
+ attn_dropout=0.,
68
+ blindspot_size=1,
69
+ n_local_attn_heads=0,
70
+ local_attn_window_size=128,
71
+ return_embeddings=False,
72
+ recieves_context=False,
73
+ pkm_layers=tuple(),
74
+ pkm_num_keys=128,
75
+ attend_axially=False,
76
+ linformer_settings=None,
77
+ context_linformer_settings=None
78
+ ):
79
+ assert (max_seq_len % local_attn_window_size) == 0, 'max sequence length must be divisible by the window size, to calculate number of kmeans cluster'
80
+ super().__init__()
81
+
82
+ self.max_seq_len = max_seq_len
83
+ self.depth = depth
84
+ self.emb_dim = dim
85
+ self.n_blocks = n_blocks
86
+
87
+
88
+ # token embeddings
89
+ self.x_emb_NN = nn.Embedding(input_dim, self.emb_dim)
90
+
91
+ # class label embedding
92
+ #self.class_emb_NN = nn.Embedding(args.num_y_class_labels, self.emb_dim)
93
+ self.y_mlp = nn.Sequential(
94
+ nn.Linear(args.text_emb_dim, self.emb_dim*4),
95
+ nn.Softplus(),
96
+ nn.Linear(self.emb_dim*4, self.emb_dim*n_blocks*depth)
97
+ )
98
+
99
+ # time embeddings
100
+ self.time_pos_emb = SinusoidalPosEmb(self.emb_dim, num_timesteps)
101
+ self.mlp = nn.Sequential(
102
+ nn.Linear(self.emb_dim, self.emb_dim*4),
103
+ nn.Softplus(),
104
+ nn.Linear(self.emb_dim*4, self.emb_dim*n_blocks*depth)
105
+ )
106
+
107
+ # token positional embeddings
108
+ self.axial_pos_emb = AxialPositionalEmbedding(
109
+ dim = self.emb_dim,
110
+ axial_shape=(
111
+ max_seq_len // local_attn_window_size,
112
+ local_attn_window_size)
113
+ )
114
+
115
+ self.transformer_blocks = torch.nn.ModuleList()
116
+
117
+ for ii in range(n_blocks):
118
+
119
+ self.transformer_blocks.append(torch.nn.ModuleList())
120
+
121
+ for jj in range(depth):
122
+
123
+ self.transformer_blocks[-1].append(
124
+ LinearAttentionTransformer(
125
+ self.emb_dim,
126
+ 1,
127
+ max_seq_len,
128
+ heads=heads,
129
+ dim_head=dim_head,
130
+ causal=causal,
131
+ ff_chunks=ff_chunks,
132
+ ff_glu=ff_glu,
133
+ ff_dropout=ff_dropout,
134
+ attn_layer_dropout=attn_layer_dropout,
135
+ reversible=reversible,
136
+ blindspot_size=blindspot_size,
137
+ n_local_attn_heads=n_local_attn_heads,
138
+ local_attn_window_size=local_attn_window_size,
139
+ attend_axially=attend_axially,
140
+ linformer_settings=linformer_settings,
141
+ context_linformer_settings=context_linformer_settings
142
+ )
143
+ )
144
+
145
+ self.norm = nn.LayerNorm(dim)
146
+ self.out = nn.Linear(self.emb_dim, output_dim) if not return_embeddings else nn.Identity()
147
+
148
+
149
+ def forward(self, x, t, y_c, **kwargs):
150
+
151
+ # time embeddings
152
+ t = self.time_pos_emb(t).type([p.dtype for p in self.mlp.parameters()][0])
153
+ t = self.mlp(t)
154
+ time_embed = t.reshape(x.size(0), 1, self.emb_dim, self.n_blocks, self.depth)
155
+ # token embeddings
156
+ x = self.x_emb_NN(x.long()) # final shape (batch_size, timelength, model_emb_dim)
157
+ # positional embeddings
158
+ x_pos = self.axial_pos_emb(x).type(x.type())
159
+ x_embed_axial = x + x_pos
160
+ h = torch.zeros_like(x_embed_axial)
161
+ # z_t embedding
162
+ #y_emb = self.class_emb_NN(y_c)
163
+ y_emb = self.y_mlp(y_c)
164
+ y_emb = y_emb.reshape(x.size(0), 1, self.emb_dim, self.n_blocks, self.depth)
165
+
166
+ for i, block in enumerate(self.transformer_blocks):
167
+
168
+ h = h+x_embed_axial
169
+ for j, transformer in enumerate(block):
170
+
171
+ h = transformer(h + time_embed[...,i,j] + y_emb[...,i,j])
172
+
173
+ h = self.norm(h)
174
+ output = self.out(h)
175
+
176
+ return output.permute(0,2,1)
177
+
178
+
179
+ def add_model_args(parser):
180
+
181
+ # Flow params
182
+ parser.add_argument('--num_steps', type=int, default=1)
183
+ parser.add_argument('--actnorm', type=eval, default=False)
184
+ parser.add_argument('--perm_channel', type=str, default='none', choices={'conv', 'shuffle', 'none'})
185
+ parser.add_argument('--perm_length', type=str, default='reverse', choices={'reverse', 'none'})
186
+ parser.add_argument('--input_dp_rate', type=float, default=0.0)
187
+
188
+ # Transformer params.
189
+ parser.add_argument('--transformer_dim', type=int, default=512)
190
+ parser.add_argument('--transformer_heads', type=int, default=16)
191
+ parser.add_argument('--transformer_depth', type=int, default=16)
192
+ parser.add_argument('--transformer_blocks', type=int, default=1)
193
+ parser.add_argument('--transformer_dropout', type=float, default=0.1)
194
+ parser.add_argument('--transformer_reversible', type=eval, default=False)
195
+ parser.add_argument('--transformer_local_heads', type=int, default=8)
196
+ parser.add_argument('--transformer_local_size', type=int, default=128)
197
+
198
+ def get_model(args, data_shape, num_classes):
199
+
200
+ data_shape = data_shape
201
+ num_classes = num_classes
202
+ input_dp_rate = args.input_dp_rate
203
+ transformer_dim = args.transformer_dim
204
+ transformer_heads = args.transformer_heads
205
+ transformer_depth = args.transformer_depth
206
+ transformer_blocks = args.transformer_blocks
207
+ transformer_local_heads = args.transformer_local_heads
208
+ transformer_local_size = args.transformer_local_size
209
+ transformer_reversible = args.transformer_reversible
210
+ diffusion_steps = args.diffusion_steps
211
+
212
+ C, _ = num_classes, data_shape[0]*data_shape[1]
213
+ L = args.diffusion_steps
214
+
215
+ print('Data shape index 0:', L)
216
+ current_shape = (L,)
217
+
218
+ class DiffTransformer(nn.Module):
219
+
220
+ def __init__(self,):
221
+
222
+ super(DiffTransformer, self).__init__()
223
+
224
+ self.transformer = LinearAttentionTransformerEmbedding(
225
+ args=args,
226
+ input_dim=num_classes,
227
+ output_dim=num_classes,
228
+ dim=transformer_dim,
229
+ heads=transformer_heads,
230
+ depth=transformer_depth,
231
+ n_blocks=transformer_blocks,
232
+ max_seq_len=L,
233
+ num_timesteps=diffusion_steps,
234
+ causal=False, # no autoregression
235
+ ff_dropout=0, # dropout for feedforward NN
236
+ attn_layer_dropout=input_dp_rate, # dropout right after self-att layer
237
+ attn_dropout=0, # dropout post-attention
238
+ n_local_attn_heads=transformer_local_heads,
239
+ # number of local attention heads for (QK)*V attention.
240
+ # this can be a tuple specifying the exact number of local
241
+ # attention heads at that depth
242
+ local_attn_window_size=transformer_local_size,
243
+ # receptive field of the local attention
244
+ reversible=transformer_reversible,
245
+ # use reversible nets, from reformer paper
246
+ )
247
+
248
+
249
+ def forward(self, x, t, y_c):
250
+ x = self.transformer(x,t,y_c)
251
+ return x
252
+
253
+
254
+ model = DiffTransformer()
255
+
256
+ return model
257
+
258
+
259
+
260
+
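A quick shape check of the conditional transformer, as a sketch with illustrative values (num_classes=28 matches the token list in Stage3_source/preprocess.py): the model maps partially masked token sequences, a sampling-step index, and a text-conditioning embedding to per-position logits of shape (batch, num_classes, seq_len).

import torch
from argparse import Namespace
import Stage3_source.cond_diff_transformer_layer as mod

# hypothetical configuration; diffusion_steps must be divisible by the local attention window
args = Namespace(diffusion_steps=256, input_dp_rate=0.0, text_emb_dim=512,
                 transformer_dim=256, transformer_heads=8, transformer_depth=2,
                 transformer_blocks=1, transformer_local_heads=4, transformer_local_size=128,
                 transformer_reversible=False)

model = mod.get_model(args, data_shape=(16, 16), num_classes=28)
x = torch.randint(0, 28, (2, 256))          # masked/partial token sequences
t = torch.randint(0, 256, (2,))             # current sampling step per sequence
y_c = torch.randn(2, 512)                   # text (conditioning) embedding
logits = model(x, t, y_c)
print(logits.shape)                         # expected: torch.Size([2, 28, 256])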
Stage3_source/diff_transformer_layer.py ADDED
@@ -0,0 +1,263 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from axial_positional_embedding import AxialPositionalEmbedding
6
+ from linear_attention_transformer import LinearAttentionTransformer
7
+
8
+ #Adapted from ehoogeboom github repo ...
9
+
10
+ class SinusoidalPosEmb(nn.Module):
11
+
12
+ """
13
+ Time embeddings
14
+ """
15
+
16
+ def __init__(
17
+ self,
18
+ dim,
19
+ num_steps,
20
+ rescale_steps=4000
21
+ ):
22
+
23
+ super().__init__()
24
+
25
+ self.dim = dim
26
+ self.num_steps = float(num_steps)
27
+ self.rescale_steps = float(rescale_steps)
28
+
29
+
30
+ def forward(
31
+ self,
32
+ x
33
+ ):
34
+
35
+ x = x/self.num_steps * self.rescale_steps
36
+ device=x.device
37
+ half_dim = self.dim // 2
38
+ emb = math.log(10000) / (half_dim - 1)
39
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
40
+ emb = x[:,None] * emb[None,:]
41
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
42
+ return emb
43
+
44
+
45
+
46
+
47
+ class LinearAttentionTransformerEmbedding(nn.Module):
48
+
49
+ def __init__(
50
+ self,
51
+ input_dim,
52
+ output_dim,
53
+ dim,
54
+ depth,
55
+ n_blocks,
56
+ max_seq_len,
57
+ num_timesteps,
58
+ heads=8,
59
+ dim_head=None,
60
+ causal=False,
61
+ reversible=False,
62
+ ff_chunks=1,
63
+ ff_glu=False,
64
+ ff_dropout=0.,
65
+ attn_layer_dropout=0.,
66
+ attn_dropout=0.,
67
+ blindspot_size=1,
68
+ n_local_attn_heads=0,
69
+ local_attn_window_size=128,
70
+ return_embeddings=False,
71
+ recieves_context=False,
72
+ pkm_layers=tuple(),
73
+ pkm_num_keys=128,
74
+ attend_axially=False,
75
+ linformer_settings=None,
76
+ context_linformer_settings=None
77
+ ):
78
+ assert (max_seq_len % local_attn_window_size) == 0, 'max sequence length must be divisible by the window size, to calculate number of kmeans cluster'
79
+ super().__init__()
80
+
81
+ self.max_seq_len = max_seq_len
82
+ self.depth = depth
83
+ self.emb_dim = dim
84
+ self.n_blocks = n_blocks
85
+
86
+ print('Input dimension', input_dim)
87
+ print('Output dimension', output_dim)
88
+
89
+ # token embeddings
90
+ self.x_emb_NN = nn.Embedding(input_dim, self.emb_dim)
91
+
92
+ # time embeddings
93
+ self.time_pos_emb = SinusoidalPosEmb(self.emb_dim, num_timesteps)
94
+ self.mlp = nn.Sequential(
95
+ nn.Linear(self.emb_dim, self.emb_dim*4),
96
+ nn.Softplus(),
97
+ nn.Linear(self.emb_dim*4, self.emb_dim*n_blocks*depth)
98
+ )
99
+
100
+ # token positional embeddings
101
+ self.axial_pos_emb = AxialPositionalEmbedding(
102
+ dim = self.emb_dim,
103
+ axial_shape=(
104
+ max_seq_len // local_attn_window_size,
105
+ local_attn_window_size)
106
+ )
107
+
108
+ self.pos_emb = nn.Embedding(1, self.emb_dim)
109
+
110
+ self.transformer_blocks = torch.nn.ModuleList()
111
+
112
+ for ii in range(n_blocks):
113
+
114
+ self.transformer_blocks.append(torch.nn.ModuleList())
115
+
116
+ for jj in range(depth):
117
+
118
+ self.transformer_blocks[-1].append(
119
+ LinearAttentionTransformer(
120
+ self.emb_dim,
121
+ 1,
122
+ max_seq_len,
123
+ heads=heads,
124
+ dim_head=dim_head,
125
+ causal=causal,
126
+ ff_chunks=ff_chunks,
127
+ ff_glu=ff_glu,
128
+ ff_dropout=ff_dropout,
129
+ attn_layer_dropout=attn_layer_dropout,
130
+ reversible=reversible,
131
+ blindspot_size=blindspot_size,
132
+ n_local_attn_heads=n_local_attn_heads,
133
+ local_attn_window_size=local_attn_window_size,
134
+ attend_axially=attend_axially,
135
+ linformer_settings=linformer_settings,
136
+ context_linformer_settings=context_linformer_settings
137
+ )
138
+ )
139
+
140
+ self.norm = nn.LayerNorm(dim)
141
+ self.out = nn.Linear(self.emb_dim, output_dim) if not return_embeddings else nn.Identity()
142
+ # self.out = nn.Conv1d(self.emb_dim, output_dim, kernel_size=1,stride=1)
143
+
144
+
145
+ def forward(self, x, t, **kwargs):
146
+
147
+
148
+ t = self.time_pos_emb(t)
149
+ t = self.mlp(t)
150
+
151
+ time_embed = t.reshape(x.size(0), 1, self.emb_dim, self.n_blocks, self.depth)
152
+ x = self.x_emb_NN(x.long()) # final shape (batch_size, timelength, model_emb_dim)
153
+ x_pos = self.axial_pos_emb(x).type(x.type())
154
+ # x_pos = self.pos_emb( self._create_pos_vec(x=x)).type(x.type())
155
+ x_embed_axial = x + x_pos
156
+ h = torch.zeros_like(x_embed_axial)
157
+
158
+ for i, block in enumerate(self.transformer_blocks):
159
+
160
+ h = h+x_embed_axial
161
+ for j, transformer in enumerate(block):
162
+
163
+ h = transformer(h+time_embed[...,i,j])
164
+
165
+ h = self.norm(h)
166
+ output = self.out(h)
167
+
168
+ return output.permute(0,2,1)
169
+
170
+ class Rezero(nn.Module):
171
+
172
+ def __init__(self):
173
+ super(Rezero, self).__init__()
174
+ self.alpha = torch.nn.Parameter(torch.zeros(size=(1,)))
175
+
176
+ def forward(self,x):
177
+ return self.alpha * x
178
+
179
+
180
+ def add_model_args(parser):
181
+
182
+ # Flow params
183
+ parser.add_argument('--num_steps', type=int, default=1)
184
+ parser.add_argument('--actnorm', type=eval, default=False)
185
+ parser.add_argument('--perm_channel', type=str, default='none', choices={'conv', 'shuffle', 'none'})
186
+ parser.add_argument('--perm_length', type=str, default='reverse', choices={'reverse', 'none'})
187
+
188
+ parser.add_argument('--input_dp_rate', type=float, default=0.0)
189
+
190
+ # Transformer params.
191
+ parser.add_argument('--transformer_dim', type=int, default=512)
192
+ parser.add_argument('--transformer_heads', type=int, default=16)
193
+ parser.add_argument('--transformer_depth', type=int, default=16)
194
+ parser.add_argument('--transformer_blocks', type=int, default=1)
195
+ parser.add_argument('--transformer_dropout', type=float, default=0.1)
196
+ parser.add_argument('--transformer_reversible', type=eval, default=False)
197
+ parser.add_argument('--transformer_local_heads', type=int, default=8)
198
+ parser.add_argument('--transformer_local_size', type=int, default=128)
199
+
200
+ def get_model(args, data_shape, num_classes):
201
+
202
+ data_shape = data_shape
203
+ num_classes = num_classes
204
+ input_dp_rate = args.input_dp_rate
205
+ transformer_dim = args.transformer_dim
206
+ transformer_heads = args.transformer_heads
207
+ transformer_depth = args.transformer_depth
208
+ transformer_blocks = args.transformer_blocks
209
+ transformer_local_heads = args.transformer_local_heads
210
+ transformer_local_size = args.transformer_local_size
211
+ transformer_reversible = args.transformer_reversible
212
+ diffusion_steps = args.diffusion_steps
213
+
214
+ C, L = num_classes, data_shape[0]*data_shape[1]
215
+
216
+ print('Data shape index 0:', L)
217
+ current_shape = (L,)
218
+
219
+ class DiffTransformer(nn.Module):
220
+
221
+ def __init__(self,):
222
+
223
+ super(DiffTransformer, self).__init__()
224
+
225
+ self.transformer = LinearAttentionTransformerEmbedding(
226
+ input_dim=num_classes,
227
+ output_dim=num_classes,
228
+ dim=transformer_dim,
229
+ heads=transformer_heads,
230
+ depth=transformer_depth,
231
+ n_blocks=transformer_blocks,
232
+ max_seq_len=L,
233
+ num_timesteps=diffusion_steps,
234
+ causal=False, # no autoregression
235
+ ff_dropout=0, # dropout for feedforward NN
236
+ attn_layer_dropout=input_dp_rate, # dropout right after self-att layer
237
+ attn_dropout=0, # dropout post-attention
238
+ n_local_attn_heads=transformer_local_heads,
239
+ # number of local attention heads for (QK)*V attention.
240
+ # this can be a tuple specifying the exact number of local
241
+ # attention heads at that depth
242
+ local_attn_window_size=transformer_local_size,
243
+ # receptive field of the local attention
244
+ reversible=transformer_reversible,
245
+ # use reversible nets, from reformer paper
246
+ )
247
+
248
+ self.rezero = Rezero()
249
+
250
+ def forward(self, x, t):
251
+ x = self.transformer(x,t)
252
+ # x = x.permute(0,2,1)
253
+ # x = self.rezero(x)
254
+ return x
255
+
256
+
257
+ model = DiffTransformer()
258
+
259
+ return model
260
+
261
+
262
+
263
+
Stage3_source/eval_metrics.py ADDED
@@ -0,0 +1,412 @@
1
+ """
2
+ description:
3
+ metrics to compute model performance
4
+ """
5
+
6
+ import Bio
7
+ from Bio.Align import substitution_matrices
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ import torch
11
+ import re
12
+
13
+ import Stage3_source.animation_tools as ani_tools
14
+
15
+
16
+ ' compute Blosum62 soft accuracy '
17
+ class blosum_soft_accuracy:
18
+
19
+ def __init__(self, ):
20
+
21
+ self.blosum62 = substitution_matrices.load("BLOSUM62")
22
+ self.alphabet = self.blosum62.alphabet
23
+
24
+ def blosum_acc(
25
+ self,
26
+ aa1: str,
27
+ aa2: str
28
+ ) -> np.single:
29
+
30
+ row = self.blosum62.alphabet.index(aa1)
31
+ col = self.blosum62.alphabet.index(aa2)
32
+ substitution_scores = self.blosum62[row, :].values()
33
+
34
+ # Apply the softmax function to the substitution scores to get a prob dist.
35
+ probs = np.exp(substitution_scores)/np.sum(np.exp(substitution_scores))
36
+
37
+ # compute the soft acc. as the dot product of the prob dist. with a one-hot encoding
38
+ # of the amino acid ...
39
+ correct_aa = aa2
40
+ correct_index = self.alphabet.index(correct_aa)
41
+ one_hot = np.zeros_like(probs)
42
+ one_hot[correct_index] = 1
43
+
44
+ # normalize acc.
45
+ soft_acc = np.dot(probs, one_hot) / np.max(probs)
46
+
47
+ return soft_acc
48
+
49
+ def split_seq(self, seq: str) ->list:
50
+ # no_pads = seq.count("<PAD>")
51
+ # split_seq = ["<START>"] + list(seq.replace("<START>","").replace("<END>","").replace("<PAD>","")) + ["<END>"] + ["<PAD>"] * no_pads
52
+ split_seq = re.split(r'(-|<START>|<END>|<PAD>|(?<=\w)(?=\w))', seq)
53
+ #split_seq = re.findall(r'<START>|<END>|<PAD>|[A-Z]|-|\*', seq)
54
+
55
+ # remove empty strings and whitespace-only elements
56
+ split_seq = [char for char in split_seq if char and char.strip()]
57
+ return split_seq
58
+
59
+
60
+
61
+ def compute_soft_accuracy(
62
+ self,
63
+ seq1_list: list,
64
+ seq2_list: list
65
+ ) -> float:
66
+
67
+ # make sure batch size matches
68
+ if len(seq1_list) == len(seq2_list):
69
+ self.batch_size = len(seq1_list)
70
+
71
+ else:
72
+ print("Please make sequence batch size equivalent...")
73
+
74
+ # make sure sequence length matches
75
+ if len(seq1_list[0]) == len(seq2_list[0]):
76
+ self.L = len(seq1_list[0])
77
+
78
+ else:
79
+ #print("Please make sequence length match...")
80
+ pass
81
+
82
+ avg_soft_acc_per_batch = 0
83
+ # loop over the batch of sequence
84
+ for seq1, seq2 in zip(seq1_list, seq2_list):
85
+
86
+ # split sequence into individual tokens
87
+ seq1 = self.split_seq(seq1)
88
+ seq2 = self.split_seq(seq2)
89
+ # set number of positions
90
+ self.L = len(seq2)
91
+ self.L_h = 0
92
+ self.L_s = 0
93
+ avg_soft_acc_per_seq = 0
94
+ avg_hard_acc_per_seq = 0
95
+
96
+ # loop over the amino acid positions
97
+ for aa1, aa2 in zip(seq1, seq2):
98
+
99
+ if (aa1 not in ['-', '<START>', '<END>', '<PAD>']) and (aa2 not in ['-', '<START>', '<END>', '<PAD>']):
100
+ self.L_s += 1
101
+ soft_acc = self.blosum_acc(aa1=aa1, aa2=aa2)
102
+ avg_soft_acc_per_seq += soft_acc
103
+ else:
104
+ self.L_h += 1
105
+ acc = 1*(aa1==aa2)
106
+ avg_hard_acc_per_seq += acc
107
+
108
+ # compute accuracy for soft positions
109
+ try:
110
+ avg_soft_acc_per_seq *= 1/self.L_s
111
+ except ZeroDivisionError:
112
+ #print("L_s cannot be zero. Setting avg_soft_acc_per_seq to zero.")
113
+ avg_soft_acc_per_seq = 0
114
+
115
+ # compute accuracy for hard positions
116
+ try:
117
+ avg_hard_acc_per_seq *= 1/self.L_h
118
+ except ZeroDivisionError:
119
+ #print("L_h cannot be zero. Setting avg_hard_acc_per_seq to zero.")
120
+ avg_hard_acc_per_seq = 0
121
+
122
+
123
+ # compute the average accuracy between soft and hard
124
+ if self.L_s == 0:
125
+ avg_soft_acc_per_batch += avg_hard_acc_per_seq
126
+ elif self.L_h == 0:
127
+ avg_soft_acc_per_batch += avg_soft_acc_per_seq
128
+ else:
129
+ avg_soft_acc_per_batch += (avg_soft_acc_per_seq + avg_hard_acc_per_seq)/2
130
+
131
+ avg_soft_acc_per_batch *= 1/self.batch_size
132
+ return avg_soft_acc_per_batch
133
+
134
+
135
+ def compute_ppl(probs: torch.Tensor) -> float:
136
+
137
+ batch_size, sequence_length, class_labels = probs.shape
138
+
139
+ # flatten batch and sequence dimensions into a single dimension
140
+ flattened_probs = probs.reshape(batch_size * sequence_length, class_labels)
141
+
142
+ # calc. perplexity for each sequence independently
143
+ ppl = []
144
+ for i in range(batch_size * sequence_length):
145
+ sequence_probs = flattened_probs[i]
146
+ # compute ppl per seq
147
+ sequence_ppl = torch.exp(-torch.sum(
148
+ sequence_probs * torch.log(sequence_probs)
149
+ )
150
+ )
151
+ ppl.append(sequence_ppl.item())
152
+
153
+ ppl = torch.tensor(ppl).view(batch_size, sequence_length) # ppl per sequence in a given batch
154
+ avg_ppl = ppl.mean().item() # average ppl per batch
155
+
156
+ return avg_ppl
157
+
158
+ def batch_compute_ppl(probs_list: list) -> float:
159
+
160
+ batch_prob = sum([
161
+ compute_ppl(probs=probs.unsqueeze(0).permute(0,2,1)) for probs in probs_list
162
+ ]) / len(probs_list)
163
+
164
+ return batch_prob
165
+
166
+
167
+ def compute_hard_acc(
168
+ seq1: str,
169
+ seq2: str
170
+ ) -> float:
171
+
172
+
173
+ hard_acc = sum([aa1 == aa2 for (aa1 ,aa2) in zip(seq1, seq2) if aa2 != '<PAD>'])
174
+ valid_length = len([aa2 for aa2 in seq2 if aa2 != '<PAD>'])
175
+ if valid_length == 0:
176
+ return 1.0
177
+
178
+ hard_acc /= valid_length
179
+
180
+ return hard_acc
181
+
182
+ #def compute_hard_acc(
183
+ # seq1: str,
184
+ # seq2: str
185
+ # ) -> float:
186
+ #
187
+ # hard_acc = sum([aa1 == aa2 for (aa1 ,aa2) in zip(seq1, seq2)])
188
+ # hard_acc *= 1/len(seq2)
189
+ # return hard_acc
190
+
191
+ def batch_hard_acc(seq1_list: list, seq2_list: list) -> float:
192
+
193
+ hard_acc = sum([
194
+ compute_hard_acc(seq1=seq1, seq2=seq2) for (seq1,seq2) in zip(seq1_list, seq2_list)
195
+ ]) / len(seq2_list)
196
+
197
+ return hard_acc
198
+
199
+
200
+ def time_split_on_seq(
201
+ seq: torch.Tensor,
202
+ sample_seq_path: torch.Tensor,
203
+ idx: torch.Tensor
204
+ ) -> (
205
+ list,
206
+ list,
207
+ list
208
+ ):
209
+
210
+
211
+ if len(seq.shape) != 2:
212
+ batch_size, class_labels, _ = seq.shape
213
+
214
+ # collect list
215
+ current_seq, prev_seq, fut_seq = [], [], []
216
+
217
+ for ii in range(batch_size):
218
+ current_stack_probs, prev_stack_probs, fut_stack_probs = [], [], []
219
+
220
+ for jj in range(class_labels):
221
+
222
+ # current probs
223
+ current_stack_probs.append(
224
+ seq[ii,jj][
225
+ (sample_seq_path.cpu()[ii] == idx.cpu()[ii])
226
+ ]
227
+ )
228
+
229
+ # prev probs
230
+ prev_stack_probs.append(
231
+ seq[ii,jj][
232
+ (sample_seq_path.cpu()[ii] < idx.cpu()[ii])
233
+ ]
234
+ )
235
+
236
+ # future probs
237
+ fut_stack_probs.append(
238
+ seq[ii,jj][
239
+ (sample_seq_path.cpu()[ii] > idx.cpu()[ii])
240
+ ]
241
+ )
242
+
243
+ current_seq.append(torch.stack(current_stack_probs))
244
+ prev_seq.append(torch.stack(prev_stack_probs))
245
+ fut_seq.append(torch.stack(fut_stack_probs))
246
+
247
+ else:
248
+ # split the sequences based on time indices
249
+ current_seq = [seq[ii][sample_seq_path[ii] == idx[ii]] for ii in range(seq.shape[0])]
250
+ prev_seq = [seq[ii][sample_seq_path[ii] < idx[ii]] for ii in range(seq.shape[0])]
251
+ fut_seq = [seq[ii][sample_seq_path[ii] > idx[ii]] for ii in range(seq.shape[0])]
252
+
253
+ return (
254
+ current_seq,
255
+ prev_seq,
256
+ fut_seq
257
+ )
258
+
259
+ @torch.no_grad()
260
+ def compute_acc_given_time_pos(
261
+ real_tokens: torch.Tensor,
262
+ sample_seq: torch.Tensor,
263
+ sample_path: torch.Tensor,
264
+ idx: torch.Tensor
265
+ ) -> (
266
+ float,
267
+ float,
268
+ float,
269
+ float,
270
+ float,
271
+ float
272
+ ):
273
+
274
+ # tokenizer
275
+ tokens = ['-', '<START>', 'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','<END>','<PAD>']
276
+ #tokens = ['<START>', 'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','<END>','<PAD>']
277
+ tokens = tokens + ['X', 'U', 'Z', 'B', 'O']
278
+
279
+
280
+ # split real tokens based on time indices
281
+ current_real_tokens, prev_real_tokens, fut_real_tokens = time_split_on_seq(
282
+ seq=real_tokens.cpu(),
283
+ sample_seq_path=sample_path.cpu(),
284
+ idx=idx.cpu()
285
+ )
286
+
287
+ # split sampled tokens based on time indices
288
+ current_sample_tokens, prev_sample_tokens, fut_sample_tokens = time_split_on_seq(
289
+ seq=sample_seq.cpu(),
290
+ sample_seq_path=sample_path.cpu(),
291
+ idx=idx.cpu()
292
+ )
293
+
294
+ # convert real sequences to characters
295
+ current_real_chars = [ani_tools.convert_num_to_char(tokens,seq_tokens) for seq_tokens in current_real_tokens]
296
+ prev_real_chars = [ani_tools.convert_num_to_char(tokens,seq_tokens) for seq_tokens in prev_real_tokens]
297
+ fut_real_chars = [ani_tools.convert_num_to_char(tokens,seq_tokens) for seq_tokens in fut_real_tokens]
298
+
299
+ # convert sample sequences to characters
300
+ current_sample_chars = [ani_tools.convert_num_to_char(tokens,seq_tokens) for seq_tokens in current_sample_tokens]
301
+ prev_sample_chars = [ani_tools.convert_num_to_char(tokens,seq_tokens) for seq_tokens in prev_sample_tokens]
302
+ fut_sample_chars = [ani_tools.convert_num_to_char(tokens,seq_tokens) for seq_tokens in fut_sample_tokens]
303
+
304
+
305
+
306
+ # drop empty entries in list (happens if t=0 or t=256)
307
+ # prev string sequences
308
+ prev_sample_chars = [item for item in prev_sample_chars if item]
309
+ prev_real_chars = [item for item in prev_real_chars if item]
310
+ # fut string sequences
311
+ fut_real_chars = [item for item in fut_real_chars if item]
312
+ fut_sample_chars = [item for item in fut_sample_chars if item]
313
+
314
+ # class object to compute blosum62 soft acc.
315
+ soft_acc_tool = blosum_soft_accuracy()
316
+
317
+ # split real sequence
318
+ prev_real_split_chars = [
319
+ soft_acc_tool.split_seq(sample) for sample in prev_real_chars
320
+ ]
321
+ fut_real_split_chars = [
322
+ soft_acc_tool.split_seq(sample) for sample in fut_real_chars
323
+ ]
324
+
325
+ # split sample sequence
326
+ prev_sample_split_chars = [
327
+ soft_acc_tool.split_seq(sample) for sample in prev_sample_chars
328
+ ]
329
+ fut_sample_split_chars = [
330
+ soft_acc_tool.split_seq(sample) for sample in fut_sample_chars
331
+ ]
332
+
333
+ # compute hard and soft accuracy
334
+ ' soft accuracy: '
335
+ # positions < t ( aa positions)
336
+ #prev_batch_soft_acc = soft_acc_tool.compute_soft_accuracy(
337
+ # seq1_list=prev_sample_chars,
338
+ # seq2_list=prev_real_chars
339
+ #)
340
+
341
+ # positions > t ( aa positions)
342
+ #fut_batch_soft_acc = soft_acc_tool.compute_soft_accuracy(
343
+ # seq1_list=fut_sample_chars,
344
+ # seq2_list=fut_real_chars
345
+ #)
346
+
347
+ # positions = t (aa positions)
348
+ #current_soft_acc = soft_acc_tool.compute_soft_accuracy(
349
+ #seq1_list=current_sample_chars,
350
+ #seq2_list=current_real_chars
351
+ #)
352
+
353
+ prev_batch_soft_acc, fut_batch_soft_acc, current_soft_acc = 0, 0, 0
354
+
355
+ ' hard accuracy: '
356
+ # positions < t ( aa positions)
357
+ prev_batch_hard_acc = batch_hard_acc(
358
+ seq1_list=prev_sample_split_chars,
359
+ seq2_list=prev_real_split_chars
360
+ )
361
+
362
+ # positions > t ( aa positions)
363
+ fut_batch_hard_acc = batch_hard_acc(
364
+ seq1_list=fut_sample_split_chars,
365
+ seq2_list=fut_real_split_chars
366
+ )
367
+
368
+ # positions = t (aa positions)
369
+ current_hard_acc = compute_hard_acc(
370
+ seq1=current_sample_chars,
371
+ seq2=current_real_chars
372
+ )
373
+
374
+ return (
375
+ prev_batch_hard_acc,
376
+ prev_batch_soft_acc,
377
+ fut_batch_hard_acc,
378
+ fut_batch_soft_acc,
379
+ current_hard_acc,
380
+ current_soft_acc
381
+ )
382
+
383
+
384
+ @torch.no_grad()
385
+ def compute_ppl_given_time_pos(
386
+ probs: torch.Tensor,
387
+ sample_path: torch.Tensor,
388
+ idx: torch.Tensor
389
+ ) -> (
390
+ float,
391
+ float,
392
+ float
393
+ ):
394
+
395
+ current_probs, prev_probs, fut_probs = time_split_on_seq(
396
+ probs.cpu(),
397
+ sample_seq_path=sample_path.cpu(),
398
+ idx=idx.cpu()
399
+ )
400
+
401
+ # ppl at the current time position (aa_i = t)
402
+ # current_ppl = compute_ppl(probs=torch.stack(current_probs).permute(0,2,1))
403
+ current_ppl = batch_compute_ppl(probs_list=current_probs)
404
+ # ppl at the prev and fut time positions (aa_i < t and aa_i > t)
405
+ prev_ppl = batch_compute_ppl(probs_list=prev_probs)
406
+ fut_ppl = batch_compute_ppl(probs_list=fut_probs)
407
+
408
+ return (
409
+ current_ppl,
410
+ prev_ppl,
411
+ fut_ppl
412
+ )
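As a sanity check, the accuracy and perplexity helpers can be exercised directly on toy inputs (a sketch; the sequences and probabilities below are made up):

import torch
from Stage3_source.eval_metrics import compute_hard_acc, compute_ppl

# hard accuracy over aligned token lists (positions where the reference is '<PAD>' are skipped)
print(compute_hard_acc(seq1=['A', 'C', 'D'], seq2=['A', 'C', 'D']))   # 1.0
print(compute_hard_acc(seq1=['A', 'C', 'E'], seq2=['A', 'C', 'D']))   # ~0.667

# per-position perplexity of a predicted distribution over 4 classes at 3 positions
probs = torch.softmax(torch.randn(2, 3, 4), dim=-1)   # (batch, positions, classes)
print(compute_ppl(probs))                             # between 1.0 (certain) and 4.0 (uniform)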
Stage3_source/helper_funcs.py ADDED
@@ -0,0 +1,32 @@
+ from pynvml import *
+
+
+ """
+ To track memory allocation, let's take advantage of the nvidia-ml-py3 package to query GPU memory allocation from Python.
+
+ ref: https://huggingface.co/docs/transformers/v4.20.1/en/perf_train_gpu_one
+ """
+
+
+ def print_gpu_initialization():
+     nvmlInit()
+     handle = nvmlDeviceGetHandleByIndex(0)
+     info = nvmlDeviceGetMemoryInfo(handle)
+     print(f"GPU memory occupied: {info.used//1024**2} MB.")
+     return info.used // 1024**2
+
+
+ def print_summary(result):
+     print(f"Time: {result.metrics['train_runtime']:.2f}")
+     print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
+     print_gpu_initialization()
Stage3_source/preprocess.py ADDED
@@ -0,0 +1,200 @@
1
+
2
+
3
+ import torch
4
+ import torch.optim as optim
5
+ from torch.utils.data import DataLoader, Dataset
6
+ from torchvision.datasets import MNIST
7
+ from torchvision.transforms import Compose, ToTensor, Resize
8
+ import torchvision.transforms as T
9
+
10
+
11
+ #from numba import jit
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+
16
+ def get_mnist_dataset(args:any) -> DataLoader:
17
+
18
+
19
+ if args.dataset == 'normal':
20
+
21
+ print(args.download)
22
+ transform = Compose([ToTensor(), Resize(args.image_size), lambda x: x > 0.5])
23
+ train_dataset = MNIST(root=args.data_root, download=True, transform=transform, train=True)
24
+ train_dataloader = DataLoader(
25
+ train_dataset,
26
+ num_workers=args.workers,
27
+ batch_size=args.batch_size,
28
+ shuffle=True,
29
+ pin_memory=True,
30
+ drop_last=True
31
+ )
32
+
33
+ elif args.dataset == 'sequence':
34
+
35
+ transform = Compose([ToTensor(), Resize(args.image_size), lambda x: x > 0.5, T.Lambda(lambda x: torch.flatten(x).unsqueeze(0))])
36
+ train_dataset = MNIST(root=args.data_root, download=True, transform=transform, train=True)
37
+ train_dataloader = DataLoader(
38
+ train_dataset,
39
+ num_workers=args.workers,
40
+ batch_size=args.batch_size,
41
+ shuffle=True,
42
+ pin_memory=True,
43
+ drop_last=True
44
+ )
45
+
46
+ else:
47
+ print('Please picker either normal or sequence')
48
+ quit()
49
+
50
+ return train_dataloader
51
+
52
+
53
+
54
+
55
+ ' Protein preprocessing tools '
56
+
57
+ #@jit(nopython=True)
58
+ def pad_ends(
59
+ seqs: list,
60
+ max_seq_length: int
61
+ ) -> list:
62
+
63
+ padded_seqs = [] # add padded gaps at the end of each sequence
64
+ for seq in seqs:
65
+
66
+ seq_length = len(seq)
67
+ # number of padded tokens
68
+ pad_need = max_seq_length - seq_length
69
+ # add number of padded tokens to the end
70
+ seq += '-'*pad_need
71
+
72
+ padded_seqs.append(seq)
73
+
74
+ return padded_seqs
75
+
76
+
77
+ # create numerical represented sqeuences
78
+ def create_num_seqs(seq_list: list) -> list:
79
+
80
+ # tokenizer
81
+ #tokens = ['*', '<START>', 'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','<END>', '-']
82
+ tokens = [ '<START>', 'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','<END>', '-']
83
+ # needed to lose these to the token list
84
+ tokens = tokens + ['X', 'U', 'Z', 'B', 'O']
85
+ token2int = {x:ii for ii, x in enumerate(tokens)}
86
+
87
+ # empty list to hold num rep. seqs.
88
+ num_seq_list = []
89
+ for seq in seq_list:
90
+ num_seq_list.append([token2int[aa] for aa in seq])
91
+
92
+ return num_seq_list
93
+
94
+ # prepare the protein sequences
95
+ def prepare_protein_data(
96
+ args: any,
97
+ data_dict: dict
98
+ ) -> (
99
+ list,
100
+ list
101
+ ):
102
+
103
+ print([key for key in data_dict.keys()])
104
+
105
+ print('Prepare dataset')
106
+ # prepare sequences
107
+ seq_list = [seq.replace('-','') for seq in data_dict[args.sequence_keyname]]
108
+ seq_list = [['<START>'] + list(seq) + ['<END>'] for seq in seq_list]
109
+ seq_lens = [len(seq) for seq in seq_list]
110
+
111
+ # Determine the maximum sequence length based on context window size
112
+ max_seq_len = int(args.diffusion_steps)
113
+
114
+ # Get indices of sequences that meet the criteria
115
+ valid_indices = [i for i, seq in enumerate(seq_list) if len(seq) <= max_seq_len]
116
+
117
+ # Filter num_seq_list based on these indices
118
+ filter_seq_list = [seq_list[i] for i in valid_indices]
119
+
120
+ max_seq_len = int(args.image_size * args.image_size)
121
+ padded_seq_list = pad_ends(
122
+ seqs=filter_seq_list,
123
+ max_seq_length=max_seq_len
124
+ )
125
+ num_seq_list = create_num_seqs(padded_seq_list) # numerical representations
126
+
127
+ # prepare class labels
128
+ #class_label_list = df.label.values.tolist()
129
+ if args.facilitator in ['MSE', 'MMD']:
130
+ text_emb = data_dict['text_to_protein_embedding']
131
+ elif args.facilitator in ['Default']:
132
+ text_emb = data_dict['text_embedding']
133
+ else:
134
+ raise ValueError(f"Unexpected value for 'facilitator': {args.facilitator}")
135
+
136
+ text_emb = [text_emb[i] for i in valid_indices]
137
+ # prune sequence and texts out based on length
138
+
139
+ print('Finished preparing dataset')
140
+ #
141
+
142
+
143
+ return (
144
+ num_seq_list,
145
+ text_emb
146
+ )
147
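+ # Hedged sketch of the inputs prepare_protein_data expects; the args fields and
+ # dictionary keys mirror the ones referenced above, but the values are toy
+ # placeholders (a real run would pass the Stage 2 output dictionary).
+ def _example_prepare_protein_data():
+     from argparse import Namespace
+     toy_args = Namespace(sequence_keyname='sequence', diffusion_steps=1024,
+                          image_size=32, facilitator='MMD')
+     toy_data = {
+         'sequence': ['ACDW', 'WYC'],
+         'text_to_protein_embedding': [torch.zeros(512), torch.zeros(512)],
+     }
+     return prepare_protein_data(args=toy_args, data_dict=toy_data)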
+
148
+
149
+ class protein_dataset(Dataset):
150
+ """
151
+
152
+ Sequence dataloader
153
+
154
+ """
155
+
156
+ def __init__(
157
+ self,
158
+ num_seq_list: list,
159
+ text_emb: torch.Tensor
160
+ ):
161
+
162
+ if not torch.is_tensor(num_seq_list):
163
+ self.num_seqs = torch.tensor(num_seq_list).float()
164
+
165
+ else:
166
+ self.num_seqs = num_seq_list.float() # input is already a tensor
167
+
168
+ self.text_emb = text_emb
169
+
170
+ #if not torch.is_tensor(class_label_list):
171
+ # self.class_label = torch.tensor(class_label_list).float()
172
+
173
+ def __len__(self):
174
+ """
175
+ number of samples total
176
+ """
177
+ return len(self.num_seqs)
178
+
179
+ def __getitem__(self, idx: any) -> (
180
+ torch.FloatTensor,
181
+ torch.FloatTensor
182
+ ):
183
+
184
+ """
185
+ extract and return the data batch samples
186
+ """
187
+
188
+ # convert and return the data batch samples
189
+ if torch.is_tensor(idx):
190
+ idx = idx.tolist()
191
+
192
+ # sequences
193
+ num_seqs = self.num_seqs[idx]
194
+ # class labels
195
+ text_emb = self.text_emb[idx]
196
+
197
+ return (
198
+ num_seqs,
199
+ text_emb
200
+ )
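+
+ # Hedged usage sketch: wrapping the outputs of prepare_protein_data in the dataset
+ # and a dataloader; the batch size and the zero text embeddings are toy values.
+ def _example_protein_dataloader() -> DataLoader:
+     num_seqs = [[0, 1, 2, 21, 22, 22], [0, 19, 20, 2, 21, 22]]
+     text_emb = torch.zeros(2, 512)
+     dataset = protein_dataset(num_seq_list=num_seqs, text_emb=text_emb)
+     return DataLoader(dataset, batch_size=2, shuffle=True, drop_last=True)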
Stage3_source/sampling_analysis.py ADDED
@@ -0,0 +1,276 @@
1
+ import os
2
+ import numpy as np
3
+ import random
4
+ import pandas as pd
5
+ import math
6
+ from tqdm import tqdm
7
+ import time
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from torch.utils.data import DataLoader
13
+
14
+ import Stage3_source.preprocess as prep
15
+ import Stage3_source.cond_diff_transformer_layer as mod
16
+ import Stage3_source.transformer_training_helper as train_helper
17
+
18
+
19
+
20
+ # generate missing pixels with one shot
21
+ @torch.no_grad()
22
+ def cond_autocomplete_real_samples(
23
+ model: nn.Module,
24
+ args: any,
25
+ realization: torch.Tensor,
26
+ y_c: torch.Tensor,
27
+ idx: torch.Tensor
28
+ ) -> (
29
+ any,
30
+ torch.Tensor,
31
+ torch.Tensor,
32
+ torch.Tensor,
33
+ torch.Tensor,
+ torch.Tensor,
+ torch.Tensor
34
+ ):
35
+
36
+ model.eval()
37
+ bs, channel, seq_length = realization.size()
38
+ # get a batch of random sampling paths
39
+ sampled_random_path = train_helper.sample_random_path(bs, seq_length, device=args.device)
40
+ # create a mask that masks the locations where we've already sampled
41
+ random_path_mask = train_helper.create_mask_at_random_path_index(sampled_random_path, idx, bs, seq_length)
42
+ # tokenize realizations
43
+ real_tokens, bs, seq_length= train_helper.create_token_labels(args, realization)
44
+ #real_tokens = realization.clone().squeeze(1)
45
+
46
+ # mask realizations
47
+ real_token_masked = train_helper.mask_realizations(real_tokens, random_path_mask)
48
+ # conditional probability
49
+ conditional_prob, probs = train_helper.cond_predict_conditional_prob(model, real_token_masked, y_c, idx, args)
50
+ # evaluate the value of the log probability for the given realization:
51
+ log_prob = train_helper.log_prob_of_realization(args, conditional_prob, real_tokens)
52
+
53
+ return (
54
+ conditional_prob,
55
+ probs.cpu(),
56
+ real_token_masked.cpu(),
57
+ real_tokens.cpu(),
58
+ log_prob.cpu(),
59
+ sampled_random_path.cpu(),
60
+ random_path_mask.cpu()
61
+ )
62
+
63
+
64
+ # get the label for the corresponding sequence in the dataloader
65
+ def extract_samples_with_labels(
66
+ dataloader: DataLoader,
67
+ target_labels: int,
68
+ total_num: int,
69
+ pad_included: bool=False
70
+ ) -> dict:
71
+
72
+ extracted_sampled = {
73
+ 'sample': [],
74
+ 'label': []
75
+ }
76
+
77
+ for data, labels in dataloader:
78
+ for i, label in enumerate(labels):
79
+
80
+ if label.item() == target_labels:
81
+
82
+ if pad_included:
83
+ pass
84
+ else:
85
+ data[i] += 1 # account for the absorbing state (i.e. make room)
86
+
87
+ extracted_sampled['sample'].append(data[i]) # account for absorbed state
88
+ extracted_sampled['label'].append(label)
89
+ if len(extracted_sampled['label']) == total_num:
90
+ return extracted_sampled
91
+
92
+ return extracted_sampled
93
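+ # Hedged sketch: pulling the first few samples with a given class label from a
+ # labeled dataloader (e.g. the MNIST loader built in Stage3_source.preprocess);
+ # the label value 3 and the count 4 are arbitrary toy choices.
+ def _example_extract_samples(dataloader: DataLoader) -> dict:
+     return extract_samples_with_labels(dataloader, target_labels=3, total_num=4)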
+
94
+
95
+ # mask a given percentage of the sample
96
+ def corrupt_samples(
97
+ args: any,
98
+ realization: torch.Tensor,
99
+ perc: float
100
+ ) -> torch.Tensor:
101
+
102
+ bs, channels, seq_length = realization.size()
103
+
104
+ # number of samples to corrupt (i.e. idx)
105
+ idx = (args.diffusion_steps * torch.Tensor([perc])).to(int).to(args.device)
106
+ # get a batch of random sampling paths
107
+ sampled_random_path = train_helper.sample_random_path(bs, seq_length, device=args.device)
108
+ # we create a mask that masks the locations where we've already sampled
109
+ random_path_mask = train_helper.create_mask_at_random_path_index(sampled_random_path, idx, bs, seq_length)
110
+ # tokenize realizations
111
+ real_tokens, bs, seq_length= train_helper.create_token_labels(args, realization)
112
+ # mask realizations
113
+ real_token_masked = train_helper.mask_realizations(real_tokens, random_path_mask)
114
+
115
+ return (
116
+ real_token_masked,
117
+ sampled_random_path,
118
+ idx
119
+ )
120
+
121
+ # inpaint missing regions by predicting the next position
122
+ @torch.no_grad()
123
+ def predict_next_index(
124
+ model: nn.Module,
125
+ args: any,
126
+ mask_realization: torch.Tensor,
127
+ y_c: torch.Tensor,
128
+ idx: torch.Tensor
129
+ ) -> (
130
+ any,
131
+ torch.Tensor
136
+ ):
137
+
138
+ model.eval()
139
+ bs, channel, seq_length = mask_realization.size()
140
+
141
+ # conditional prob
142
+ conditional_prob, probs = train_helper.cond_predict_conditional_prob(model, mask_realization.squeeze(1), y_c, idx, args)
143
+
144
+ return (
145
+ conditional_prob,
146
+ probs.cpu(),
147
+ )
148
+
149
+
150
+
151
+
152
+ def generate_denoised_sampled(
153
+ args: any,
154
+ model: nn.Module,
155
+ extract_digit_samples: torch.Tensor,
156
+ extract_time: torch.Tensor,
157
+ extract_digit_label: torch.Tensor,
158
+ sampling_path: torch.Tensor
159
+ ) -> (
160
+ list,
161
+ list
162
+ ):
163
+
164
+ mask_realization_list, time_idx_list = [], []
165
+
166
+ # prepare data
167
+ temp_y_c = extract_digit_label.to(args.device)
168
+ temp_mask_realization = extract_digit_samples.unsqueeze(1).long().to(args.device)
169
+ temp_idx = torch.Tensor([extract_time]).to(args.device).squeeze(0)
170
+ temp_sampling_path = sampling_path.to(args.device)
171
+
172
+ for ii in tqdm(range(int(temp_idx.item()), args.diffusion_steps)):
173
+
174
+ # where we need to sample next
175
+ current_location = temp_sampling_path == temp_idx
176
+ print(current_location.shape)
177
+
178
+ # make position prediction
179
+ conditional_prob, prob = predict_next_index(
180
+ model=model,
181
+ args=args,
182
+ mask_realization=temp_mask_realization,
183
+ y_c=temp_y_c,
184
+ idx=temp_idx
185
+ )
186
+
187
+ # get the label for the next token position
188
+ next_temp_realization = torch.argmax(
189
+ conditional_prob.sample(), dim=-1
190
+ )
191
+
192
+ temp_mask_realization[0, current_location] = next_temp_realization[current_location]
193
+ mask_realization_list.append(temp_mask_realization.cpu().numpy())
194
+ time_idx_list.append(temp_idx.cpu().numpy())
195
+ temp_idx+=1
196
+
197
+
198
+ return (
199
+ mask_realization_list,
200
+ time_idx_list
201
+ )
202
+
203
+
204
+ def batch_generate_denoised_sampled(
205
+ args: any,
206
+ model: nn.Module,
207
+ extract_digit_samples: torch.Tensor,
208
+ extract_time: torch.Tensor,
209
+ extract_digit_label: torch.Tensor,
210
+ sampling_path: torch.Tensor
211
+ ) -> (list, list):
212
+
213
+ # Ensure batch dimension consistency across input tensors
214
+ assert extract_digit_samples.size(0) == extract_digit_label.size(0) == sampling_path.size(0) == extract_time.size(0), "Mismatched batch dimensions"
215
+
216
+ batch_size = extract_digit_samples.size(0)
217
+ mask_realization_list, time_idx_list = [], []
218
+ print('batch_size:', batch_size)
219
+
220
+ # Prepare data
221
+ temp_y_c = extract_digit_label.to(args.device)
222
+ temp_mask_realization = extract_digit_samples.unsqueeze(1).long().to(args.device)
223
+ temp_idx = extract_time.unsqueeze(-1).to(args.device) # Adding an extra dimension for batch processing
224
+ temp_sampling_path = sampling_path.to(args.device)
225
+ print(f"Starting temp_idx: {temp_idx[0].item()}")
226
+
227
+ start_time_index = temp_idx[0].item() # assume all temp_idx is the same values
228
+ max_diffusion_step = args.diffusion_steps # max number of timesteps
229
+
230
+
231
+ for ii in tqdm(range(start_time_index, max_diffusion_step), initial=start_time_index, total=max_diffusion_step):
232
+
233
+ # Check if any temp_idx has reached or exceeded diffusion_steps
234
+ if torch.any(temp_idx >= args.diffusion_steps):
235
+ break
236
+
237
+ # Broadcast ii to match the batch size
238
+ current_ii = torch.full((batch_size,), ii, dtype=torch.long, device=args.device)
239
+
240
+ # Make position prediction
241
+ conditional_prob, prob = predict_next_index(
242
+ model=model,
243
+ args=args,
244
+ mask_realization=temp_mask_realization,
245
+ y_c=temp_y_c,
246
+ idx=temp_idx
247
+ )
248
+
249
+
250
+ # Get the label for the next token position
251
+ next_temp_realization = torch.argmax(conditional_prob.sample(), dim=-1)
252
+
253
+ # Update temp_mask_realization for each item in the batch
254
+ current_location = temp_sampling_path == temp_idx # Adding an extra dimension for comparison
255
+ current_location = torch.argmax(current_location.detach().cpu()*1, dim=-1)
256
+ temp_mask_realization[:, 0, current_location] = next_temp_realization[:,current_location]
257
+
258
+ # Append results for each item in the batch
259
+ mask_realization_list.append(temp_mask_realization.cpu().numpy())
260
+ time_idx_list.append(temp_idx.cpu().numpy())
261
+
262
+ # Increment temp_idx for the next iteration
263
+ temp_idx += 1
264
+
265
+ return mask_realization_list, time_idx_list
266
+
267
+
268
+
269
+ # convert sequence with numerical variables into character letters
270
+ def convert_num_to_chars(
271
+ tokenizer: any,
272
+ num_seq: list
273
+ ) -> list:
274
+
275
+ char_seq = [tokenizer[num] for num in num_seq]
276
+ return "".join(char_seq)
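+
+ # Hedged sketch: decoding a toy numeric sequence with a list-style tokenizer
+ # (index -> character); the short token list below is illustrative only.
+ def _example_convert_num_to_chars() -> str:
+     toy_tokenizer = ['-', '<START>', 'A', 'C', '<END>']
+     return convert_num_to_chars(toy_tokenizer, [1, 2, 3, 4])   # '<START>AC<END>'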
Stage3_source/transformer_sampling_helper.py ADDED
@@ -0,0 +1,12 @@
1
+ import itertools
2
+ from pathlib import Path
3
+ import numpy as np
4
+ from tqdm.auto import tqdm
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.nn as nn
8
+ from torch.distributions import OneHotCategorical
9
+ from torch.distributions import Categorical
10
+
11
+
12
+
Stage3_source/transformer_training_helper.py ADDED
@@ -0,0 +1,557 @@
1
+ import itertools
2
+ from pathlib import Path
3
+ import numpy as np
4
+ from tqdm.auto import tqdm
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.nn as nn
8
+ from torch.distributions import OneHotCategorical
9
+ from torch.distributions import Categorical
10
+
11
+ import Stage3_source.eval_metrics as eval_funcs
12
+
13
+ # functions adapted for token-based transformers instead of Unet images (hat tip to author: LukasMosser)
14
+
15
+ ' sample random paths '
16
+ def sample_random_path(
17
+ batch_size: int,
18
+ seq_length: int,
19
+ device: str='cuda'
20
+ ) -> torch.Tensor:
21
+
22
+ # create a batch of random sampling paths
23
+ random_paths = torch.stack(
24
+ [torch.randperm(seq_length, device=device) for _ in range(batch_size)],
25
+ axis=0
26
+ )
27
+ # sequential paths
28
+ #random_paths = torch.stack(
29
+ # [torch.arange(seq_length, device=device) for _ in range(batch_size)],
30
+ # axis=0
31
+ #)
32
+ return random_paths
33
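+ # Hedged sketch: a batch of two random generation orders over an 8-token
+ # sequence, drawn on CPU; each row is a permutation of 0..7.
+ def _example_sample_random_path() -> torch.Tensor:
+     return sample_random_path(batch_size=2, seq_length=8, device='cpu')   # shape (2, 8)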
+
34
+ ' create masks to indicate positions that we have sampled already '
35
+ def create_mask_at_random_path_index(
36
+ sample_random_path: torch.Tensor,
37
+ idx: any,
38
+ batch_size: int,
39
+ seq_length: int
40
+ ) -> torch.Tensor:
41
+
42
+ # create a mask that has 1s everywhere we've sampled and 0's everywhere else
43
+ mask = (sample_random_path < idx)
44
+ return mask
45
+
46
+ ' create a (batched) mask of where we are now sampling '
47
+ def create_sampling_location_mask(
48
+ sampled_random_path: torch.Tensor,
49
+ idx: any,
50
+ batch_size: int,
51
+ seq_length: int
52
+ ) -> torch.Tensor:
53
+
54
+ # create a binary mask that has 1 at the current location for us to sample
55
+ sampling_location_mask = (sampled_random_path == idx).long()
56
+ return sampling_location_mask
57
+
58
+ ' create masks to indicate positions beyond the current sampling position '
59
+ def create_mask_at_future_path_index(
60
+ sampled_random_path: torch.Tensor,
61
+ idx: any,
62
+ batch_size: int,
63
+ seq_length: int
64
+ ) -> torch.Tensor:
65
+
66
+ # create a mask that has 1s everywhere we have not yet sampled and
67
+ # 0's everywhere we previously and currently sampled
68
+ sampling_future_mask = (sampled_random_path > idx).long()
69
+ return sampling_future_mask
70
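+ # Hedged sketch: for one random path and a step index t, the three masks above
+ # split positions into already sampled (< t), currently sampled (== t), and
+ # future (> t); the batch size, length, and t values below are toy choices.
+ def _example_path_masks() -> torch.Tensor:
+     bs, seq_length = 2, 8
+     path = sample_random_path(bs, seq_length, device='cpu')
+     idx = torch.tensor([[3], [5]])   # current step per batch element
+     past = create_mask_at_random_path_index(path, idx, bs, seq_length)
+     current = create_sampling_location_mask(path, idx, bs, seq_length)
+     future = create_mask_at_future_path_index(path, idx, bs, seq_length)
+     return past.long() + current + future   # all ones: the masks partition positions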
+
71
+ ' sampling from the probability distribution '
72
+ def sample_from_conditional(conditional_prob: any) -> torch.Tensor:
73
+ # sample from the categorical dist.
74
+ return conditional_prob.sample().permute(0,2,1)
75
+
76
+ ' compute entropy of the model predicted probability distribution '
77
+ def compute_entropy(conditional_prob: any) -> torch.Tensor:
78
+ # we can directly compute the entropy of the categorical distribution
79
+ return conditional_prob.entropy()
80
+
81
+ ' sampling the time trajectory '
82
+ class exp_weight_time_sample:
83
+
84
+ def __init__(self, timesteps: int, decay_rate: float):
85
+
86
+ self.timesteps = timesteps
87
+ self.decay_rate = decay_rate
88
+ # compute the weight based on the exp function
89
+ self.weights = torch.tensor(
90
+ [torch.exp(-torch.tensor([i])*decay_rate) for i in range(self.timesteps)]
91
+ )
92
+
93
+ # normalize weights
94
+ self.weights /= self.weights.sum()
95
+
96
+ def sample(self, batch_size: int) -> torch.Tensor:
97
+ # generate random samples
98
+ samples = torch.multinomial(self.weights, batch_size, replacement=True)
99
+ return samples
100
+
101
+ def sample_random_index_for_sampling(
102
+ batch_size: int,
103
+ seq_length: int,
104
+ device: str='cuda',
105
+ option: str='random'
106
+ ) -> any:
107
+
108
+ if option == 'random':
109
+ # sample a random index where we want to sample next
110
+ idx = torch.randint(
111
+ low=0,
112
+ high=seq_length+1,
113
+ size=(batch_size,1),
114
+ device=device,
115
+ requires_grad=False
116
+ )
117
+
118
+ elif option == 'weighted':
119
+ time_sampler = exp_weight_time_sample(timesteps=seq_length+1, decay_rate=0.005)
120
+ # sample a weighted random index where we want to sample next
121
+ idx = time_sampler.sample(batch_size=batch_size).unsqueeze(1).to(device)
122
+
123
+ return idx
124
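+ # Hedged sketch: one random time index per batch element, drawn uniformly from
+ # [0, seq_length]; the returned tensor has shape (batch_size, 1).
+ def _example_sample_idx() -> torch.Tensor:
+     return sample_random_index_for_sampling(batch_size=4, seq_length=16, device='cpu', option='random')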
+
125
+ #' log probs from realization '
126
+ def log_prob_of_realization(
127
+ args: any,
128
+ conditional_prob: any,
129
+ real_tokens: torch.Tensor
130
+ ) -> torch.Tensor:
131
+ # compute the log-prob of a given realization
132
+ #log_prob = conditional_prob._categorical.log_prob(real_tokens.to(args.device))
133
+ log_prob = conditional_prob._categorical.log_prob(real_tokens)
134
+ # log_prob = conditional_prob.log_prob(real_tokens.to(args.device))
135
+ return log_prob
136
+
137
+
138
+ #' get the log probabilities of the unsampled locations '
139
+ #def log_prob_of_unsampled_locations(
140
+ # log_prob: torch.Tensor,
141
+ # token_mask: torch.Tensor,
142
+ # real_tokens: torch.Tensor
143
+ # ) -> torch.Tensor:
144
+ #
145
+ # # unsampled token positions (i.e. absorbing states)
146
+ # unsampled_mask = (token_mask == 0) * 1
147
+ # # non-padded tokens
148
+ # non_padded_mask = (real_tokens != 23) * 1
149
+ # # final mask is absorbing states that do not belong to padded tokens
150
+ # final_unsampled_mask = unsampled_mask & non_padded_mask
151
+ # # compute the total log prob of the unsampled locations, taking sum over log-probs
152
+ # log_prob_unsampled = ( final_unsampled_mask * log_prob)
153
+ # # sum log probs at absorbing positions
154
+ # summed_log_prob_unsampled = log_prob_unsampled.sum(1)
155
+ #
156
+ # return summed_log_prob_unsampled
157
+
158
+
159
+ ' get the log probabilities of the unsampled locations '
160
+ def log_prob_of_unsampled_locations(
161
+ log_prob: torch.Tensor,
162
+ token_mask: torch.Tensor
163
+ ) -> torch.Tensor:
164
+
165
+ # compute the total log prob of the unsampled locations, taking sum over log-probs
166
+ log_prob_unsampled = ((token_mask == 0)*1 * log_prob)
167
+
168
+ return log_prob_unsampled.sum(1)
169
+
170
+ ' weight the unsampled log probs '
171
+ def weight_log_prob(
172
+ log_prob_unsampled: torch.Tensor,
173
+ idx: any,
174
+ seq_length
175
+ ) -> torch.Tensor:
176
+ # compute the average log-prob over the unsampled locations
177
+ log_prob_weighted = 1/(seq_length - idx.squeeze(1) + 1) * log_prob_unsampled
178
+ return log_prob_weighted
179
+
180
+ ' get mean log prob over the batch '
181
+ def compute_average_loss_for_batch(log_prob_weighted: torch.Tensor) -> torch.Tensor:
182
+ # compute a (negative) average over the batch elements to obtain an unbiased estimator of the loss
183
+ loss = -log_prob_weighted.mean()
184
+ return loss
185
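+ # Hedged sketch of the loss pipeline above: sum the log-probs over positions that
+ # are still masked, rescale by 1/(seq_length - t + 1), and negate the batch mean;
+ # the log-probs, mask, and time indices below are toy values.
+ def _example_unsampled_loss() -> torch.Tensor:
+     log_prob = torch.randn(2, 8)            # per-position log-probabilities
+     token_mask = torch.zeros(2, 8).long()   # 0 = still masked / unsampled
+     idx = torch.tensor([[3], [5]])          # current step t per batch element
+     unsampled = log_prob_of_unsampled_locations(log_prob, token_mask)
+     weighted = weight_log_prob(unsampled, idx, seq_length=8)
+     return compute_average_loss_for_batch(weighted)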
+
186
+ ' create the numerical tokenized input data for transformer '
187
+ def create_token_labels(args, realization) -> (
188
+ torch.Tensor,
189
+ int,
190
+ int
191
+ ):
192
+
193
+ bs, channel, seq_length = realization.size()
194
+ temp_real = realization.reshape(bs, channel, seq_length)*1
195
+
196
+ if args.task == 'MNIST':
197
+ real_tokens = (temp_real == 1)*2 + (temp_real == 0)*1 # numerical token labels for MNIST
198
+
199
+ elif args.task == 'proteins':
200
+ real_tokens = temp_real + 1
201
+ # background --> label 1
202
+ # foreground --> label 2
203
+ # mask (absorbing state) --> label 0
204
+ return (
205
+ real_tokens.squeeze(1),
206
+ bs,
207
+ seq_length
208
+ )
209
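+ # Hedged sketch: for args.task == 'proteins', token labels are shifted by +1 so that
+ # 0 is free to act as the mask / absorbing state; input shape is (batch, channel,
+ # seq_length) and the toy values below are illustrative.
+ def _example_create_token_labels(args) -> torch.Tensor:
+     realization = torch.tensor([[[0, 1, 2, 21]]])   # e.g. output of create_num_seqs
+     real_tokens, bs, seq_length = create_token_labels(args, realization)
+     return real_tokens   # tensor([[1, 2, 3, 22]]) for the proteins task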
+
210
+ ' mask the positions for predictions/denoising '
211
+ def mask_realizations(
212
+ real_tokens: torch.Tensor,
213
+ random_path_mask: torch.Tensor
214
+ ) -> torch.Tensor:
215
+
216
+ out_real_tokens = real_tokens.clone()
217
+ # batch size
218
+ bs = random_path_mask.shape[0]
219
+ # convert random path to boolean
220
+ bool_rand_path_mask = random_path_mask.to(dtype=torch.bool)
221
+ # positional masks
222
+ # mask the future sample positions
223
+ future_mask_positions = ((~bool_rand_path_mask)*1).squeeze(1)
224
+
225
+ for ii in range(bs):
226
+
227
+ mask_positions = future_mask_positions[ii].nonzero().tolist()
228
+ # insert mask tokens
229
+ out_real_tokens[ii, mask_positions] = 0
230
+
231
+ return out_real_tokens
232
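+ # Hedged sketch: positions whose path mask is 0 (not yet reached on the sampling
+ # path) are overwritten with the absorbing token 0; the toy tensors are illustrative.
+ def _example_mask_realizations() -> torch.Tensor:
+     real_tokens = torch.tensor([[1, 2, 3, 22]])
+     path_mask = torch.tensor([[1, 0, 1, 0]])   # 1 = already sampled
+     return mask_realizations(real_tokens, path_mask)   # tensor([[1, 0, 3, 0]])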
+
233
+
234
+ ' model prediction '
235
+ def predict_conditional_prob(
236
+ model: nn.Module,
237
+ real_token_masked: torch.Tensor,
238
+ idx: any,
239
+ args: any
240
+ ) -> (
241
+ any,
242
+ torch.Tensor
243
+ ):
244
+ #logits = model(x=real_token_masked.to(args.device), t=idx.view(-1,))
245
+ logits = model(x=real_token_masked, t=idx.view(-1,))
246
+ probs = F.softmax(
247
+ logits,
248
+ dim=1
249
+ )
250
+
251
+ conditional_prob = OneHotCategorical(probs=probs.permute(0,2,1))
252
+
253
+ return (
254
+ conditional_prob,
255
+ probs
256
+ )
257
+
258
+
259
+ """
260
+ Here, we compute the previous position tokens, current token position, and future token positions, where
261
+ past, current, and future are defined by the time trajectory.
262
+ """
263
+
264
+ ' sample from model '
265
+ @torch.no_grad()
266
+ def sample_from_conditional(conditional_prob: any) -> torch.Tensor:
267
+ # draw a sample from the categorical dist.
268
+ cond_prob_sample = conditional_prob.sample().permute(0,2,1)
269
+ return cond_prob_sample
270
+
271
+ ' compute the accuracy at the current sampling location '
272
+ @torch.no_grad()
273
+ def sample_recover(
274
+ real_tokens: torch.Tensor,
275
+ cond_prob_sample: torch.Tensor,
276
+ current_path_mask: torch.Tensor
277
+ ) -> float:
278
+
279
+ # remove from gpu
280
+ real_tokens.cpu()
281
+ cond_prob_sample.cpu()
282
+ current_path_mask.cpu()
283
+
284
+ # current sampling index
285
+ current_tensor_pos = torch.argmax((current_path_mask == 1)*1, dim=-1)
286
+
287
+ # model predictions match the ground truth label at current sampling index
288
+ match_preds = [(
289
+ real_tokens[seq_idx, ii] == torch.argmax(cond_prob_sample, dim=1)[seq_idx, ii]
290
+ ).item()*1 for seq_idx, ii in enumerate(current_tensor_pos.cpu().numpy())
291
+ ]
292
+
293
+ return sum(match_preds)/len(match_preds)
294
+
295
+
296
+ ' compute the accuracy of previous conditionally sampled locations '
297
+ @torch.no_grad()
298
+ def compute_prev_token_acc(
299
+ cond_real_tokens: torch.Tensor,
300
+ cond_prob_sample: torch.Tensor,
301
+ path_mask: torch.Tensor
302
+ ) -> np.ndarray:
303
+
304
+ # remove from gpu
305
+ cond_real_tokens.cpu()
306
+ cond_prob_sample.cpu()
307
+ path_mask.cpu()
308
+
309
+ # class labels of the sampled model prediction
310
+ cond_sample_tokens = torch.argmax(cond_prob_sample, dim=1)
311
+ matches = []
312
+ for ii , sample_pos in enumerate(path_mask):
313
+
314
+ temp_real_tokens = cond_real_tokens[ii, sample_pos.nonzero()].squeeze(1)
315
+ temp_sample_tokens = cond_sample_tokens[ii, sample_pos.nonzero()].squeeze(1)
316
+ matches.append(
317
+ (temp_real_tokens == temp_sample_tokens).tolist()
318
+ )
319
+
320
+ acc = []
321
+ for match in matches:
322
+
323
+ try:
324
+ acc.append(sum(match*1)/len(match))
325
+
326
+ except ZeroDivisionError:
327
+ acc.append(0)
328
+
329
+ return np.mean(acc)
330
+
331
+
332
+
333
+ ' compute the accuracy of previous conditionally sampled locations '
334
+ @torch.no_grad()
335
+ def compute_future_token_acc(
336
+ cond_real_tokens: torch.Tensor,
337
+ cond_prob_sample: torch.Tensor,
338
+ path_mask: torch.Tensor
339
+ ) -> np.ndarray:
340
+
341
+ # remove from gpu
342
+ cond_real_tokens.cpu()
343
+ cond_prob_sample.cpu()
344
+ path_mask.cpu()
345
+
346
+ # class labels of the sampled model prediction
347
+ cond_sample_tokens = torch.argmax(cond_prob_sample, dim=1)
348
+ matches = []
349
+ for ii, sample_pos in enumerate(path_mask):
350
+
351
+ temp_real_tokens = cond_real_tokens[ii, sample_pos.nonzero()].squeeze(1)
352
+ temp_sample_tokens = cond_sample_tokens[ii, sample_pos.nonzero()].squeeze(1)
353
+ matches.append(
354
+ (temp_real_tokens == temp_sample_tokens).tolist()
355
+ )
356
+
357
+ acc = []
358
+ for match in matches:
359
+ try:
360
+ acc.append(sum(match*1)/len(match))
361
+ except ZeroDivisionError:
362
+ acc.append(0)
363
+ return np.mean(acc)
364
+
365
+ @torch.no_grad()
366
+ def compute_pos_entropy(probs: torch.Tensor) -> torch.Tensor:
367
+
368
+ # average positional entropy
369
+ pos_entropy = torch.mean(torch.mean(-probs * torch.log(probs), dim = 1), dim = 0)
370
+ return pos_entropy
371
+
372
+
373
+ def elbo_objective(
374
+ model: nn.Module,
375
+ realization: torch.Tensor,
376
+ args: any
377
+ ) -> (
378
+ torch.Tensor,
379
+ float,
380
+ float,
381
+ float,
382
+ torch.Tensor
383
+ ):
384
+
385
+ bs, channel, seq_length = realization.size()
386
+
387
+ # get a batch of random sampling paths
388
+ sampled_random_path = sample_random_path(bs, seq_length, device=args.device)
389
+ # sample a set of random sampling steps for each individual training image in the current batch
390
+ idx = sample_random_index_for_sampling(bs, seq_length, device=args.device, option='random')
391
+ # we create a mask that masks the locations where we've already sampled
392
+ random_path_mask = create_mask_at_random_path_index(sampled_random_path, idx, bs, seq_length)
393
+ # create a mask that masks the locations where we are currently sampling
394
+ current_path_mask = create_sampling_location_mask(sampled_random_path, idx, bs, seq_length)
395
+ # future sampling locations (i.e. >t)
396
+ future_path_mask = create_mask_at_future_path_index(sampled_random_path, idx, bs, seq_length)
397
+ # tokenize realizations
398
+ real_tokens, bs, seq_length = create_token_labels(args, realization)
399
+ # mask realizations
400
+ real_token_masked = mask_realizations(real_tokens, random_path_mask)
401
+ # conditional probs
402
+ conditional_prob, probs = predict_conditional_prob(model, real_token_masked, idx, args)
403
+ # evaluate the value of the log prob for the given realization
404
+ log_prob = log_prob_of_realization(args, conditional_prob, real_tokens)
405
+ # compute an average over all the unsampled locations for each image in the batch
406
+ #log_prob_unsampled = log_prob_of_unsampled_locations(log_prob.to(args.device), real_token_masked.to(args.device))
407
+ log_prob_unsampled = log_prob_of_unsampled_locations(log_prob, real_token_masked)
408
+ # weight the summed log-probs by 1/(seq_length - t + 1)
409
+ log_prob_weighted = weight_log_prob(log_prob_unsampled, idx, seq_length)
410
+ # compute an average loss i.e. negative average log likelihood over the batch elements
411
+ loss = compute_average_loss_for_batch(log_prob_weighted)
412
+
413
+
414
+ # compute metrics
415
+ cond_prob_sample = sample_from_conditional(conditional_prob)
416
+ acc = sample_recover(real_tokens, cond_prob_sample, current_path_mask)
417
+ prev_acc = compute_prev_token_acc(real_tokens, cond_prob_sample, random_path_mask)
418
+ future_acc = compute_future_token_acc(real_tokens, cond_prob_sample, future_path_mask)
419
+ # average positional entropy
420
+ pos_entropy = compute_pos_entropy(probs=probs)
421
+
422
+ return (
423
+ loss,
424
+ acc,
425
+ prev_acc,
426
+ future_acc,
427
+ pos_entropy
428
+ )
429
+
430
+
431
+ ' model prediction with class conditional '
432
+ def cond_predict_conditional_prob(
433
+ model: nn.Module,
434
+ real_token_masked: torch.Tensor,
435
+ y_c: torch.Tensor,
436
+ idx: any,
437
+ args: any
438
+ ) -> (
439
+ any,
440
+ torch.Tensor
441
+ ):
442
+ #logits = model(x=real_token_masked.to(args.device), t=idx.view(-1,), y_c=y_c)
443
+ logits = model(x=real_token_masked, t=idx.view(-1,), y_c=y_c)
444
+ probs = F.softmax(
445
+ logits,
446
+ dim=1
447
+ )
448
+
449
+ conditional_prob = OneHotCategorical(probs=probs.permute(0,2,1))
450
+ # conditional_prob = Categorical(probs=probs.permute(0,2,1))
451
+
452
+ return (
453
+ conditional_prob,
454
+ probs
455
+ )
456
+
457
+
458
+ def cond_elbo_objective(
459
+ model: nn.Module,
460
+ realization: torch.Tensor,
461
+ y_c: torch.Tensor,
462
+ args: any,
463
+ iteration: int
464
+ ) -> (
465
+ torch.Tensor,
466
+ tuple
467
+ ):
468
+
469
+ bs, channel, seq_length = realization.size()
470
+
471
+ # get a batch of random sampling paths
472
+ sampled_random_path = sample_random_path(bs, seq_length, device=args.device)
473
+ # sample a set of random sampling steps for each individual training samples in the current batch
474
+ idx = sample_random_index_for_sampling(bs, seq_length, device=args.device, option='random')
475
+ # we create a mask that masks the locations where we've already sampled
476
+ random_path_mask = create_mask_at_random_path_index(sampled_random_path, idx, bs, seq_length)
477
+ # create a mask that masks the locations where we are currently sampling
478
+ current_path_mask = create_sampling_location_mask(sampled_random_path, idx, bs, seq_length)
479
+ # future sampling locations (i.e. >t)
480
+ future_path_mask = create_mask_at_future_path_index(sampled_random_path, idx, bs, seq_length)
481
+ # tokenize realizations
482
+ real_tokens, bs, seq_length = create_token_labels(args,realization)
483
+ #real_tokens = realizations.clone().squeeze(1)
484
+ # mask realizations
485
+ real_token_masked = mask_realizations(real_tokens, random_path_mask)
486
+ # conditional probs
487
+ conditional_prob, probs = cond_predict_conditional_prob(model, real_token_masked, y_c, idx, args)
488
+ # evaluate the value of the log prob for the given realization
489
+ log_prob = log_prob_of_realization(args, conditional_prob, real_tokens)
490
+ # compute an average over all the unsampled locations for each image in the batch
491
+ #log_prob_unsampled = log_prob_of_unsampled_locations(log_prob.to(args.device), real_token_masked.to(args.device))
492
+ log_prob_unsampled = log_prob_of_unsampled_locations(log_prob, real_token_masked)
493
+ #log_prob_unsampled = log_prob_of_unsampled_locations(log_prob, real_token_masked, real_tokens)
494
+
495
+ # weight the summed log-probs by 1/(seq_length - t + 1)
496
+ log_prob_weighted = weight_log_prob(log_prob_unsampled, idx, seq_length)
497
+ # compute an average loss i.e. negative average log likelihood over the batch elements
498
+ loss = compute_average_loss_for_batch(log_prob_weighted)
499
+
500
+ # compute metrics
501
+ if iteration % args.enter_eval == 0:
502
+
503
+
504
+ with torch.no_grad():
505
+
506
+ # compute accuracy given time position
507
+ sample_seq = torch.argmax(sample_from_conditional(conditional_prob), dim=1) # create numerical token sequences
508
+
509
+ # convert to cpu
510
+ real_tokens = real_tokens.cpu()
511
+ sample_seq = sample_seq.cpu()
512
+ idx = idx.cpu()
513
+ sampled_random_path = sampled_random_path.cpu()
514
+ probs = probs.cpu()
515
+
516
+
517
+ prev_B_hard_acc, prev_B_soft_acc, fut_B_hard_acc, fut_B_soft_acc, current_B_hard_acc, current_B_soft_acc = eval_funcs.compute_acc_given_time_pos(
518
+ real_tokens=real_tokens,
519
+ sample_seq=sample_seq,
520
+ sample_path=sampled_random_path,
521
+ idx=idx
522
+ )
523
+
524
+ # compute ppl given time position
525
+ current_ppl, prev_ppl, fut_ppl = eval_funcs.compute_ppl_given_time_pos(
526
+ probs=probs,
527
+ sample_path=sampled_random_path,
528
+ idx=idx
529
+ )
530
+
531
+ # average positional entropy
532
+ pos_entropy = compute_pos_entropy(probs=probs).mean().item()
533
+
534
+
535
+ metric_evals = (
536
+ prev_B_hard_acc,
537
+ prev_B_soft_acc,
538
+ fut_B_hard_acc,
539
+ fut_B_soft_acc,
540
+ current_B_hard_acc,
541
+ current_B_soft_acc,
542
+ current_ppl,
543
+ prev_ppl,
544
+ fut_ppl,
545
+ pos_entropy
546
+ )
547
+
548
+ else:
549
+ metric_evals = (None)
550
+
551
+ return (
552
+ loss,
553
+ metric_evals
554
+ )
555
+
556
+
557
+
run_ProteoScribe_sample.py ADDED
@@ -0,0 +1,167 @@
1
+ from argparse import Namespace
2
+ import json
3
+ import pandas as pd
4
+ import argparse
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import pytorch_lightning as pl
10
+ import Stage3_source.PL_wrapper as Stage3_PL_mod
11
+ import Stage3_source.cond_diff_transformer_layer as Stage3_mod
12
+ import Stage3_source.sampling_analysis as Stage3_sample_tools
13
+ import Stage3_source.animation_tools as Stage3_ani_tools
14
+
15
+
16
+ # Step 1: Load JSON configuration
17
+ def load_json_config(json_path):
18
+ """
19
+ Load JSON configuration file.
20
+ """
21
+ with open(json_path, "r") as f:
22
+ config = json.load(f)
23
+ # print("Loaded JSON config:", config)
24
+ return config
25
+
26
+ # Step 2: Convert JSON dictionary to Namespace
27
+ def convert_to_namespace(config_dict):
28
+ """
29
+ Recursively convert a dictionary to an argparse Namespace.
30
+ """
31
+ for key, value in config_dict.items():
32
+ if isinstance(value, dict): # Recursively handle nested dictionaries
33
+ config_dict[key] = convert_to_namespace(value)
34
+ return Namespace(**config_dict)
35
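+ # Hedged sketch: nested dictionaries become nested Namespaces, so configuration
+ # values can be read with attribute access; the toy config below is illustrative.
+ def _example_convert_to_namespace() -> int:
+     cfg = convert_to_namespace({"device": "cpu", "model": {"dim": 512}})
+     return cfg.model.dim   # 512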
+
36
+
37
+ # Step 3: load model with pretrained weights
38
+ def prepare_model(args, config_args) -> nn.Module:
39
+ """
40
+ Prepare the Stage 3 model and load its pretrained weights using a flat args object.
41
+ """
42
+
43
+ # Initialize the model graph
44
+ model = Stage3_mod.get_model(
45
+ args=config_args,
46
+ data_shape=(config_args.image_size, config_args.image_size),
47
+ num_classes=config_args.num_classes
48
+ )
49
+
50
+ # Load state_dict into the model with map_location="cpu"
51
+ model.load_state_dict(torch.load(args.model_path, map_location=config_args.device))
52
+ model.eval()
53
+
54
+ print(f"Stage 3 model loaded from: {args.model_path} (loaded on {config_args.device})")
55
+ return model
56
+
57
+
58
+
59
+ # Step 4: Sample sequences from the model
60
+ @torch.no_grad()
61
+ def batch_stage3_generate_sequences(
62
+ args: any,
63
+ model: nn.Module,
64
+ z_t: torch.Tensor
65
+ ) -> dict:
66
+ """
67
+ Generates protein sequences in batches using a denoising model.
68
+
69
+ Args:
70
+ args (any): Configuration object containing model and sampling parameters.
71
+ model (nn.Module): The pre-trained model used for denoising and generation.
72
+ z_t (torch.Tensor): Input tensor representing initial samples for sequence generation.
73
+
74
+ Returns:
75
+ dict: A dictionary mapping each replica name to its generated sequences.
76
+ """
77
+
78
+ # Handle z_t if passed as a list of tensors
79
+ if isinstance(z_t, list) and all(isinstance(item, torch.Tensor) for item in z_t):
80
+ print(f"z_t is a list of tensors with {len(z_t)} tensors.")
81
+ z_t = torch.stack(z_t)
82
+
83
+ # Move model and inputs to the target device (CPU or CUDA)
84
+ model.to(args.device)
85
+ z_t = z_t.to(args.device)
86
+
87
+ # Amino acid tokenization including special characters
88
+ tokens = [
89
+ '-', '<START>', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
90
+ 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '<END>', '<PAD>',
91
+ 'X', 'U', 'Z', 'B', 'O' # Special characters
92
+ ]
93
+
94
+ # Initialize a dictionary to store generated sequences for each replica
95
+ design_sequence_dict = {f'replica_{ii}': [] for ii in range(args.num_replicas)}
96
+
97
+ # Loop over input samples (each z_t) and generate sequences
98
+ for idx_sample, z_text_sample in enumerate(z_t):
99
+
100
+ # Process in batches to optimize memory and speed
101
+ for batch_start in range(0, args.num_replicas, args.batch_size_sample):
102
+ current_batch_size = min(args.batch_size_sample, args.num_replicas - batch_start)
103
+
104
+ # Prepare batched input for current batch
105
+ batched_z_text_sample = z_text_sample.unsqueeze(0).repeat(current_batch_size, 1)
106
+
107
+ # Generate random permutations for each sample in the batch
108
+ batch_perms = torch.stack([torch.randperm(args.diffusion_steps) for _ in range(current_batch_size)])
109
+
110
+ # Generate denoised samples using the model
111
+ mask_realization_list, _ = Stage3_sample_tools.batch_generate_denoised_sampled(
112
+ args=args,
113
+ model=model,
114
+ extract_digit_samples=torch.zeros(current_batch_size, args.diffusion_steps),
115
+ extract_time=torch.zeros(current_batch_size).long(),
116
+ extract_digit_label=batched_z_text_sample,
117
+ sampling_path=batch_perms
118
+ )
119
+
120
+ # Convert generated numeric sequences to amino acid sequences
121
+ for i, mask_realization in enumerate(mask_realization_list[-1]):
122
+ design_sequence = Stage3_ani_tools.convert_num_to_char(tokens, mask_realization[0])
123
+ clean_sequence = design_sequence.replace('<START>', '').replace('<END>', '').replace('<PAD>', '')
124
+ design_sequence_dict[f'replica_{batch_start + i}'].append(clean_sequence)
125
+
126
+ return design_sequence_dict
127
+
128
+
129
+
130
+ # Step 5: Argument Parser Function
131
+ def parse_arguments():
132
+
133
+ parser = argparse.ArgumentParser(description="BioM3 Inference Script (Stage 3: ProteoScribe sampling)")
134
+ parser.add_argument('--json_path', type=str, required=True,
135
+ help="Path to the JSON configuration file (stage3_config.json)")
136
+ parser.add_argument('--model_path', type=str, required=True,
137
+ help="Path to the pre-trained model weights (pytorch_model.bin)")
138
+ parser.add_argument('--input_path', type=str, required=True,
139
+ help="Path to the input embedding dictionary containing 'z_c'")
140
+ parser.add_argument('--output_path', type=str, required=True,
141
+ help="Path to save output embeddings")
142
+ return parser.parse_args()
143
+
144
+
145
+ if __name__ == '__main__':
146
+
147
+ # Parse arguments
148
+ config_args_parser = parse_arguments()
149
+
150
+ # Load and convert JSON config
151
+ config_dict = load_json_config(config_args_parser.json_path)
152
+ config_args = convert_to_namespace(config_dict)
153
+
154
+ # load test dataset
155
+ embedding_dataset = torch.load(config_args_parser.input_path)
156
+
157
+ # load model
158
+ model = prepare_model(args=config_args_parser, config_args=config_args)
159
+
160
+ # sample sequences
161
+ design_sequence_dict = batch_stage3_generate_sequences(
162
+ args=config_args,
163
+ model=model,
164
+ z_t=embedding_dataset['z_c']
165
+ )
166
+
167
+ print(f'{design_sequence_dict=}')
stage3_config.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "device": "cuda",
3
+ "output_hist_folder": "None",
4
+ "version_name": "None",
5
+ "output_folder": "./",
6
+ "save_hist_path": "None",
7
+ "tb_logger_path": "None",
8
+ "tb_logger_folder": "None",
9
+ "model_option": "transformer",
10
+ "model_path_checkpoint": "/project/ranganathanr/niksapraljak/HF_repo/HF_BioM3_project/V20240805_final_phase8/last-v2.ckpt",
11
+ "stage3_model_path": "/project/ranganathanr/niksapraljak/HF_repo/HF_BioM3_project/V20240805_final_phase8/last-v2.ckpt",
12
+ "stage2_data_path": "None",
13
+ "stage3_output_data_path": "None",
14
+ "data_root": "None",
15
+ "num_replicas": 5,
16
+ "batch_size_sample": 32,
17
+ "diffusion_steps": 1024,
18
+ "seed": 42,
19
+ "batch_size": 16,
20
+ "warmup_steps": 500,
21
+ "image_size": 32,
22
+ "learning_rate": 1e-4,
23
+ "weight_decay": 1e-6,
24
+ "ema_inv_gamma": 1.0,
25
+ "ema_power": 0.75,
26
+ "ema_max_value": 0.95,
27
+ "precision": "fp16",
28
+ "num_classes": 29,
29
+ "num_y_class_labels": 6,
30
+ "task": "proteins",
31
+ "enter_eval": 1000,
32
+ "choose_optim": "DeepSpeedCPUAdam",
33
+ "epochs": 1000,
34
+ "acc_grad_batches": 1,
35
+ "gpu_devices": 1,
36
+ "scheduler_gamma": "coswarmup",
37
+ "text_emb_dim": 512,
38
+ "facilitator": "MMD",
39
+ "context_window_size": 1024,
40
+ "sequence_keyname": "sequence",
41
+ "valid_size": 0.1,
42
+ "num_workers": 12,
43
+ "transformer_dim": 512,
44
+ "transformer_heads": 16,
45
+ "transformer_depth": 16,
46
+ "model_checkpoint": "/project/ranganathanr/niksapraljak/HF_repo/HF_BioM3_project/V20240805_final_phase8/last-v2.ckpt",
47
+ "data_path": "None",
48
+ "output_dict_path": "None",
49
+
50
+ "num_steps": 1,
51
+ "actnorm": false,
52
+ "perm_channel": "none",
53
+ "perm_length": "reverse",
54
+ "input_dp_rate": 0.0,
55
+
56
+ "transformer_blocks": 1,
57
+ "transformer_dropout": 0.1,
58
+ "transformer_reversible": false,
59
+ "transformer_local_heads": 8,
60
+ "transformer_local_size": 128
61
+ }
62
+