AlienChen committed (verified)
Commit e588998 · Parent: 4f6a4c3

Upload 10 files

Files changed (10):
  1. classifier.py +490 -0
  2. dataloader.py +692 -0
  3. diffusion.py +1629 -0
  4. eval_utils.py +90 -0
  5. noise_schedule.py +160 -0
  6. requirements.yaml +49 -0
  7. sample.py +124 -0
  8. tokenizer.py +279 -0
  9. uncond_sample.py +116 -0
  10. utils.py +86 -0
classifier.py ADDED
@@ -0,0 +1,490 @@
+ import itertools
+ import typing
+
+ import hydra.utils
+ import lightning as L
+ import torch
+ import torch.nn.functional as F
+ import torchmetrics
+ import transformers
+
+ import dataloader
+ import models.dit
+ import models.dimamba  # Ensure submodule is loaded for the 'dimamba' branch below.
+ import noise_schedule
+
+
+ class MicroAveragingMetric(torchmetrics.Metric):
+   """Micro-averaging metric.
+
+   Adapted from https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py#L12
+   """
+
+   def __init__(self, class_idx: typing.Optional[int] = 1,
+                dist_sync_on_step=False):
+     super().__init__(dist_sync_on_step=dist_sync_on_step)
+     self.class_idx = torch.tensor(class_idx) \
+       if class_idx is not None else None
+     self.add_state("numerator", default=torch.tensor(0.0),
+                    dist_reduce_fx="sum")
+     self.add_state("denominator", default=torch.tensor(0.0),
+                    dist_reduce_fx="sum")
+
+   def _update(
+       self, numerator, denominator, preds, y) -> tuple:
+     raise NotImplementedError
+
+   def update(self, logits: torch.Tensor, y: torch.Tensor):
+     # Update metric states.
+     preds = torch.argmax(logits, dim=-1)
+     y = y.view(-1)
+     assert preds.shape == y.shape, \
+       f"preds shape {preds.shape} != y shape {y.shape}"
+     self.numerator, self.denominator = self._update(
+       self.numerator, self.denominator, preds, y)
+
+   def compute(self):
+     # Compute final result.
+     value = self.numerator.float() / self.denominator \
+       if self.denominator.item() > 0. else torch.tensor(0.0)
+     return value
+
+   def reset(self):
+     self.numerator = torch.tensor(0.0).to(self.device)
+     self.denominator = torch.tensor(0.0).to(self.device)
+
+
+ class CrossEntropy(MicroAveragingMetric):
+   """Calculates cross-entropy loss."""
+
+   def _update(
+       self, numerator, denominator, logits, y) -> tuple:
+     with torch.no_grad():
+       numerator += F.cross_entropy(
+         logits.view(-1, logits.size(-1)),
+         y.view(-1),
+         ignore_index=-100,
+         reduction='sum')
+       denominator += y.numel()
+     return numerator, denominator
+
+   # Overrides parent class to use logits and not (argmax) preds.
+   def update(self, logits: torch.Tensor, y: torch.Tensor):
+     y = y.view(-1)
+     self.numerator, self.denominator = self._update(
+       self.numerator, self.denominator, logits, y)
+
+
+ class Accuracy(MicroAveragingMetric):
+   """Calculates accuracy.
+
+   Can be used to calculate accuracy per class.
+   Copied from:
+   https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py
+   """
+
+   def _update(
+       self, numerator, denominator, preds, y) -> tuple:
+     if self.class_idx is None:
+       numerator += (preds == y).sum()
+       denominator += y.numel()
+     else:
+       class_idx = self.class_idx
+       relevant_idxs = (y == class_idx)
+       numerator += (preds[relevant_idxs] == class_idx).sum()
+       denominator += relevant_idxs.sum()
+       relevant_idxs = (y != class_idx)
+       numerator += (preds[relevant_idxs] != class_idx).sum()
+       denominator += relevant_idxs.sum()
+     return numerator, denominator
+
+
+ class Precision(MicroAveragingMetric):
+   """Calculates precision.
+
+   Can be used to calculate precision per class.
+   Adapted from:
+   https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py
+   """
+
+   def _update(self, numerator, denominator, preds, y) -> tuple:
+     class_idx = self.class_idx
+     relevant_idxs = (preds == class_idx)
+     numerator += (y[relevant_idxs] == class_idx).sum()
+     denominator += relevant_idxs.sum()
+     return numerator, denominator
+
+
+ class Recall(MicroAveragingMetric):
+   """Calculates recall.
+
+   Can be used to calculate recall per class.
+   Adapted from:
+   https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py
+   """
+
+   def _update(self, numerator, denominator, preds, y) -> tuple:
+     class_idx = self.class_idx
+     relevant_idxs = (y == class_idx)
+     numerator += (preds[relevant_idxs] == class_idx).sum()
+     denominator += relevant_idxs.sum()
+     return numerator, denominator
+
+
+ class Classifier(L.LightningModule):
+   def __init__(
+       self,
+       config,
+       tokenizer: transformers.PreTrainedTokenizer,
+       pretrained_backbone: typing.Optional[torch.nn.Module] = None):
+     super().__init__()
+     self.save_hyperparameters(ignore=['pretrained_backbone'])
+     self.config = config
+
+     # This param indicates whether this model will be used
+     # for guidance (False) or only evaluation (True).
+     self.is_eval_classifier = getattr(
+       config, 'is_eval_classifier', False)
+
+     self.tokenizer = tokenizer
+     self.vocab_size = tokenizer.vocab_size
+     self.antithetic_sampling = config.training.antithetic_sampling
+     self.importance_sampling = config.training.importance_sampling
+     self.change_of_variables = config.training.change_of_variables
+     if (not hasattr(self.tokenizer, 'mask_token')
+         or self.tokenizer.mask_token is None):
+       self.mask_index = self.vocab_size
+       self.vocab_size += 1
+     else:
+       self.mask_index = self.tokenizer.mask_token_id
+
+     if config.classifier_backbone == 'dit':
+       self.classifier_model = models.dit.DITClassifier(
+         self.config, vocab_size=self.vocab_size)
+     elif self.config.classifier_backbone == 'dimamba':
+       self.classifier_model = models.dimamba.DiMambaClassifier(
+         self.config, vocab_size=self.vocab_size,
+         pad_token_id=self.tokenizer.pad_token_id)
+     elif config.classifier_backbone == 'hyenadna':
+       hyena_config = transformers.AutoConfig.from_pretrained(
+         config.classifier_model.hyena_model_name_or_path,
+         n_layer=config.classifier_model.n_layer,
+         trust_remote_code=True)
+       self.classifier_model = transformers.AutoModelForSequenceClassification.from_config(
+         hyena_config,
+         pretrained=False,
+         num_labels=config.data.num_classes,
+         problem_type='single_label_classification',
+         trust_remote_code=True)
+     else:
+       raise NotImplementedError(
+         f"Classifier backbone "
+         f"{self.config.classifier_backbone} not "
+         f"implemented.")
+     if pretrained_backbone is not None:  # For PPLM / NoS
+       self.classifier_model.load_pretrained_encoder(
+         pretrained_backbone)
+     # Metrics are automatically reset at end of epoch.
+     metrics = torchmetrics.MetricCollection({
+       'cross_entropy': CrossEntropy(),
+       'accuracy': Accuracy(class_idx=None),
+     })
+     if config.data.num_classes > 2:
+       for c in range(config.data.num_classes):
+         metrics.add_metrics(
+           {f"accuracy_class{c}": Accuracy(class_idx=c),
+            f"precision_class{c}": Precision(class_idx=c),
+            f"recall_class{c}": Recall(class_idx=c)})
+     else:
+       metrics.add_metrics(
+         {'precision': Precision(class_idx=1),
+          'recall': Recall(class_idx=1)})
+     metrics.set_dtype(torch.float64)
+     self.train_metrics = metrics.clone(prefix='train/')
+     self.valid_metrics = metrics.clone(prefix='val/')
+
+     self.T = config.T
+     self.noise = noise_schedule.get_noise(config,
+                                           dtype=self.dtype)
+     self.sampling_eps = config.training.sampling_eps
+     self.lr = config.optim.lr
+     self.time_conditioning = config.time_conditioning
+     self.fast_forward_epochs = None
+     self.fast_forward_batches = None
+
+   def on_load_checkpoint(self, checkpoint):
+     # Copied from:
+     # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
+     self.fast_forward_epochs = checkpoint['loops'][
+       'fit_loop']['epoch_progress']['current']['completed']
+     self.fast_forward_batches = checkpoint['loops'][
+       'fit_loop']['epoch_loop.batch_progress'][
+       'current']['completed']
+
+   def on_save_checkpoint(self, checkpoint):
+     # Copied from:
+     # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py
+     # ['epoch_loop.batch_progress']['total']['completed'] is
+     # 1 iteration behind, so we're using the optimizer's
+     # progress.
+     checkpoint['loops']['fit_loop'][
+       'epoch_loop.batch_progress']['total'][
+       'completed'] = checkpoint['loops']['fit_loop'][
+       'epoch_loop.automatic_optimization.optim_progress'][
+       'optimizer']['step']['total'][
+       'completed'] * self.trainer.accumulate_grad_batches
+     checkpoint['loops']['fit_loop'][
+       'epoch_loop.batch_progress']['current'][
+       'completed'] = checkpoint['loops']['fit_loop'][
+       'epoch_loop.automatic_optimization.optim_progress'][
+       'optimizer']['step']['current'][
+       'completed'] * self.trainer.accumulate_grad_batches
+     # _batches_that_stepped tracks the number of global
+     # steps, not the number of local steps, so we don't
+     # multiply with self.trainer.accumulate_grad_batches
+     # here.
+     checkpoint['loops']['fit_loop'][
+       'epoch_loop.state_dict'][
+       '_batches_that_stepped'] = \
+       checkpoint['loops']['fit_loop'][
+         'epoch_loop.automatic_optimization.optim_progress'][
+         'optimizer']['step']['total']['completed']
+     if 'sampler' not in checkpoint.keys():
+       checkpoint['sampler'] = {}
+     if hasattr(self.trainer.train_dataloader.sampler,
+                'state_dict'):
+       sampler_state_dict = self.trainer.\
+         train_dataloader.sampler.state_dict()
+       checkpoint['sampler'][
+         'random_state'] = sampler_state_dict.get(
+         'random_state', None)
+     else:
+       checkpoint['sampler']['random_state'] = None
+
+   def on_train_start(self):
+     # Adapted from:
+     # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+     distributed = (
+       self.trainer._accelerator_connector.use_distributed_sampler
+       and self.trainer._accelerator_connector.is_distributed)
+     if distributed:
+       sampler_cls = dataloader.FaultTolerantDistributedSampler
+     else:
+       sampler_cls = dataloader.RandomFaultTolerantSampler
+     updated_dls = []
+     for dl in self.trainer.fit_loop._combined_loader.flattened:
+       if hasattr(dl.sampler, 'shuffle'):
+         dl_sampler = sampler_cls(
+           dl.dataset, shuffle=dl.sampler.shuffle)
+       else:
+         dl_sampler = sampler_cls(dl.dataset)
+       if (distributed
+           and self.fast_forward_epochs is not None
+           and self.fast_forward_batches is not None):
+         dl_sampler.load_state_dict({
+           'epoch': self.fast_forward_epochs,
+           'counter': (self.fast_forward_batches
+                       * self.config.loader.batch_size)})
+       updated_dls.append(
+         torch.utils.data.DataLoader(
+           dl.dataset,
+           batch_size=self.config.loader.batch_size,
+           num_workers=self.config.loader.num_workers,
+           pin_memory=self.config.loader.pin_memory,
+           sampler=dl_sampler,
+           shuffle=False,
+           persistent_workers=self.config.loader.persistent_workers))
+     self.trainer.fit_loop._combined_loader.flattened = updated_dls
+
+   def forward(self, x, sigma=None, x_emb=None, attention_mask=None):
+     """Returns logits.
+
+     x_emb can be provided during PPLM / NoS-style guidance
+     (see: https://arxiv.org/abs/2305.20009).
+     """
+     if self.is_eval_classifier:
+       logits = self.classifier_model(x)
+       if hasattr(logits, 'logits'):
+         logits = logits.logits
+     else:
+       sigma = self._process_sigma(sigma) if sigma is not None else sigma
+       with torch.cuda.amp.autocast(dtype=torch.float32):
+         logits = self.classifier_model(
+           x, sigma, x_emb=x_emb, attention_mask=attention_mask)
+     return logits
+
+   def get_log_probs(self, x, sigma, x_emb=None):
+     """Returns log probabilities.
+
+     Use for CBG-style guidance.
+     """
+     if self.is_eval_classifier:
+       raise NotImplementedError(
+         '`get_log_probs` not implemented for classifiers '
+         'that are meant to be used for evaluation purposes '
+         'only.')
+     with torch.cuda.amp.autocast(dtype=torch.float32):
+       return torch.nn.functional.log_softmax(
+         self.forward(x, sigma, x_emb=x_emb), dim=-1)
+
+   def training_step(self, batch, batch_idx):
+     loss = self._compute_loss(batch, prefix='train')
+     self.log(name='trainer/loss',
+              value=loss.item(),
+              on_step=True,
+              on_epoch=False,
+              sync_dist=True,
+              prog_bar=True)
+     self.log(name='lr',
+              value=self.trainer.optimizers[0].param_groups[0]['lr'],
+              on_step=True,
+              on_epoch=False,
+              sync_dist=True,
+              prog_bar=True,
+              logger=False)
+     return loss
+
+   def validation_step(self, batch, batch_idx):
+     return self._compute_loss(batch, prefix='val')
+
+   def configure_optimizers(self):
+     # TODO(yair): Lightning currently giving this warning when using `fp16`:
+     # "Detected call of `lr_scheduler.step()` before `optimizer.step()`."
+     # Not clear if this is a problem or not.
+     # See: https://github.com/Lightning-AI/pytorch-lightning/issues/5558
+     optimizer = torch.optim.AdamW(
+       itertools.chain(self.classifier_model.parameters(),
+                       self.noise.parameters()),
+       lr=self.config.optim.lr,
+       betas=(self.config.optim.beta1,
+              self.config.optim.beta2),
+       eps=self.config.optim.eps,
+       weight_decay=self.config.optim.weight_decay)
+
+     scheduler = hydra.utils.instantiate(
+       self.config.lr_scheduler, optimizer=optimizer)
+     scheduler_dict = {
+       'scheduler': scheduler,
+       'interval': 'step',
+       'monitor': 'val/loss',
+       'name': 'trainer/lr',
+     }
+     return [optimizer], [scheduler_dict]
+
+   def _q_xt(self, x, move_chance):
+     """Computes the noisy sample xt.
+
+     Args:
+       x: int torch.Tensor with shape (batch_size,
+         diffusion_model_input_length), input.
+       move_chance: float torch.Tensor with shape
+         (batch_size, 1).
+     """
+     move_indices = torch.rand(
+       *x.shape, device=x.device) < move_chance
+     if self.config.diffusion == 'absorbing_state':
+       return torch.where(move_indices, self.mask_index, x)
+     if self.config.diffusion == 'uniform':
+       uniform_tensor = torch.randint(
+         0, self.vocab_size, x.shape, device=x.device)
+       return torch.where(move_indices, uniform_tensor, x)
+     raise NotImplementedError(
+       f'Diffusion type {self.config.diffusion} not '
+       'implemented.')
+
+   def _compute_loss(self, batch, prefix):
+     x0 = batch['input_ids']
+     attention_mask = batch['attention_mask']
+     t = None
+     if self.is_eval_classifier:
+       logits = self.forward(x0)
+     elif self.config.parameterization == 'ar':
+       # Do not add noise for AR FUDGE and AR PPLM.
+       logits = self.forward(
+         x0, attention_mask=attention_mask)
+     else:
+       t = self._sample_t(x0.shape[0])
+       if self.T > 0:
+         t = (t * self.T).to(torch.int)
+         t = t / self.T
+         # t \in {1/T, 2/T, ..., 1}
+         t += (1 / self.T)
+       if self.change_of_variables:
+         time_conditioning = t[:, None]
+         f_T = torch.log1p(- torch.exp(- self.noise.sigma_max))
+         f_0 = torch.log1p(- torch.exp(- self.noise.sigma_min))
+         move_chance = torch.exp(f_0 + t * (f_T - f_0))
+         move_chance = move_chance[:, None]
+       else:
+         sigma, _ = self.noise(t)
+         time_conditioning = sigma[:, None]
+         move_chance = 1 - torch.exp(-sigma[:, None])
+
+       xt = self._q_xt(x0, move_chance)
+       logits = self.forward(
+         xt, time_conditioning, attention_mask=attention_mask)
+     if hasattr(self.config.data, 'label_col'):
+       if f"{self.config.data.label_col}_threshold" in batch:
+         y = batch[f"{self.config.data.label_col}_threshold"]
+       else:
+         y = batch[self.config.data.label_col]
+     else:
+       y = batch['label']
+     if (not self.is_eval_classifier
+         and getattr(self.config.training,
+                     'use_label_smoothing', False)):
+       # Interpolate between one-hot and uniform distribution.
+       labels = (
+         torch.nn.functional.one_hot(
+           y, self.config.data.num_classes) * (1 - t)[..., None]
+         + (1 / self.config.data.num_classes) * t[..., None])
+     else:
+       labels = y.view(-1)
+     if getattr(self.config, 'is_fudge_classifier', False):
+       expanded_y = y.unsqueeze(1).expand(-1, logits.shape[1])  # batch x seq
+       logits = logits.view(
+         -1, self.config.data.num_classes)[attention_mask.flatten() == 1, ...]
+       y = expanded_y.flatten().long()[attention_mask.flatten() == 1]
+       loss = torch.nn.functional.cross_entropy(
+         logits,
+         y,
+         ignore_index=-100,
+         reduction='mean')
+     else:
+       loss = torch.nn.functional.cross_entropy(
+         logits.view(-1, logits.size(-1)),
+         labels,
+         ignore_index=-100,
+         reduction='mean')
+
+     if prefix == 'train':
+       self.train_metrics.update(logits, y)
+       metrics = self.train_metrics
+     elif prefix == 'val':
+       self.valid_metrics.update(logits, y)
+       metrics = self.valid_metrics
+     elif prefix == 'test':
+       self.test_metrics.update(logits, y)
+       metrics = self.test_metrics
+     else:
+       raise ValueError(f'Invalid prefix: {prefix}')
+
+     self.log_dict(metrics,
+                   on_step=False,
+                   on_epoch=True,
+                   sync_dist=True)
+     return loss
+
+   def _sample_t(self, n):
+     _eps_t = torch.rand(n, device=self.device)
+     if self.antithetic_sampling:
+       # Stratify the n draws across [0, 1) to reduce the
+       # variance of the time-step estimate.
+       offset = torch.arange(n, device=self.device) / n
+       _eps_t = (_eps_t / n + offset) % 1
+     t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
+     if self.importance_sampling:
+       return self.noise.importance_sampling_transformation(t)
+     return t
+
+   def _process_sigma(self, sigma):
+     if sigma.ndim > 1:
+       sigma = sigma.squeeze(-1)
+     if not self.time_conditioning:
+       sigma = torch.zeros_like(sigma)
+     assert sigma.ndim == 1, sigma.shape
+     return sigma
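
All of the metric classes above share one pattern: `update()` accumulates a numerator and denominator per batch, and `compute()` performs a single division at the end, so results stay exact under distributed training (the two states are sum-reduced across ranks before the division). A minimal sketch of that cycle with the `Precision` class from this file (assumes it is run from the repo root with the repo's dependencies installed, so `classifier.py` is importable):

import torch

from classifier import Precision

# Precision for class index 1 on two identical toy batches.
metric = Precision(class_idx=1)
logits = torch.tensor([[0.1, 2.0], [3.0, 0.2], [0.3, 1.5]])
labels = torch.tensor([1, 0, 1])
metric.update(logits, labels)  # preds = argmax(logits) = [1, 0, 1]
metric.update(logits, labels)  # states accumulate additively across batches
print(metric.compute())  # tensor(1.) -- 4 true positives / 4 predicted positives
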
dataloader.py ADDED
@@ -0,0 +1,692 @@
+ import functools
+ import itertools
+ import math
+ import os
+ import re
+ import shutil
+ import typing
+ import urllib.request
+ import zipfile
+
+ import datasets
+ import fsspec
+ import numpy as np
+ import tokenizers
+ import torch
+ import transformers
+ import lightning as L
+ from torch.utils.data import DataLoader
+ from functools import partial
+
+ import custom_datasets.discretized_cifar10
+ import custom_datasets.ten_species_dataset
+ import utils
+
+ LOGGER = utils.get_logger(__name__)
+
+
+ # noinspection RegExpRedundantEscape
+ def lm1b_detokenizer(x):
+   x = x.replace('http : / / ', 'http://')
+   x = x.replace('https : / / ', 'https://')
+   x = re.sub(r' \'(\w+)', r"'\1", x)
+   x = re.sub(r' (\w+) \. ', r' \1. ', x)
+   x = re.sub(r' (\w+) \.$', r' \1.', x)
+   x = x.replace(' ? ', '? ')
+   x = re.sub(r' \?$', '?', x)
+   x = x.replace(' ! ', '! ')
+   x = re.sub(r' \!$', '!', x)
+   x = x.replace(' , ', ', ')
+   x = x.replace(' : ', ': ')
+   x = x.replace(' ; ', '; ')
+   x = x.replace(' / ', '/')
+   x = re.sub(r'\" ([^\"]+) \"', r'"\1"', x)
+   x = re.sub(r'\' ([^\']+) \'', r"'\1'", x)
+   x = re.sub(r'\( ([^\(\)]+) \)', r"(\1)", x)
+   x = re.sub(r'\[ ([^\[\]]+) \]', r"[\1]", x)
+   x = x.replace('$ ', '$')
+   x = x.replace('£ ', '£')
+   return x
+
+
+ class Text8Tokenizer(transformers.PreTrainedTokenizer):
+   def __init__(
+       self,
+       bos_token='[BOS]',
+       eos_token='[EOS]',
+       sep_token='[SEP]',
+       cls_token='[CLS]',
+       pad_token='[PAD]',
+       mask_token='[MASK]',
+       unk_token='[UNK]',
+       **kwargs):
+     self.characters = list('abcdefghijklmnopqrstuvwxyz ')
+     self._vocab_str_to_int = {
+       '[CLS]': 0,
+       '[SEP]': 1,
+       '[BOS]': 2,
+       '[EOS]': 3,
+       '[MASK]': 4,
+       '[PAD]': 5,
+       '[RESERVED]': 6,
+       '[UNK]': 7,
+       **{ch: i + 8 for i, ch in enumerate(self.characters)}}
+     self._vocab_int_to_str = {
+       v: k for k, v in self._vocab_str_to_int.items()}
+     super().__init__(
+       bos_token=bos_token,
+       eos_token=eos_token,
+       sep_token=sep_token,
+       cls_token=cls_token,
+       pad_token=pad_token,
+       mask_token=mask_token,
+       unk_token=unk_token,
+       **kwargs)
+
+   @property
+   def vocab_size(self) -> int:
+     return len(self._vocab_str_to_int)
+
+   def _tokenize(self, text: str, **kwargs) -> typing.List[str]:
+     return list(text.lower())
+
+   def _convert_token_to_id(self, token: str) -> int:
+     return self._vocab_str_to_int.get(
+       token, self._vocab_str_to_int['[UNK]'])
+
+   def _convert_id_to_token(self, index: int) -> str:
+     return self._vocab_int_to_str[index]
+
+   def convert_tokens_to_string(self, tokens):
+     return ''.join(tokens)
+
+   def get_vocab(self) -> typing.Dict[str, int]:
+     return self._vocab_str_to_int
+
+
+ def get_text8_dataset(cache_dir, max_seq_length=256,
+                       drop_last=True, crop_train=False):
+   """Adapted from:
+   https://github.com/google-research/google-research/blob/master/d3pm/text/datasets.py#L344
+
+   Args:
+     cache_dir: str, path to cache directory.
+     max_seq_length: int, maximum length of sequences.
+       (default: 256, as in D3PM codebase.)
+     drop_last: bool, whether to drop the last incomplete
+       batch. (default: True, as in D3PM codebase.)
+     crop_train: bool, whether to subsample contiguous
+       subsequences from training examples. Serves to
+       make sure transformer models with absolute position
+       embeddings do not have incorrect position-wise
+       marginals. (default: False, but necessary to match D3PM AR.)
+
+   Returns:
+     dataset: datasets.DatasetDict, with keys 'train',
+       'validation', 'test'.
+   """
+   url = 'http://mattmahoney.net/dc/text8.zip'
+   if not crop_train:
+     cache_dir = f'{cache_dir}/text8'
+   else:
+     cache_dir = f'{cache_dir}/text8-crop-train'
+   split_names = ['train', 'validation', 'test']
+   if not all([
+       utils.fsspec_exists(os.path.join(cache_dir, split))
+       for split in split_names]):
+     # Check if raw data exists.
+     raw_cache_dir = os.path.join(cache_dir, 'raw_data')
+     if not all([
+         utils.fsspec_exists(
+           os.path.join(raw_cache_dir, f'text8.{split}.txt'))
+         for split in split_names]):
+       if not utils.fsspec_exists(
+           os.path.join(raw_cache_dir, 'text8.zip')):
+         utils.fsspec_mkdirs(raw_cache_dir, exist_ok=True)
+         LOGGER.info('Downloading text8 from URL {}.'.format(url))
+         with (urllib.request.urlopen(url) as in_stream,
+               open(os.path.join(raw_cache_dir, 'text8.zip'),
+                    'wb') as out_file):
+           shutil.copyfileobj(in_stream, out_file)
+
+       with fsspec.open(
+           os.path.join(raw_cache_dir, 'text8.zip'),
+           'rb') as f:
+         rawdata = zipfile.ZipFile(f).read(
+           'text8').decode('utf-8')
+
+       # Splits taken from D3PM codebase.
+       splits = {
+         'train': rawdata[:90_000_000],
+         'validation': rawdata[90_000_000: 95_000_000],
+         'test': rawdata[95_000_000:],
+       }
+
+       for split, data in splits.items():
+         _path = os.path.join(raw_cache_dir,
+                              f'text8.{split}.txt')
+         with fsspec.open(_path, 'w') as f:
+           f.write(data)
+     else:
+       splits = {}
+       for split in split_names:
+         _path = os.path.join(raw_cache_dir,
+                              f'text8.{split}.txt')
+         with fsspec.open(_path, 'r') as f:
+           splits[split] = f.read()
+
+     # Chunk and save as datasets.DatasetDict.
+     def chunks(lst, n):
+       """Yield successive n-sized chunks from lst."""
+       for i in range(0, len(lst), n):
+         yield lst[i:i + n]
+
+     dataset_dict = {}
+     for k, v in splits.items():
+       if k == 'train' and crop_train:
+         chunk_size = 2 * max_seq_length
+       else:
+         chunk_size = max_seq_length
+       text = list(chunks(v, chunk_size))
+       if drop_last and len(text[-1]) < chunk_size:
+         text = text[:-1]
+       dataset_dict[k] = datasets.Dataset.from_dict({'text': text})
+     dataset = datasets.DatasetDict(dataset_dict)
+     dataset.save_to_disk(cache_dir)
+   else:
+     dataset = datasets.load_from_disk(cache_dir)
+
+   return dataset
+
+
+ def _group_texts(examples, block_size, bos, eos,
+                  add_special_tokens=True):
+   # Concatenate all texts.
+   concatenated_examples = list(itertools.chain(*examples['input_ids']))
+   total_length = len(concatenated_examples)
+   # TODO(yair): look into not dropping the remainder but rather padding it.
+   # We drop the small remainder, and if the total_length < block_size - 2
+   # we exclude this batch and return an empty dict.
+   # We could add padding if the model supported it instead of
+   # this drop; you can customize this part to your needs.
+   # `-2` to account for [BOS] and [EOS] to be added below.
+   new_block_size = block_size - (2 if add_special_tokens else 0)
+   total_length = (total_length // new_block_size) * new_block_size
+   # Split by chunks of max_len.
+   result = {}
+   _values = []
+   _attn_masks = []
+   for i in range(0, total_length, new_block_size):
+     if add_special_tokens:
+       _values.append(
+         [bos]
+         + concatenated_examples[i: i + new_block_size]
+         + [eos])
+     else:
+       _values.append(
+         concatenated_examples[i: i + new_block_size])
+     _attn_masks.append(torch.ones(block_size))
+   result['input_ids'] = _values
+   result['attention_mask'] = _attn_masks
+   return result
+
+
+ def get_dataset(
+     dataset_name, tokenizer, wrap, mode, cache_dir,
+     block_size=1024, num_proc=len(os.sched_getaffinity(0)),
+     streaming=False, override_cache=False,
+     add_special_tokens=True,
+     label_col=None, label_threshold=None):
+   if label_col is not None:
+     label_suffix = f'_label-{label_col}'
+     if label_threshold is not None:
+       label_suffix += f'_threshold-{label_threshold}'
+   else:
+     label_suffix = ''
+   if wrap:
+     filename = f'{dataset_name}_{mode}_bs{block_size}_wrapped{label_suffix}.dat'
+   else:
+     filename = f'{dataset_name}_{mode}_bs{block_size}_unwrapped{label_suffix}.dat'
+   _path = os.path.join(cache_dir, filename)
+   if utils.fsspec_exists(_path) and not override_cache:
+     LOGGER.info(f'Loading data from: {_path}')
+     return datasets.load_from_disk(_path).with_format('torch')
+   LOGGER.info(f'Generating new data at: {_path}')
+
+   crop_train = dataset_name == 'text8-crop'
+   if mode == 'train' and crop_train:
+     # Double block size for sub-sampling.
+     block_size *= 2
+
+   if dataset_name == 'text8':
+     assert wrap
+     dataset = get_text8_dataset(
+       cache_dir, max_seq_length=block_size)
+   elif dataset_name == 'amazon_polarity':
+     dataset = datasets.load_dataset(
+       'amazon_polarity',
+       cache_dir=cache_dir,
+       streaming=streaming)
+   elif dataset_name == 'qm9':
+     dataset = datasets.load_dataset(
+       'yairschiff/qm9',
+       cache_dir=cache_dir,
+       streaming=streaming,
+       split='train')  # Dataset only has 'train' split.
+     if label_threshold is not None:
+       # Bucket the regression target into percentile bins.
+       pctiles = label_threshold if isinstance(label_threshold, list) \
+         else [label_threshold]
+       pctile_values = np.percentile(dataset[label_col],
+                                     q=pctiles)
+       threshold = np.ones(len(dataset[label_col])) * len(pctiles)
+       for i, p in reversed(list(enumerate(sorted(pctile_values)))):
+         threshold[dataset[label_col] <= p] = i
+       dataset = dataset.add_column(
+         f"{label_col}_threshold", threshold.astype(int))
+       label_col = f"{label_col}_threshold"
+     dataset = dataset.train_test_split(
+       test_size=0.05, seed=42)  # Hard-coded seed & size.
+     dataset = dataset[mode]
+   elif dataset_name == 'ten_species':
+     return custom_datasets.ten_species_dataset.TenSpeciesDataset(
+       split=mode,
+       tokenizer=tokenizer,
+       max_length=block_size,
+       rc_aug=False,  # TODO: find way to pass this
+       add_special_tokens=add_special_tokens)
+   else:
+     dataset = datasets.load_dataset(
+       dataset_name,
+       cache_dir=cache_dir,
+       streaming=streaming)
+
+   if dataset_name == 'qm9':
+     data = dataset
+   else:
+     data = dataset[mode]
+
+   if dataset_name == 'lm1b':
+     detokenizer = lm1b_detokenizer
+   else:
+     detokenizer = None
+
+   def _apply_detokenizer(detoker):
+     def detok(text):
+       for j, t in enumerate(text, 0):
+         text[j] = detoker(t)
+       return text
+     return detok
+
+   EOS = tokenizer.encode(tokenizer.eos_token)[0]
+   BOS = tokenizer.encode(tokenizer.bos_token)[0]
+
+   def preprocess_and_tokenize(example):
+     if 'amazon_polarity' in dataset_name:
+       text = example['content']
+     elif 'qm9' in dataset_name:
+       text = example['canonical_smiles']
+     elif dataset_name == 'ten_species':
+       text = example['sequence']
+     else:
+       text = example['text']
+
+     if detokenizer is not None:
+       text = _apply_detokenizer(detokenizer)(text)
+
+     tokenizer.padding_side = 'right'
+     tokenizer.truncation_side = 'right'
+
+     if wrap:
+       tokens = tokenizer(text,
+                          add_special_tokens=False,
+                          return_attention_mask=False,
+                          return_token_type_ids=False)
+       if add_special_tokens:
+         tokens = {'input_ids':
+                   [t + [EOS] for t in tokens['input_ids']]}
+         # Still missing BOS; will be added in group_texts.
+       else:
+         tokens = {'input_ids': tokens['input_ids']}
+     else:
+       tokens = tokenizer(text,
+                          max_length=block_size,
+                          padding='max_length',
+                          truncation=True,
+                          add_special_tokens=add_special_tokens,
+                          return_attention_mask=True,
+                          return_token_type_ids=add_special_tokens)
+     return tokens
+
+   if streaming:
+     tokenized_dataset = data.map(
+       preprocess_and_tokenize,
+       batched=True,
+       desc='Tokenizing')
+   else:
+     tokenized_dataset = data.map(
+       preprocess_and_tokenize,
+       batched=True,
+       num_proc=num_proc,
+       load_from_cache_file=True,
+       desc='Tokenizing')
+   keep_cols = ['input_ids', 'token_type_ids',
+                'attention_mask']
+   if label_col is not None:
+     keep_cols.append(label_col)
+   tokenized_dataset = tokenized_dataset.remove_columns(
+     [col for col in tokenized_dataset.column_names
+      if col not in keep_cols])
+
+   if not wrap:
+     tokenized_dataset.save_to_disk(_path)
+     return tokenized_dataset.with_format('torch')
+
+   group_texts = functools.partial(
+     _group_texts, block_size=block_size, bos=BOS, eos=EOS,
+     add_special_tokens=add_special_tokens)
+   if streaming:
+     chunked_dataset = tokenized_dataset.map(
+       group_texts,
+       batched=True,
+       desc='Grouping')
+   else:
+     chunked_dataset = tokenized_dataset.map(
+       group_texts,
+       batched=True,
+       num_proc=num_proc,
+       load_from_cache_file=True,
+       desc='Grouping')
+   chunked_dataset.save_to_disk(_path)
+   chunked_dataset = chunked_dataset.with_format('torch')
+   return chunked_dataset
+
+
+ def get_tokenizer(config):
+   if config.data.tokenizer_name_or_path == 'text8':
+     tokenizer = Text8Tokenizer()
+   elif config.data.tokenizer_name_or_path == 'bert-base-uncased':
+     tokenizer = transformers.BertTokenizer.from_pretrained(
+       'bert-base-uncased')
+   elif config.data.tokenizer_name_or_path == 'raw_pixels':
+     tokenizer = custom_datasets.discretized_cifar10.DummyVisionTokenizer(
+       256, 32,
+       add_mask_token=config.data.add_mask_token,
+       add_special_tokens=config.data.add_special_tokens)
+   else:
+     tokenizer = transformers.AutoTokenizer.from_pretrained(
+       config.data.tokenizer_name_or_path,
+       trust_remote_code=True)
+
+   if (isinstance(tokenizer, transformers.GPT2TokenizerFast)
+       or isinstance(tokenizer, transformers.GPT2Tokenizer)):
+     tokenizer._tokenizer.post_processor = tokenizers.processors.BertProcessing(
+       (tokenizer.bos_token, tokenizer.bos_token_id),
+       (tokenizer.eos_token, tokenizer.eos_token_id))
+
+   # For wrapped batches:
+   # [BOS] sent1 [EOS] sent2-fragment [EOS]
+   # [BOS] sent2-fragment [EOS] sent3 [EOS]
+   if tokenizer.bos_token is None:
+     if tokenizer.cls_token is None:
+       raise AttributeError(
+         'Tokenizer must have a bos_token or '
+         f'cls_token: {tokenizer}')
+     tokenizer.bos_token = tokenizer.cls_token
+   if tokenizer.eos_token is None:
+     if tokenizer.sep_token is None:
+       raise AttributeError(
+         'Tokenizer must have an eos_token '
+         f'or sep_token: {tokenizer}')
+     tokenizer.eos_token = tokenizer.sep_token
+   if tokenizer.pad_token is None and not config.is_vision:
+     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+   return tokenizer
+
+
+ def get_dataloaders(config, tokenizer, skip_train=False,
+                     skip_valid=False, valid_seed=None):
+   num_gpus = torch.cuda.device_count()
+   assert (config.loader.global_batch_size
+           == (config.loader.batch_size
+               * config.trainer.num_nodes
+               * num_gpus
+               * config.trainer.accumulate_grad_batches))
+   if config.loader.global_batch_size % (
+       num_gpus * config.trainer.accumulate_grad_batches) != 0:
+     raise ValueError(
+       f'Train batch size {config.loader.global_batch_size} '
+       f'not divisible by {num_gpus} gpus with accumulation '
+       f'{config.trainer.accumulate_grad_batches}.')
+   if config.loader.eval_global_batch_size % num_gpus != 0:
+     raise ValueError(
+       f'Eval batch size {config.loader.eval_global_batch_size} '
+       f'not divisible by {num_gpus}.')
+   label_col = getattr(config.data, 'label_col', None)
+   if skip_train:
+     train_set = None
+   else:
+     if 'cifar10' in config.data.train:
+       train_set = custom_datasets.discretized_cifar10.DiscreteCIFAR10(
+         config.data.train, train=True, download=True)
+     else:
+       train_set = get_dataset(
+         config.data.train,
+         tokenizer,
+         mode='train',
+         wrap=config.data.wrap,
+         cache_dir=config.data.cache_dir,
+         block_size=config.model.length,
+         override_cache=config.data.override_cache,
+         add_special_tokens=config.data.add_special_tokens,
+         label_col=label_col,
+         label_threshold=getattr(config.data,
+                                 'label_col_pctile', None))
+   if config.data.valid in [
+       'text8', 'lm1b', 'amazon_polarity', 'qm9',
+       'ten_species']:
+     validation_split = 'test'
+   else:
+     validation_split = 'validation'
+   if skip_valid:
+     valid_set = None
+   else:
+     if 'cifar10' in config.data.train:
+       valid_set = custom_datasets.discretized_cifar10.DiscreteCIFAR10(
+         config.data.valid, train=False, download=True)
+     else:
+       valid_set = get_dataset(
+         config.data.valid,
+         tokenizer,
+         wrap=config.data.wrap,
+         mode=validation_split,
+         cache_dir=config.data.cache_dir,
+         block_size=config.model.length,
+         streaming=False,
+         override_cache=config.data.override_cache,
+         add_special_tokens=config.data.add_special_tokens,
+         label_col=label_col,
+         label_threshold=getattr(config.data,
+                                 'label_col_pctile', None))
+
+   if skip_train:
+     train_loader = None
+   else:
+     train_loader = torch.utils.data.DataLoader(
+       train_set,
+       batch_size=config.loader.batch_size,
+       num_workers=config.loader.num_workers,
+       pin_memory=config.loader.pin_memory,
+       shuffle=not config.data.streaming,
+       persistent_workers=config.loader.persistent_workers)
+     train_loader.tokenizer = tokenizer
+   if skip_valid:
+     valid_loader = None
+   else:
+     if valid_seed is None:
+       shuffle_valid = False
+       generator = None
+     else:
+       shuffle_valid = True
+       generator = torch.Generator().manual_seed(valid_seed)
+     valid_loader = torch.utils.data.DataLoader(
+       valid_set,
+       batch_size=config.loader.eval_batch_size,
+       num_workers=config.loader.num_workers,
+       pin_memory=config.loader.pin_memory,
+       shuffle=shuffle_valid,
+       generator=generator)
+     # Will be used in generative perplexity calculation.
+     valid_loader.tokenizer = tokenizer
+
+   return train_loader, valid_loader
+
+
+ # Samplers adapted from: https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/fault_tolerant_sampler.py
+ class RandomFaultTolerantSampler(torch.utils.data.RandomSampler):
+
+   def __init__(self, *args, generator=None, **kwargs):
+     # TD [2022-07-17]: We don't force the seed to be zero. We generate random seed,
+     # which should be reproducible if pl.seed_everything was called beforehand.
+     # This means that changing the seed of the experiment will also change the
+     # sampling order.
+     if generator is None:
+       seed = int(torch.empty((), dtype=torch.int64).random_().item())
+       generator = torch.Generator().manual_seed(seed)
+     kwargs.pop('shuffle', None)
+     super().__init__(*args, generator=generator, **kwargs)
+     self.counter = 0
+     self.restarting = False
+
+   def state_dict(self):
+     # `self.state` holds the generator state captured at the
+     # start of `__iter__`, so a resumed sampler replays the
+     # same permutation (as in the flash-attention original).
+     return {'random_state': self.state,
+             'counter': self.counter}
+
+   def load_state_dict(self, state_dict):
+     self.generator.set_state(state_dict.get('random_state'))
+     self.counter = state_dict['counter']
+     # self.start_counter = self.counter
+     self.restarting = True
+
+   # TD [2022-08-28] Setting the len will cause PL to think there are only a few batches left per
+   # epoch, and subsequent epoch will have very few batches.
+   def __iter__(self) -> typing.Iterator[int]:
+     n = len(self.data_source)
+
+     self.state = self.generator.get_state()
+     indices = torch.randperm(n, generator=self.generator).tolist()
+
+     if not self.restarting:
+       self.counter = 0
+     else:
+       indices = indices[self.counter:]
+       self.restarting = False
+
+     for index in indices:
+       self.counter += 1
+       yield index
+
+     self.counter = 0
+
+
+ class FaultTolerantDistributedSampler(torch.utils.data.DistributedSampler):
+
+   def __init__(self, *args, **kwargs):
+     super().__init__(*args, **kwargs)
+     self.counter = 0
+     self.restarting = False
+
+   def state_dict(self):
+     return {'epoch': self.epoch, 'counter': self.counter}
+
+   def load_state_dict(self, state_dict):
+     self.epoch = state_dict['epoch']
+     self.counter = state_dict['counter']
+     self.restarting = True
+
+   # TD [2022-08-28] Setting the len will cause PL to think there are only a few batches left per
+   # epoch, and subsequent epoch will have very few batches.
+   def __iter__(self):
+     if self.shuffle:
+       # Deterministically shuffle based on epoch and seed.
+       g = torch.Generator()
+       g.manual_seed(self.seed + self.epoch)
+       indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
+     else:
+       indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
+
+     if not self.drop_last:
+       # Add extra samples to make it evenly divisible.
+       padding_size = self.total_size - len(indices)
+       if padding_size <= len(indices):
+         indices += indices[:padding_size]
+       else:
+         indices += (indices * math.ceil(
+           padding_size / len(indices)))[:padding_size]
+     else:
+       # Remove tail of data to make it evenly divisible.
+       indices = indices[:self.total_size]
+     assert len(indices) == self.total_size
+
+     # Subsample.
+     indices = indices[self.rank:self.total_size:self.num_replicas]
+     assert len(indices) == self.num_samples
+
+     if not self.restarting:
+       self.counter = 0
+     else:
+       indices = indices[self.counter:]
+       self.restarting = False
+
+     for index in indices:
+       self.counter += 1
+       yield index
+
+     self.counter = 0
+
+
+ def collate_fn(batch):
+   # Note: only `batch[0]` is used, i.e. each dataset item is
+   # assumed to already be a full batch; the DataLoaders below
+   # therefore keep the default `batch_size` of 1.
+   input_ids = torch.tensor(batch[0]['input_ids'])
+   attention_mask = torch.tensor(batch[0]['attention_mask'])
+   return {
+     'input_ids': input_ids,
+     'attention_mask': attention_mask
+   }
+
+
+ class CustomDataModule(L.LightningDataModule):
+   def __init__(self, train_dataset, val_dataset, test_dataset,
+                tokenizer, config, batch_size: int = 8,
+                collate_fn=collate_fn):
+     super().__init__()
+     self.train_dataset = train_dataset
+     self.val_dataset = val_dataset
+     self.test_dataset = test_dataset
+     self.batch_size = batch_size
+     self.tokenizer = tokenizer
+     self.collate_fn = collate_fn
+     self.config = config
+
+   def train_dataloader(self):
+     return DataLoader(self.train_dataset,
+                       collate_fn=partial(self.collate_fn),
+                       num_workers=self.config.loader.num_workers,
+                       pin_memory=self.config.loader.pin_memory,
+                       shuffle=not self.config.data.streaming,
+                       persistent_workers=self.config.loader.persistent_workers)
+
+   def val_dataloader(self):
+     return DataLoader(self.val_dataset,
+                       collate_fn=partial(self.collate_fn),
+                       num_workers=self.config.loader.num_workers,
+                       pin_memory=self.config.loader.pin_memory,
+                       shuffle=False)
+
+   def test_dataloader(self):
+     return DataLoader(self.test_dataset,
+                       collate_fn=partial(self.collate_fn),
+                       num_workers=self.config.loader.num_workers,
+                       pin_memory=self.config.loader.pin_memory,
+                       shuffle=not self.config.data.streaming)
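
Both fault-tolerant samplers implement the same resume contract: `state_dict()` records the RNG state (or the epoch) plus a counter of indices already served, and the first `__iter__` after `load_state_dict()` rebuilds the same permutation and skips the first `counter` entries. A sketch of that contract for `RandomFaultTolerantSampler` (a hypothetical toy run; it assumes `dataloader.py` is importable and that `state_dict()` stores the pre-permutation RNG state saved in `__iter__`, as in the flash-attention samplers cited above):

import torch

from dataloader import RandomFaultTolerantSampler

data = list(range(10))
sampler = RandomFaultTolerantSampler(data)

it = iter(sampler)
consumed = [next(it) for _ in range(4)]  # serve 4 indices, then "crash"
snapshot = sampler.state_dict()          # {'random_state': ..., 'counter': 4}

restored = RandomFaultTolerantSampler(data)
restored.load_state_dict(snapshot)
remaining = list(restored)  # same permutation, minus the 4 consumed indices
assert sorted(consumed + remaining) == data
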
diffusion.py ADDED
@@ -0,0 +1,1629 @@
1
+ """Module for modeling discrete diffusion
2
+ (absorbing state or uniform) and AR
3
+ (a special case of absorbing state).
4
+ """
5
+ import itertools
6
+ import math
7
+ import typing
8
+ from dataclasses import dataclass
9
+
10
+ import hydra.utils
11
+ import lightning as L
12
+ import numpy as np
13
+ import omegaconf
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import torchmetrics
17
+ import transformers
18
+ from mamba_ssm.utils.generation import InferenceParams
19
+ from torch import Tensor
20
+ from tqdm.auto import tqdm
21
+ import pdb
22
+ import gc
23
+
24
+ import classifier
25
+ import dataloader
26
+ import models
27
+ import noise_schedule
28
+
29
+ LOG2 = math.log(2)
30
+
31
+
32
+ def _sample_categorical(categorical_probs):
33
+ gumbel_norm = (
34
+ 1e-10
35
+ - (torch.rand_like(categorical_probs) + 1e-10).log()).to(categorical_probs.dtype)
36
+ return (categorical_probs / gumbel_norm).argmax(dim=-1)
37
+
38
+
39
+ def _unsqueeze(x, reference):
40
+ return x.view(
41
+ * x.shape,
42
+ * ((1,) * (len(reference.shape) - len(x.shape))))
43
+
44
+
45
+ @dataclass
46
+ class Loss:
47
+ loss: torch.FloatTensor
48
+ nlls: torch.FloatTensor
49
+ token_mask: torch.FloatTensor
50
+ recon_loss: typing.Optional[torch.FloatTensor] = None
51
+ diffusion_loss: typing.Optional[torch.FloatTensor] = None
52
+
53
+
54
+ class NLL(torchmetrics.aggregation.MeanMetric):
55
+ pass
56
+
57
+
58
+ class BPD(NLL):
59
+ def compute(self) -> Tensor:
60
+ """Computes the bits per dimension.
61
+
62
+ Returns:
63
+ bpd
64
+ """
65
+ return self.mean_value / self.weight / LOG2
66
+
67
+
68
+ class Perplexity(NLL):
69
+ def compute(self) -> Tensor:
70
+ """Computes the Perplexity.
71
+
72
+ Returns:
73
+ Perplexity
74
+ """
75
+ return torch.exp(self.mean_value / self.weight)
76
+
77
+
78
+ class Diffusion(L.LightningModule):
79
+ def __init__(
80
+ self,
81
+ config,
82
+ tokenizer: transformers.PreTrainedTokenizer):
83
+ super().__init__()
84
+ self.save_hyperparameters()
85
+ self.config = config
86
+
87
+ self.tokenizer = tokenizer
88
+ self.vocab_size = tokenizer.vocab_size
89
+
90
+ self.antithetic_sampling = config.training.antithetic_sampling
91
+ self.importance_sampling = config.training.importance_sampling
92
+ self.change_of_variables = config.training.change_of_variables
93
+ self.noise = noise_schedule.get_noise(config, dtype=self.dtype)
94
+
95
+ if self.config.is_vision:
96
+ self.mask_index = getattr(tokenizer, 'mask_token_id', -1)
97
+ else:
98
+ if (not hasattr(self.tokenizer, 'mask_token')
99
+ or tokenizer.mask_token is None):
100
+ self.mask_index = self.vocab_size
101
+ self.vocab_size += 1
102
+ else:
103
+ self.mask_index = tokenizer.mask_token_id
104
+
105
+ # Note: creating limiting distribution with
106
+ # broadcast-able batch and sequence dimensions.
107
+ self.parameterization = config.parameterization
108
+ self.diffusion = config.diffusion
109
+ if config.parameterization == 'ar':
110
+ self.limiting_distribution = None
111
+ else:
112
+ if self.diffusion == 'absorbing_state':
113
+ # Not needed, posterior calculated explicitly.
114
+ limiting_distribution = None
115
+ elif self.diffusion == 'uniform':
116
+ limiting_distribution = torch.ones(
117
+ (1, 1, self.vocab_size), dtype=self.dtype) / self.vocab_size
118
+ else:
119
+ raise NotImplementedError(
120
+ f"Diffusion type {self.diffusion} not implemented.")
121
+ self.register_buffer('limiting_distribution',
122
+ limiting_distribution)
123
+
124
+ self.T = config.T
125
+ self.subs_masking = config.subs_masking
126
+ self.time_conditioning = config.time_conditioning
127
+
128
+ if self.config.backbone == 'dit':
129
+ self.backbone = models.dit.DIT(
130
+ self.config, vocab_size=self.vocab_size)
131
+ elif self.config.backbone == 'dimamba':
132
+ self.backbone = models.dimamba.DiMamba(
133
+ self.config, vocab_size=self.vocab_size,
134
+ pad_token_id=self.tokenizer.pad_token_id)
135
+ elif self.config.backbone == 'unet':
136
+ self.backbone = models.unet.UNet(
137
+ self.config, vocab_size=self.vocab_size)
138
+ elif self.config.backbone == 'hf_dit':
139
+ self.backbone = transformers.AutoModelForMaskedLM.from_pretrained(
140
+ config.model.pretrained_model_name_or_path, trust_remote_code=True)
141
+ else:
142
+ raise NotImplementedError(
143
+ f"Backbone {self.config.backbone} not implemented.")
144
+
145
+ self.lr = self.config.optim.lr
146
+ self.sampling_eps = config.training.sampling_eps
147
+
148
+ self.softplus = torch.nn.Softplus()
149
+ self.neg_infinity = -1_000_000.0
150
+
151
+ if config.training.ema > 0:
152
+ self.ema = models.ema.ExponentialMovingAverage(
153
+ itertools.chain(self.backbone.parameters(),
154
+ self.noise.parameters()),
155
+ decay=config.training.ema)
156
+ else:
157
+ self.ema = None
158
+
159
+ # metrics are automatically reset at end of epoch
160
+ metrics = torchmetrics.MetricCollection({
161
+ 'nll': NLL(),
162
+ 'bpd': BPD(),
163
+ 'ppl': Perplexity(),
164
+ })
165
+ metrics.set_dtype(torch.float64)
166
+ self.train_metrics = metrics.clone(prefix='train/')
167
+ self.valid_metrics = metrics.clone(prefix='val/')
168
+ self.test_metrics = metrics.clone(prefix='test/')
169
+
170
+ self.fast_forward_epochs = None
171
+ self.fast_forward_batches = None
172
+
173
+ self._validate_configuration()
174
+
175
+ def _validate_configuration(self):
176
+ assert not (self.change_of_variables
177
+ and self.importance_sampling)
178
+ if self.diffusion != 'absorbing_state':
179
+ assert self.parameterization not in {'ar', 'subs'}
180
        if self.T > 0:
            assert self.parameterization in {'d3pm', 'subs'}
        if self.subs_masking:
            assert self.parameterization == 'd3pm'

    def on_load_checkpoint(self, checkpoint):
        if self.limiting_distribution is not None:
            checkpoint['state_dict'][
                'limiting_distribution'] = self.limiting_distribution.to(
                list(checkpoint['state_dict'].values())[0].device)
        if self.ema:
            self.ema.load_state_dict(checkpoint['ema'])
        # Copied from:
        # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
        self.fast_forward_epochs = checkpoint['loops'][
            'fit_loop']['epoch_progress']['current']['completed']
        self.fast_forward_batches = checkpoint['loops'][
            'fit_loop']['epoch_loop.batch_progress'][
            'current']['completed']

    def on_save_checkpoint(self, checkpoint):
        # Do not save this buffer.
        checkpoint['state_dict'].pop('limiting_distribution', None)
        if self.ema:
            checkpoint['ema'] = self.ema.state_dict()
        # Copied from:
        # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py
        # ['epoch_loop.batch_progress']['total']['completed'] is
        # 1 iteration behind, so we use the optimizer's progress.
        checkpoint['loops']['fit_loop'][
            'epoch_loop.batch_progress']['total'][
            'completed'] = checkpoint['loops']['fit_loop'][
            'epoch_loop.automatic_optimization.optim_progress'][
            'optimizer']['step']['total'][
            'completed'] * self.trainer.accumulate_grad_batches
        checkpoint['loops']['fit_loop'][
            'epoch_loop.batch_progress']['current'][
            'completed'] = checkpoint['loops']['fit_loop'][
            'epoch_loop.automatic_optimization.optim_progress'][
            'optimizer']['step']['current'][
            'completed'] * self.trainer.accumulate_grad_batches
        # _batches_that_stepped tracks the number of global steps,
        # not the number of local steps, so we don't multiply by
        # self.trainer.accumulate_grad_batches here.
        checkpoint['loops']['fit_loop'][
            'epoch_loop.state_dict'][
            '_batches_that_stepped'] = checkpoint['loops']['fit_loop'][
            'epoch_loop.automatic_optimization.optim_progress'][
            'optimizer']['step']['total']['completed']
        if 'sampler' not in checkpoint.keys():
            checkpoint['sampler'] = {}
        if hasattr(self.trainer.train_dataloader.sampler,
                   'state_dict'):
            sampler_state_dict = self.trainer.\
                train_dataloader.sampler.state_dict()
            checkpoint['sampler'][
                'random_state'] = sampler_state_dict.get(
                'random_state', None)
        else:
            checkpoint['sampler']['random_state'] = None

    def on_train_start(self):
        if self.ema:
            self.ema.move_shadow_params_to_device(self.device)
        # Adapted from:
        # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
        distributed = (
            self.trainer._accelerator_connector.use_distributed_sampler
            and self.trainer._accelerator_connector.is_distributed)
        if distributed:
            sampler_cls = dataloader.FaultTolerantDistributedSampler
        else:
            sampler_cls = dataloader.RandomFaultTolerantSampler
        # Hoisted out of the loop below. Note that the
        # fault-tolerant dl_sampler is currently constructed but
        # not passed to the rebuilt DataLoaders (see the
        # commented-out `sampler=` argument).
        from functools import partial
        from dataloader import collate_fn
        collate_partial = partial(collate_fn)
        updated_dls = []
        for dl in self.trainer.fit_loop._combined_loader.flattened:
            if hasattr(dl.sampler, 'shuffle'):
                dl_sampler = sampler_cls(
                    dl.dataset, shuffle=dl.sampler.shuffle)
            else:
                dl_sampler = sampler_cls(dl.dataset)
            if (distributed
                    and self.fast_forward_epochs is not None
                    and self.fast_forward_batches is not None):
                dl_sampler.load_state_dict({
                    'epoch': self.fast_forward_epochs,
                    'counter': (self.fast_forward_batches
                                * self.config.loader.batch_size)})

            torch.cuda.empty_cache()

            updated_dls.append(
                torch.utils.data.DataLoader(
                    dl.dataset,
                    # batch_size=self.config.loader.batch_size,
                    num_workers=self.config.loader.num_workers,
                    pin_memory=self.config.loader.pin_memory,
                    # sampler=dl_sampler,
                    shuffle=False,
                    persistent_workers=self.config.loader.persistent_workers,
                    collate_fn=collate_partial))
        self.trainer.fit_loop._combined_loader.flattened = updated_dls

    def configure_optimizers(self):
        # TODO(yair): Lightning currently gives this warning when using `fp16`:
        # "Detected call of `lr_scheduler.step()` before `optimizer.step()`."
        # Not clear if this is a problem or not.
        # See: https://github.com/Lightning-AI/pytorch-lightning/issues/5558
        optimizer = torch.optim.AdamW(
            itertools.chain(self.backbone.parameters(),
                            self.noise.parameters()),
            lr=self.config.optim.lr,
            betas=(self.config.optim.beta1,
                   self.config.optim.beta2),
            eps=self.config.optim.eps,
            weight_decay=self.config.optim.weight_decay)

        scheduler = hydra.utils.instantiate(
            self.config.lr_scheduler, optimizer=optimizer)
        scheduler_dict = {
            'scheduler': scheduler,
            'interval': 'step',
            'monitor': 'val/loss',
            'name': 'trainer/lr',
        }
        return [optimizer], [scheduler_dict]

    def optimizer_step(self, *args, **kwargs):
        super().optimizer_step(*args, **kwargs)
        if self.ema:
            self.ema.update(itertools.chain(
                self.backbone.parameters(),
                self.noise.parameters()))

    def _subs_parameterization(self, logits, xt):
        # "Zero masking prob":
        # log prob at the mask index = -infinity.
        logits[..., self.mask_index] += self.neg_infinity

        # "Copy over":
        # apply updates directly in the logits matrix. For the
        # logits of the unmasked tokens, set all values to
        # -infinity except for the indices corresponding to the
        # unmasked tokens themselves.
        unmasked_indices = (xt != self.mask_index)
        logits[unmasked_indices] = self.neg_infinity
        logits[unmasked_indices, xt[unmasked_indices]] = 0

        # Normalize the logits such that logits.exp() is a
        # probability distribution over vocab_size.
        return logits.log_softmax(dim=-1)

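    # Illustrative sketch of the two SUBS constraints (comments
    # only; shapes and values are made up, assuming mask_index = 3):
    #
    #   logits = torch.zeros(1, 2, 4)   # B=1, L=2, V=4
    #   xt = torch.tensor([[3, 1]])     # position 0 is masked
    #   log_p = self._subs_parameterization(logits, xt)
    #   log_p[0, 0, 3]  -> -inf  (a mask is never decoded as a mask)
    #   log_p[0, 1]     -> 0 at token 1, -inf elsewhere (copied over)
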
    def _process_sigma(self, sigma):
        if sigma is None:
            assert self.parameterization == 'ar'
            return sigma
        if sigma.ndim > 1:
            sigma = sigma.squeeze(-1)
        if not self.time_conditioning:
            sigma = torch.zeros_like(sigma)
        assert sigma.ndim == 1, sigma.shape
        return sigma

    def forward(self, x, sigma, cond=None, x_emb=None, **kwargs):
        """Returns log_probs / logits."""
        sigma = self._process_sigma(sigma)
        with torch.cuda.amp.autocast(dtype=torch.float32):
            logits = self.backbone(x, sigma, cond, x_emb=x_emb, **kwargs)

        if self.parameterization == 'subs':
            # Returns log_probs.
            return self._subs_parameterization(
                logits=logits, xt=x)
        if self.parameterization in {'ar', 'd3pm'}:
            # Returns log_probs.
            if self.subs_masking:  # Can use "zero masking prob".
                logits[:, :, self.mask_index] += self.neg_infinity
            return logits.log_softmax(dim=-1)
        return logits

    def _compute_posterior(self, x, xt, alpha_s, alpha_t):
        """Computes the posterior / approximate posterior.

        Args:
            x: Either the clean input `x0` (one-hot), or the
                model's predicted `x_theta`, of shape (B, L, V).
            xt: The noisy latent (as indices) of shape (B, L).
            alpha_s: Noise level at s of shape (B, [L | 1], 1).
            alpha_t: Noise level at t of shape (B, [L | 1], 1).

        Returns:
            Posterior / approximate posterior of shape (B, L, V).
        """
        alpha_ts = alpha_t / alpha_s
        d_alpha = alpha_s - alpha_t
        xt_one_hot = F.one_hot(xt, self.vocab_size)
        if self.diffusion == 'uniform':
            return (
                (alpha_t * self.vocab_size * x * xt_one_hot +
                 (alpha_ts - alpha_t) * xt_one_hot +
                 d_alpha * x +
                 (1 - alpha_ts) * (1 - alpha_s) * self.limiting_distribution)
                /
                (alpha_t * self.vocab_size * torch.gather(x, -1, xt[..., None]) +
                 (1 - alpha_t))
            )
        raise NotImplementedError(
            f"Diffusion type {self.diffusion} not implemented.")

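    # Sanity check for the uniform posterior above (a sketch, not
    # executed anywhere): with alpha_s == alpha_t, alpha_ts = 1 and
    # d_alpha = 0, so the expression collapses to a point mass on
    # xt, i.e. a zero-length reverse step changes nothing. Plugging
    # in x = one_hot(x0) gives the true posterior q(x_s | x_t, x_0);
    # plugging in x = x_theta gives the approximate posterior
    # p_theta(x_s | x_t).
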
    def _d3pm_loss(self, model_output, xt, x0, t):
        assert self.config.noise.type == 'loglinear', (
            'D3PM loss is only implemented for log-linear noise.')
        dt = 1 / self.T

        if torch.is_tensor(t):
            t = t[:, None]
            assert t.ndim == 2
            t = t.clamp(0., 1. - 1e-4)
        alpha_t = 1 - t + torch.zeros_like(xt)
        alpha_s = 1 - (t - dt) + torch.zeros_like(xt)

        if self.diffusion == 'absorbing_state':
            log_x_theta_at_x0 = torch.gather(
                model_output, -1, x0[:, :, None]).squeeze(-1)
            log_x_theta_at_m = model_output[:, :, self.mask_index]
            x_theta_at_m = log_x_theta_at_m.exp()

            term_1_coef = dt / t
            term_1_log_nr = torch.log(alpha_t * x_theta_at_m / t + 1)
            term_1_log_dr = log_x_theta_at_x0

            term_2_coef = 1 - dt / t
            term_2_log_nr = term_1_log_nr
            term_2_log_dr = torch.log(alpha_s * x_theta_at_m / (t - dt) + 1)

            L_vb_masked = (
                term_1_coef * (term_1_log_nr - term_1_log_dr)
                + term_2_coef * (term_2_log_nr - term_2_log_dr))

            L_vb = L_vb_masked * (xt == self.mask_index)
        elif self.diffusion == 'uniform':
            posterior = self._compute_posterior(
                x=F.one_hot(x0, num_classes=self.vocab_size).to(self.dtype),
                xt=xt,
                alpha_s=alpha_s[..., None],
                alpha_t=alpha_t[..., None])
            posterior_pred = self._compute_posterior(
                x=model_output.exp(),
                xt=xt,
                alpha_s=alpha_s[..., None],
                alpha_t=alpha_t[..., None])
            # KL(q(x_s | x_t, x_0) || p_theta(x_s | x_t)); the small
            # epsilon guards both logs against zero probabilities.
            L_vb = (
                posterior * (torch.log(posterior + 1e-12)
                             - torch.log(posterior_pred + 1e-12))
            ).sum(dim=-1)
        else:
            raise NotImplementedError(
                f"Diffusion type {self.diffusion} not implemented for D3PM.")
        return self.T * L_vb

    def _reconstruction_loss(self, x0, cond=None):
        # For the D3PM parameterization.
        assert self.config.noise.type == 'loglinear', (
            'Reconstruction loss is only implemented for '
            'log-linear noise.')
        t0 = torch.zeros(x0.shape[0], dtype=self.dtype,
                         device=self.device)
        time_conditioning = self.noise(t0)[0][:, None]
        model_output_t0 = self.forward(x0, time_conditioning,
                                       cond=cond)
        return - torch.gather(input=model_output_t0,
                              dim=-1,
                              index=x0[:, :, None]).squeeze(-1)

    def _sample_t(self, n):
        _eps_t = torch.rand(n, device=self.device)
        if self.antithetic_sampling:
            offset = torch.arange(n, device=self.device) / n
            _eps_t = (_eps_t / n + offset) % 1
        t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
        if self.importance_sampling:
            return self.noise.importance_sampling_transformation(t)
        return t

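    # Antithetic sampling stratifies the batch's timesteps: with
    # offsets k/n plus a shared jitter u/n, the n samples tile
    # [0, 1) evenly, reducing the variance of the ELBO estimate.
    # Illustrative values for n = 4 and u = 0.1 (before the
    # sampling_eps affine map):
    #
    #   _eps_t = [0.025, 0.275, 0.525, 0.775]
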
    def _q_xt(self, x, move_chance):
        """Computes the noisy sample xt.

        Args:
            x: int torch.Tensor with shape (batch_size,
                diffusion_model_input_length), input.
            move_chance: float torch.Tensor with shape
                (batch_size, 1).
        """
        move_indices = torch.rand(
            *x.shape, device=x.device) < move_chance
        if self.diffusion == 'absorbing_state':
            return torch.where(move_indices, self.mask_index, x)
        if self.diffusion == 'uniform':
            uniform_tensor = torch.randint(
                0, self.vocab_size, x.shape, device=x.device)
            return torch.where(move_indices, uniform_tensor, x)
        if self.diffusion == 'uniform_data_marginals':
            return torch.where(
                move_indices,
                self._sample_prior(*x.shape),
                x)
        raise NotImplementedError(
            f"Diffusion type {self.diffusion} not implemented.")

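    # Forward-corruption sketch (made-up values): with
    # move_chance = 0.3, roughly 30% of positions are resampled.
    #
    #   x  = [[5,  9, 2, 7]]
    #   xt = [[5,  M, 2, M]]   # absorbing_state, M = mask_index
    #   xt = [[5, 13, 2, 4]]   # uniform: random vocabulary tokens
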
    def _forward_pass_diffusion(self, x0, cond=None):
        t = self._sample_t(x0.shape[0])
        if self.T > 0:
            t = (t * self.T).to(torch.int)
            t = t / self.T
            # t \in {1/T, 2/T, ..., 1}
            t += (1 / self.T)

        if self.change_of_variables:
            time_conditioning = t[:, None]
            f_T = torch.log1p(- torch.exp(- self.noise.sigma_max))
            f_0 = torch.log1p(- torch.exp(- self.noise.sigma_min))
            move_chance = torch.exp(f_0 + t * (f_T - f_0))
            move_chance = move_chance[:, None]
            sigma, dsigma = None, None
        else:
            sigma, dsigma = self.noise(t)
            time_conditioning = sigma[:, None]
            move_chance = 1 - torch.exp(-sigma[:, None])

        xt = self._q_xt(x0, move_chance)
        model_output = self.forward(xt, time_conditioning,
                                    cond=cond)

        # Discrete (finite T) time.
        if self.T > 0:
            diffusion_loss = self._d3pm_loss(
                model_output=model_output, xt=xt, x0=x0, t=t)
            if self.parameterization == 'd3pm':
                reconstruction_loss = self._reconstruction_loss(
                    x0, cond=cond)
                if self.training and self.config.training.use_simple_ce_loss:
                    loss = -torch.gather(
                        input=model_output,
                        dim=-1,
                        index=x0[:, :, None]).squeeze(-1)
                else:
                    loss = reconstruction_loss + diffusion_loss
                return {
                    'recon_loss': reconstruction_loss,
                    'diffusion_loss': diffusion_loss,
                    'loss': loss}
            if self.parameterization == 'subs':
                if self.training and self.config.training.use_simple_ce_loss:
                    loss = -torch.gather(
                        input=model_output,
                        dim=-1,
                        index=x0[:, :, None]).squeeze(-1)
                else:
                    loss = diffusion_loss
                return {'diffusion_loss': diffusion_loss, 'loss': loss}
            raise ValueError(
                f"Invalid parameterization: {self.parameterization} for T > 0.")

        # Continuous (T --> infty) time.
        if self.diffusion == 'absorbing_state':
            # SUBS parameterization, continuous time.
            log_p_theta = torch.gather(
                input=model_output,
                dim=-1,
                index=x0[:, :, None]).squeeze(-1)

            if self.change_of_variables or self.importance_sampling:
                if self.training and self.config.training.use_simple_ce_loss:
                    return {
                        'diffusion_loss': log_p_theta * torch.log1p(
                            -torch.exp(- self.noise.sigma_min)),
                        'loss': -log_p_theta}
                return log_p_theta * torch.log1p(
                    -torch.exp(- self.noise.sigma_min))

            if self.training and self.config.training.use_simple_ce_loss:
                return {
                    # ELBO term, logged for monitoring; negated to
                    # match the non-simple-CE return below.
                    'diffusion_loss': -log_p_theta * (
                        dsigma / torch.expm1(sigma))[:, None],
                    # Simple cross-entropy: -log p_theta(x0 | xt).
                    'loss': -log_p_theta}
            return - log_p_theta * (dsigma / torch.expm1(sigma))[:, None]

        if self.diffusion == 'uniform':
            assert self.config.noise.type == 'loglinear', (
                'Continuous-time uniform diffusion is only '
                'implemented for log-linear noise.')
            # TODO: Currently alpha_t' and alpha_t are hardcoded
            # for the log-linear noise schedule.
            # Make generic (as above, for absorbing state):
            #   alpha_t_prime = -dsigma * (-sigma).exp()
            #   alpha_t = (-sigma).exp()
            alpha_t_prime = -1.
            alpha_t = 1. - t[..., None, None]  # B, 1, 1

            # x_bar = N * alpha_t * x + 1 - alpha_t ; B, L, V
            x_bar = (self.vocab_size * alpha_t
                     * F.one_hot(x0, self.vocab_size).float()
                     + 1 - alpha_t)
            x_bar_theta = (self.vocab_size * alpha_t
                           * model_output.exp() + 1 - alpha_t)

            # alpha_t' / (N * alpha_t)
            coeff = alpha_t_prime / (self.vocab_size * alpha_t)  # B, 1, 1

            # Term 1: indices where z_t = 1.
            x_bar_zt = torch.gather(x_bar, -1, xt[..., None])  # B, L, 1
            x_bar_theta_zt = torch.gather(
                x_bar_theta, -1, xt[..., None])  # B, L, 1
            term1 = ((self.vocab_size / x_bar_zt)
                     - (self.vocab_size / x_bar_theta_zt))  # B, L, 1

            # Term 2: indices where z_t = 0.
            term2 = (  # B, L, V before summing --> B, L, 1 after
                (x_bar / x_bar_zt) *
                (
                    x_bar_theta_zt.log() - x_bar_theta.log() +
                    x_bar.log() - x_bar_zt.log()
                )
            )
            term2 = term2.sum(dim=-1, keepdim=True)  # B, L, 1

            diffusion_loss = (coeff * (term1 - term2)).squeeze()  # B, L
            reconstruction_loss = self._reconstruction_loss(
                x0, cond=cond)
            if self.training and self.config.training.use_simple_ce_loss:
                return {
                    'recon_loss': reconstruction_loss,
                    'diffusion_loss': diffusion_loss,
                    'loss': -torch.gather(
                        input=model_output,
                        dim=-1,
                        index=x0[:, :, None]).squeeze(-1)}
            return {
                'recon_loss': reconstruction_loss,
                'diffusion_loss': diffusion_loss,
                'loss': diffusion_loss
                if getattr(self.config, 'zero_recon_loss', False)
                else diffusion_loss + reconstruction_loss}
        raise NotImplementedError(
            f"Diffusion type {self.diffusion} not "
            "implemented for the continuous-time case.")

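    # Note on the uniform continuous-time ELBO above: for each
    # vocabulary entry k, x_bar[..., k] = N * alpha_t * <x, e_k>
    # + 1 - alpha_t equals N * q(z_t = k | x), so term1 compares
    # data and model marginals at the observed index z_t, while
    # term2 accumulates the log-ratio over all other indices.
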
    def _maybe_sub_sample(self, x0, attention_mask):
        seqlen = x0.shape[1]
        # if seqlen > self.config.model.length:
        #   assert seqlen == 2 * self.config.model.length
        #   # Cropping is necessary for the text8-crop dataset;
        #   # try the same starting point for now.
        #   start = np.random.choice(self.config.model.length)
        #   end = start + self.config.model.length
        #   input_tokens = x0[:, start: end]
        #   output_tokens = x0[:, start + 1: end + 1]
        #   new_attention_mask = attention_mask[:, start: end]

        #   # Helps with validation PPL, since the val
        #   # examples will all start and end with BOS/EOS.
        #   input_tokens[:, 0] = self.tokenizer.bos_token_id
        #   output_tokens[:, -1] = self.tokenizer.eos_token_id
        # elif self.parameterization == 'ar':
        #   input_tokens = x0[:, :-1]
        #   output_tokens = x0[:, 1:]
        #   new_attention_mask = attention_mask[:, 1:]
        # else:
        #   input_tokens = x0
        #   output_tokens = None
        #   new_attention_mask = attention_mask

        input_tokens = x0
        output_tokens = None
        new_attention_mask = attention_mask
        return input_tokens, output_tokens, new_attention_mask

    def _loss(self, x0, attention_mask, cond=None):
        (input_tokens, output_tokens,
         attention_mask) = self._maybe_sub_sample(
            x0, attention_mask)

        recon_loss, diffusion_loss = None, None

        if (cond is not None and self.training
                and self.config.training.guidance is not None
                and self.config.training.guidance.cond_dropout > 0):
            # Randomly mask out the conditioning for
            # classifier-free guidance training.
            p = torch.bernoulli(
                torch.ones_like(cond) *
                self.config.training.guidance.cond_dropout).to(torch.bool)
            # Use the num_classes index as the conditioning
            # mask_token_id.
            cond[p] = self.config.data.num_classes

        if self.parameterization == 'ar':
            logprobs = self.forward(
                input_tokens, sigma=None, cond=cond)
            loss = - logprobs.gather(
                -1, output_tokens[:, :, None])[:, :, 0]
        else:
            loss = self._forward_pass_diffusion(input_tokens,
                                                cond=cond)
            if isinstance(loss, dict):
                # Some branches return only a subset of the keys.
                recon_loss = loss.get('recon_loss', None)
                diffusion_loss = loss.get('diffusion_loss', None)
                loss = loss['loss']

        nlls = loss * attention_mask
        count = attention_mask.sum()

        if (self.config.training.compute_loss_on_pad_tokens
                and self.training):
            token_nll = loss.mean()
        else:
            batch_nll = nlls.sum()
            token_nll = batch_nll / count

        if recon_loss is not None and diffusion_loss is not None:
            with torch.no_grad():
                recon_loss_batch = (recon_loss * attention_mask).sum() / count
                diffusion_loss_batch = (diffusion_loss * attention_mask).sum() / count
            return Loss(loss=token_nll,
                        nlls=nlls,
                        token_mask=attention_mask,
                        recon_loss=recon_loss_batch,
                        diffusion_loss=diffusion_loss_batch)
        return Loss(loss=token_nll,
                    nlls=nlls,
                    token_mask=attention_mask)

    def _compute_loss(self, batch, prefix):
        if 'attention_mask' in batch:
            attention_mask = batch['attention_mask']
        else:
            attention_mask = None
        cond = None
        # Training for / using CFG.
        if (self.config.training.guidance is not None
                or (hasattr(self.config, 'guidance')
                    and self.config.guidance is not None
                    and self.config.guidance.method == 'cfg')):
            if self.config.data.label_col in batch:
                cond = batch[self.config.data.label_col]
            elif f"{self.config.data.label_col}_threshold" in batch:
                cond = batch[f"{self.config.data.label_col}_threshold"]
            else:
                raise RuntimeError(
                    f"Conditioning {self.config.data.label_col}"
                    f" not found in batch.")
        losses = self._loss(batch['input_ids'], attention_mask,
                            cond=cond)

        if prefix == 'train':
            self.train_metrics.update(losses.nlls, losses.token_mask)
            metrics = self.train_metrics
        elif prefix == 'val':
            self.valid_metrics.update(losses.nlls, losses.token_mask)
            metrics = self.valid_metrics
        elif prefix == 'test':
            self.test_metrics.update(losses.nlls, losses.token_mask)
            metrics = self.test_metrics
        else:
            raise ValueError(f"Invalid prefix: {prefix}")

        self.log_dict(metrics,
                      on_step=False,
                      on_epoch=True,
                      sync_dist=True)
        return losses

    def training_step(self, batch, batch_idx):
        losses = self._compute_loss(batch, prefix='train')
        self.log(name='trainer/loss',
                 value=losses.loss.item(),
                 on_step=True,
                 on_epoch=True,
                 sync_dist=True,
                 prog_bar=True)
        if losses.recon_loss is not None:
            self.log(name='trainer/recon_loss',
                     value=losses.recon_loss.item(),
                     on_step=True,
                     on_epoch=True,
                     sync_dist=True,
                     prog_bar=False)
            self.log(name='trainer/diffusion_loss',
                     value=losses.diffusion_loss.item(),
                     on_step=True,
                     on_epoch=True,
                     sync_dist=True,
                     prog_bar=False)
        self.log(name='lr',
                 value=self.trainer.optimizers[0].param_groups[0]['lr'],
                 on_step=True,
                 on_epoch=False,
                 sync_dist=True,
                 prog_bar=True, logger=False)
        return losses.loss

    def validation_step(self, batch, batch_idx):
        losses = self._compute_loss(batch, prefix='val')
        self.log(name='trainer/val_loss',
                 value=losses.loss.item(),
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 sync_dist=True)
        return losses.loss

    def load_ema_params(self):
        if self.ema:
            self.ema.store(itertools.chain(
                self.backbone.parameters(),
                self.noise.parameters()))
            self.ema.copy_to(itertools.chain(
                self.backbone.parameters(),
                self.noise.parameters()))

    def _restore_non_ema_params(self):
        if self.ema:
            self.ema.restore(itertools.chain(
                self.backbone.parameters(),
                self.noise.parameters()))

    def on_validation_epoch_start(self):
        gc.collect()
        torch.cuda.empty_cache()
        self.load_ema_params()
        assert self.valid_metrics.nll.mean_value == 0
        assert self.valid_metrics.nll.weight == 0

    def on_validation_epoch_end(self):
        # self._restore_non_ema_params()
        # if (not self.trainer.sanity_checking
        #     and self.config.eval.generate_samples
        #     and self.trainer.global_rank == 0):
        #   self.config.sampling.batch_size = 1
        #   if self.config.is_vision:
        #     samples = []
        #     if self.config.training.guidance is not None:
        #       # Generate one image per class (up to 10 images).
        #       guidance = {
        #         'method': 'cfg', 'condition': 0, 'gamma': 1.0}
        #       omegaconf.OmegaConf.update(
        #         self.config, key='guidance', value=guidance,
        #         force_add=True)
        #       for i in range(max(self.config.data.num_classes, 10)):
        #         self.config.guidance.condition = i
        #         samples.append(self.sample())
        #     else:
        #       # Generate ten images.
        #       for i in range(10):
        #         samples.append(self.sample())
        #     image_samples = self.tokenizer.batch_decode(
        #       torch.concat(samples, dim=0))
        #     if hasattr(self.trainer.logger, 'log_image'):
        #       self.trainer.logger.log_image(
        #         key=f"samples@global_step{self.global_step}",
        #         caption=[str(i) for i in range(len(samples))],
        #         images=[s for s in image_samples.float()])
        #   else:
        #     if self.config.training.guidance is not None:
        #       guidance = {
        #         'method': 'cfg', 'condition': 0, 'gamma': 1.0}
        #       omegaconf.OmegaConf.update(
        #         self.config, key='guidance', value=guidance,
        #         force_add=True)
        #       for i in range(self.config.data.num_classes):
        #         self.config.guidance.condition = i
        #         samples = self.sample()
        #         decoded_samples = self.tokenizer.batch_decode(
        #           samples)
        #         if hasattr(self.trainer.logger, 'log_table'):
        #           # Log some generated samples.
        #           self.trainer.logger.log_table(
        #             key=f"samples@global_step{self.global_step}_class-{i}",
        #             columns=['Generated Samples'],
        #             data=[decoded_samples])
        #     else:
        #       self.config.sampling.batch_size = 2
        #       samples = self.sample()
        #       decoded_samples = self.tokenizer.batch_decode(
        #         samples)
        #       if hasattr(self.trainer.logger, 'log_table'):
        #         # Log some generated samples.
        #         self.trainer.logger.log_table(
        #           key=f"samples@global_step{self.global_step}",
        #           columns=['Generated Samples'],
        #           data=[[s] for s in decoded_samples])
        gc.collect()
        torch.cuda.empty_cache()
        self._restore_non_ema_params()

    def _sample_prior(self, *batch_dims):
        if self.diffusion == 'absorbing_state':
            return self.mask_index * torch.ones(
                *batch_dims, dtype=torch.int64, device=self.device)
        if self.diffusion == 'uniform':
            return torch.randint(
                0, self.vocab_size, batch_dims, dtype=torch.int64,
                device=self.device)
        if self.diffusion == 'uniform_data_marginals':
            if self.limiting_distribution.squeeze().ndim == 2:
                batch_dims = (batch_dims[0],)
            return torch.distributions.Categorical(
                self.limiting_distribution.squeeze()).sample(
                sample_shape=torch.Size(batch_dims))
        raise NotImplementedError(
            f'Diffusion type {self.diffusion} not '
            'implemented.')

    def sample(
            self,
            eps=1e-5,  # Note: differs from self.config.training.sampling_eps.
            target_sequence: torch.Tensor = None,
            target_motifs: torch.Tensor = None,
            classifier_model=None):
        """Generates samples from the (EMA) model.

        Supports both AR and diffusion sampling.
        Supports:
            - standard decoding,
            - classifier-free guidance,
            - classifier-based guidance:
                CBG / FUDGE, NOS / PPLM.
        """
        # WARNING: Lightning auto-casting is not working in this method.
        if not self.config.eval.disable_ema:
            self.load_ema_params()
        if getattr(self.config, 'guidance', None) is not None:
            if self.config.guidance.method == 'cfg':
                cond = (torch.ones(self.config.sampling.batch_size,
                                   device=self.device) *
                        self.config.guidance.condition).to(torch.long)
            else:
                cond = None
            if ((self.parameterization == 'ar'
                 and self.config.guidance.method in {'fudge', 'pplm'})
                    or self.config.guidance.method in {'cbg', 'nos'}):
                if classifier_model is None:
                    classifier_model = classifier.Classifier.load_from_checkpoint(
                        self.config.guidance.classifier_checkpoint_path,
                        tokenizer=self.tokenizer,
                        config=self.config, logger=False)
                    classifier_model = classifier_model.to(self.device)
                classifier_model.eval()
            else:
                classifier_model = None
        else:
            classifier_model, cond = None, None

        if self.parameterization == 'ar':
            samples = self._ar_sample(
                classifier_model=classifier_model, cond=cond)
        else:  # Diffusion sampling.
            samples = self._diffusion_sample(
                classifier_model=classifier_model, cond=cond,
                eps=eps,
                target_sequence=target_sequence,
                target_motifs=target_motifs)
        if not self.config.eval.disable_ema:
            self._restore_non_ema_params()
        return samples

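    # Hypothetical usage sketch (attribute names follow the config
    # keys referenced above; not part of the module):
    #
    #   model.config.sampling.batch_size = 4
    #   seqs = model.sample()   # decoding per the current config
    #   # With config.guidance = {'method': 'cfg', 'condition': 2,
    #   #                         'gamma': 2.0}, sample() instead
    #   # decodes sequences guided toward class 2.
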
    @torch.no_grad()
    def _ar_sample(
            self,
            classifier_model: typing.Optional[classifier.Classifier] = None,
            cond: typing.Optional[torch.Tensor] = None,
    ):
        # Precompute the token buffer.
        num_pred_tokens = self.config.model.length - 1
        x = torch.zeros(
            (self.config.sampling.batch_size, num_pred_tokens + 1),
            dtype=torch.long,
            device=self.device)
        x[:, 0] = self.tokenizer.bos_token_id
        # Precompute Gumbel sampling noise.
        if (getattr(self.config, 'guidance', None) is not None
                and self.config.guidance.method == 'fudge'):
            noise = torch.distributions.Gumbel(0, 1).sample(
                (self.config.sampling.batch_size,  # type: ignore
                 num_pred_tokens,
                 self.config.guidance.topk)).to(self.device)
        else:
            noise = torch.distributions.Gumbel(0, 1).sample(
                (self.config.sampling.batch_size,  # type: ignore
                 num_pred_tokens,
                 self.vocab_size)).to(self.device)
        if self.config.sampling.use_float64:
            noise = noise.to(torch.float64)
        pbar = tqdm(range(num_pred_tokens), desc='AR Sampling',
                    leave=False)
        inference_params = InferenceParams(
            max_seqlen=num_pred_tokens,
            max_batch_size=x.shape[0],
            seqlen_offset=1)
        # For CFG we do two forward passes, one for the conditional
        # model and one for the unconditional, so we need two
        # copies of inference_params.
        uncond_inference_params = InferenceParams(
            max_seqlen=num_pred_tokens,
            max_batch_size=x.shape[0],
            seqlen_offset=1)
        for i in pbar:
            if getattr(self.config, 'guidance', None) is None:
                if self.config.backbone == 'dimamba':
                    log_probs = self.forward(
                        x[:, i:i + 1], None, cond=None,
                        inference_params=inference_params)
                else:
                    log_probs = self.forward(x[:, :i + 1],
                                             None, cond=None)
                if self.config.sampling.use_float64:
                    log_probs = log_probs.to(torch.float64)
                next_log_probs = log_probs[:, -1]
                y = (next_log_probs + noise[:, i]).argmax(-1)
            else:
                if self.config.guidance.method == 'cfg':
                    if self.config.backbone == 'dimamba':
                        next_log_probs = self._ar_cfg_denoise(
                            cond=cond,
                            gamma=self.config.guidance.gamma,
                            x=x[:, i:i + 1],
                            i=i,
                            inference_params=(inference_params,
                                              uncond_inference_params))
                    else:
                        next_log_probs = self._ar_cfg_denoise(
                            cond=cond,
                            gamma=self.config.guidance.gamma,
                            x=x,
                            i=i)
                    y = (next_log_probs + noise[:, i]).argmax(-1)
                elif self.config.guidance.method == 'fudge':
                    if self.config.backbone == 'dimamba':
                        next_log_probs, top_indices = self._ar_fudge_denoise(
                            classifier_model=classifier_model,
                            guidance_cond=self.config.guidance.condition,
                            topk=self.config.guidance.topk,
                            gamma=self.config.guidance.gamma,
                            x=x[:, i:i + 1],
                            i=i,
                            inference_params=inference_params)
                    else:
                        next_log_probs, top_indices = self._ar_fudge_denoise(
                            classifier_model=classifier_model,
                            guidance_cond=self.config.guidance.condition,
                            topk=self.config.guidance.topk,
                            gamma=self.config.guidance.gamma,
                            x=x,
                            i=i)
                    y = torch.gather(
                        top_indices,
                        1,
                        (next_log_probs + noise[:, i]).argmax(-1).unsqueeze(1)
                    ).squeeze(1)
                elif self.config.guidance.method == 'pplm':
                    raise NotImplementedError
                else:
                    raise NotImplementedError(
                        f"Guidance method {self.config.guidance.method} "
                        "not implemented.")
            pbar.set_postfix(
                prob_check=(next_log_probs.exp().sum() / x.shape[0]).item(),
                nan_check=bool(next_log_probs.isnan().sum() > 0))
            x[:, i + 1] = y
        return x

    def _ar_cfg_denoise(
            self,
            cond: torch.Tensor,
            gamma: float,
            x: torch.Tensor,
            i: int,
            **kwargs
    ) -> torch.Tensor:
        if gamma == 0.0:  # Sample unconditionally.
            mask_cond = (torch.ones_like(cond) *
                         self.config.data.num_classes)
            if self.config.backbone == 'dimamba':
                inference_params = kwargs.pop('inference_params')
                log_probs = self.forward(
                    x[:, :i + 1], None, cond=mask_cond,
                    inference_params=inference_params[1])
            else:
                log_probs = self.forward(
                    x[:, :i + 1], None, cond=mask_cond, **kwargs)
        elif gamma == 1.0:  # Sample conditionally.
            if self.config.backbone == 'dimamba':
                inference_params = kwargs.pop('inference_params')
                log_probs = self.forward(
                    x[:, :i + 1], None, cond=cond,
                    inference_params=inference_params[0])
            else:
                log_probs = self.forward(
                    x[:, :i + 1], None, cond=cond, **kwargs)
        else:  # Sample from the tempered distribution.
            mask_cond = (torch.ones_like(cond) *
                         self.config.data.num_classes)
            if self.config.backbone == 'dimamba':
                inference_params = kwargs.pop('inference_params')
                log_probs_cond = self.forward(
                    x[:, :i + 1], None, cond=cond,
                    inference_params=inference_params[0])
                log_probs_uncond = self.forward(
                    x[:, :i + 1], None, cond=mask_cond,
                    inference_params=inference_params[1])
            else:
                log_probs_cond = self.forward(
                    x[:, :i + 1], None, cond=cond, **kwargs)
                log_probs_uncond = self.forward(
                    x[:, :i + 1], None, cond=mask_cond, **kwargs)

            log_probs = gamma * log_probs_cond + (1 - gamma) * log_probs_uncond
            # Gamma > 1.0 causes instability for Mamba; re-normalize.
            log_probs = log_probs.log_softmax(dim=-1)
        return log_probs[:, -1]

    def _ar_fudge_denoise(
            self,
            classifier_model: classifier.Classifier,
            guidance_cond: int,
            topk: int,
            gamma: float,
            x: torch.Tensor,
            i: int,
            **kwargs
    ) -> typing.Tuple[torch.Tensor, torch.LongTensor]:
        log_probs = self.forward(
            x[:, :i + 1], None, cond=None, **kwargs)
        next_log_probs = log_probs[:, -1]
        top_logits, top_indices = next_log_probs.topk(topk, dim=-1)
        t_candidates = torch.cat(
            [x[:, :i + 1].unsqueeze(1).expand(-1, topk, -1),
             top_indices.unsqueeze(2)],
            dim=2).view(-1, i + 2)  # (B * K), L

        t = torch.zeros(t_candidates.shape[0],
                        device=self.device)
        sigma, dsigma = self.noise(t)
        time_conditioning = sigma[:, None]

        classifier_log_prob = classifier_model.get_log_probs(
            t_candidates, time_conditioning)
        classifier_log_prob = classifier_log_prob[:, i + 1, :].view(
            x.shape[0], topk, -1)[..., guidance_cond]  # (batch, topk)
        next_log_probs = (top_logits
                          + gamma * classifier_log_prob).log_softmax(dim=-1)
        return next_log_probs, top_indices

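    # FUDGE reweighting sketch (illustrative log-probs): only the
    # top-k LM tokens are rescored by the classifier,
    # score = log p_LM(y | x) + gamma * log p_clf(c | x, y).
    #
    #   top_logits   = [-0.5, -1.0, -2.0]
    #   clf_log_prob = [-3.0, -0.2, -0.1]
    #   gamma = 1   ->  [-3.5, -1.2, -2.1]  -> candidate 1 wins
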
    def _ar_pplm_denoise(
            self,
            classifier_model: classifier.Classifier,
            guidance_cond: int,
            num_ppl_steps: int,
            pplm_step_size: float,
            pplm_stability_coef: float,
            x: torch.Tensor,
            i: int,
    ):
        raise NotImplementedError

    @torch.no_grad()
    def _diffusion_sample(
            self,
            classifier_model: typing.Optional[classifier.Classifier] = None,
            cond: typing.Optional[torch.Tensor] = None,
            eps: float = 1e-5,  # Note: differs from self.config.training.sampling_eps.
            target_sequence: torch.Tensor = None,
            target_motifs: torch.Tensor = None,
    ):
        xt = self._sample_prior(
            self.config.sampling.batch_size,
            self.config.model.length
        ).to(self.device)

        timesteps = torch.linspace(
            1, eps, self.config.sampling.steps + 1, device=self.device)
        dt = (1 - eps) / self.config.sampling.steps
        pbar = tqdm(range(self.config.sampling.steps),
                    desc='Sampling',
                    leave=False)
        NFEs = 0
        cache = None

        for i in pbar:
            t = timesteps[i]
            if self.T > 0:  # t in {1/T, ..., 1}, to match training.
                t = (t * self.T).to(torch.int)
                t = t / self.T
                t += (1 / self.T)
            t = t * torch.ones(xt.shape[0], 1, device=self.device)
            if cache is None:
                NFEs += 1
            sigma_t, _ = self.noise(t)
            sigma_s, _ = self.noise(t - dt)
            if sigma_t.ndim > 1:
                sigma_t = sigma_t.squeeze(-1)
            if sigma_s.ndim > 1:
                sigma_s = sigma_s.squeeze(-1)
            assert sigma_t.ndim == 1, sigma_t.shape
            assert sigma_s.ndim == 1, sigma_s.shape
            move_chance_t = 1 - torch.exp(-sigma_t)
            move_chance_s = 1 - torch.exp(-sigma_s)
            move_chance_t = move_chance_t[:, None, None]
            move_chance_s = move_chance_s[:, None, None]
            assert move_chance_t.ndim == 3, move_chance_t.shape

            if getattr(self.config, 'guidance', None) is None:
                xs, q_xs, cache = self._ddpm_denoise(
                    xt=xt,
                    time_conditioning=sigma_t,
                    move_chance_t=move_chance_t,
                    move_chance_s=move_chance_s,
                    cache=cache)
            else:
                if self.config.guidance.method == 'cfg':
                    xs, q_xs, cache = self._cfg_denoise(
                        cond=cond,
                        gamma=self.config.guidance.gamma,
                        xt=xt,
                        time_conditioning=sigma_t,
                        move_chance_t=move_chance_t,
                        move_chance_s=move_chance_s,
                        cache=cache)
                elif self.config.guidance.method == 'cbg':
                    xs, q_xs, cache = self._cbg_denoise(
                        classifier_model=classifier_model,
                        conditioning_class=self.config.guidance.condition,
                        gamma=self.config.guidance.gamma,
                        use_approx=self.config.guidance.use_approx,
                        xt=xt,
                        time_conditioning=sigma_t,
                        move_chance_t=move_chance_t,
                        move_chance_s=move_chance_s,
                        target_sequence=target_sequence,
                        target_motifs=target_motifs,
                        cache=cache)
                elif self.config.guidance.method == 'nos':
                    xs, q_xs, cache = self._nos_denoise(
                        classifier_model=classifier_model,
                        conditioning_class=self.config.guidance.condition,
                        num_nos_steps=self.config.guidance.num_nos_steps,
                        nos_step_size=self.config.guidance.nos_step_size,
                        nos_stability_coef=self.config.guidance.nos_stability_coef,
                        xt=xt,
                        time_conditioning=sigma_t,
                        move_chance_t=move_chance_t,
                        move_chance_s=move_chance_s)
                else:
                    raise NotImplementedError(
                        f"Guidance method {self.config.guidance.method} "
                        "not implemented.")
            pbar.set_postfix(
                NFEs=NFEs,
                prob_check=(q_xs.sum() / xt.numel()).item(),
                nan_check=bool(q_xs.isnan().sum() > 0))
            if (not self.config.sampling.use_cache
                    or not torch.allclose(xs, xt)):
                # Disable caching.
                cache = None
            xt = xs
        return xt

    def _ddpm_denoise(
            self,
            xt: torch.Tensor,
            time_conditioning: torch.Tensor,
            move_chance_t: torch.Tensor,
            move_chance_s: torch.Tensor,
            cache: typing.Optional[typing.Dict[str, torch.Tensor]] = None,
    ) -> typing.Tuple[torch.Tensor, torch.Tensor,
                      typing.Dict[str, torch.Tensor]]:
        # Compute x_theta.
        if cache is not None:
            log_x_theta = cache['log_x_theta']
        else:
            log_x_theta = self.forward(xt, time_conditioning,
                                       cond=None)
            if self.config.sampling.use_float64:
                log_x_theta = log_x_theta.to(torch.float64)
        x_theta = log_x_theta.exp()

        # Compute the posterior.
        if self.diffusion == 'absorbing_state':
            q_xs = x_theta * (move_chance_t - move_chance_s)
            q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
            q_xs /= move_chance_t
        elif self.diffusion == 'uniform':
            q_xs = self._compute_posterior(
                x=x_theta,
                xt=xt,
                alpha_s=1 - move_chance_s,
                alpha_t=1 - move_chance_t)
        else:
            raise NotImplementedError(
                f"Diffusion type {self.diffusion} not implemented.")

        # Sample from the posterior.
        xs = _sample_categorical(q_xs)
        if self.diffusion == 'absorbing_state':
            # Carry-over: tokens that are already unmasked never change.
            copy_flag = (xt != self.mask_index).to(torch.bool)
            q_xs[copy_flag] = 0.0
            q_xs[copy_flag, xt[copy_flag]] = 1.0
            xs = torch.where(copy_flag, xt, xs)

        return xs, q_xs, {'log_x_theta': log_x_theta}

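    # Absorbing-state reverse step, worked with made-up numbers:
    # for move_chance_t = 0.5 and move_chance_s = 0.4, a masked
    # position stays masked with probability 0.4 / 0.5 = 0.8 and
    # otherwise decodes from x_theta with total mass
    # (0.5 - 0.4) / 0.5 = 0.2; the two cases sum to one.
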
    def _cfg_denoise(
            self,
            cond: torch.Tensor,
            gamma: float,
            xt: torch.Tensor,
            time_conditioning: torch.Tensor,
            move_chance_t: torch.Tensor,
            move_chance_s: torch.Tensor,
            cache: typing.Optional[typing.Dict[str, torch.Tensor]] = None,
    ) -> typing.Tuple[torch.Tensor, torch.Tensor,
                      typing.Dict[str, torch.Tensor]]:
        # Compute log_x_theta.
        if cache is not None:
            log_x_theta_uncond = cache['log_x_theta_uncond']
            log_x_theta_cond = cache['log_x_theta_cond']
        else:
            if gamma == 0.0:  # Sample unconditionally.
                mask_cond = (torch.ones_like(cond) *
                             self.config.data.num_classes)
                log_x_theta_uncond = self.forward(
                    xt, time_conditioning, cond=mask_cond)
                log_x_theta_cond = None
            elif gamma == 1.0:  # Sample conditionally.
                log_x_theta_cond = self.forward(xt, time_conditioning,
                                                cond=cond)
                log_x_theta_uncond = None
            else:  # Sample from the tempered distribution.
                log_x_theta_cond = self.forward(xt, time_conditioning,
                                                cond=cond)
                mask_cond = (torch.ones_like(cond) *
                             self.config.data.num_classes)
                log_x_theta_uncond = self.forward(xt,
                                                  time_conditioning,
                                                  cond=mask_cond)
        # Compute the (weighted) posterior.
        if (log_x_theta_cond is None  # gamma == 0
                or log_x_theta_uncond is None):  # or gamma == 1
            log_x_theta = (log_x_theta_uncond
                           if log_x_theta_uncond is not None
                           else log_x_theta_cond)
            x_theta = log_x_theta.exp()
            if self.diffusion == 'absorbing_state':
                q_xs = x_theta * (move_chance_t - move_chance_s)
                q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
                q_xs /= move_chance_t
            elif self.diffusion == 'uniform':
                q_xs = self._compute_posterior(
                    x=x_theta,
                    xt=xt,
                    alpha_s=1 - move_chance_s,
                    alpha_t=1 - move_chance_t)
            else:
                raise NotImplementedError(
                    f"Diffusion type {self.diffusion} not implemented.")
        else:  # gamma != 0 and gamma != 1
            if self.diffusion == 'absorbing_state':
                log_x_theta = (gamma * log_x_theta_cond
                               + (1 - gamma) * log_x_theta_uncond)
                x_theta = log_x_theta.softmax(dim=-1)
                q_xs = x_theta * (move_chance_t - move_chance_s)
                q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
                q_xs /= move_chance_t
            elif (self.diffusion == 'uniform'
                  or self.diffusion == 'uniform_data_marginals'):
                log_q_xs_uncond = self._compute_posterior(
                    x=log_x_theta_uncond.exp(),
                    xt=xt,
                    alpha_s=1 - move_chance_s,
                    alpha_t=1 - move_chance_t).log()
                log_q_xs_cond = self._compute_posterior(
                    x=log_x_theta_cond.exp(),
                    xt=xt,
                    alpha_s=1 - move_chance_s,
                    alpha_t=1 - move_chance_t).log()
                log_q_xs = (gamma * log_q_xs_cond +
                            (1 - gamma) * log_q_xs_uncond)
                q_xs = log_q_xs.softmax(dim=-1)
            else:
                raise NotImplementedError(
                    f"Diffusion type {self.diffusion} not implemented.")

        # Sample from the posterior.
        xs = _sample_categorical(q_xs)
        if self.diffusion == 'absorbing_state':
            copy_flag = (xt != self.mask_index).to(torch.bool)
            q_xs[copy_flag] = 0.0
            q_xs[copy_flag, xt[copy_flag]] = 1.0
            xs = torch.where(copy_flag, xt, xs)

        return xs, q_xs, {'log_x_theta_uncond': log_x_theta_uncond,
                          'log_x_theta_cond': log_x_theta_cond}

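    # CFG tempering sketch: the guided prediction interpolates
    # (and, for gamma > 1, extrapolates) log-probabilities,
    # gamma * log p(x | c) + (1 - gamma) * log p(x). E.g., with
    # cond = -1.0, uncond = -2.0 and gamma = 2.0 the raw score is
    # 2 * (-1.0) - 1 * (-2.0) = 0.0 before renormalization.
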
    def _cbg_denoise(
            self,
            conditioning_class: int,
            gamma: float,
            classifier_model: classifier.Classifier,
            xt: torch.Tensor,
            time_conditioning: torch.Tensor,
            move_chance_t: torch.Tensor,
            move_chance_s: torch.Tensor,
            target_sequence: torch.Tensor = None,
            target_motifs: torch.Tensor = None,
            use_approx: bool = False,  # Whether to use the first-order approximation.
            cache: typing.Optional[typing.Dict[str, torch.Tensor]] = None,
    ) -> typing.Tuple[torch.Tensor, torch.Tensor,
                      typing.Dict[str, torch.Tensor]]:
        if cache is not None:
            log_x_theta = cache['log_x_theta']
            classifier_log_prob = cache['classifier_log_prob']
        else:
            # Diffusion model.
            log_x_theta = self.forward(xt, time_conditioning,
                                       cond=None)
            # Classifier model.
            if use_approx:
                xt_one_hot = torch.nn.functional.one_hot(
                    xt, self.vocab_size).to(torch.float)
                with torch.enable_grad():
                    xt_one_hot.requires_grad_(True)
                    classifier_log_prob_xt = classifier_model.get_log_probs(
                        xt_one_hot, time_conditioning)
                    classifier_log_prob_xt[..., conditioning_class].sum().backward()
                    grad_log_prob_xt = xt_one_hot.grad

                # First-order (Taylor) estimate of the log-ratio.
                classifier_log_prob_ratio = (
                    grad_log_prob_xt
                    - (xt_one_hot * grad_log_prob_xt).sum(dim=-1, keepdim=True)
                ).detach().requires_grad_(False)
                classifier_log_prob = (
                    classifier_log_prob_ratio +
                    classifier_log_prob_xt[..., conditioning_class][..., None, None]
                ).detach().requires_grad_(False)
            else:
                # Copied from https://github.com/hnisonoff/discrete_guidance/blob/main/src/fm_utils.py#L441
                bsz, seq_len = xt.shape
                # Create bsz*seq_len*N copies of the input sequences.
                # Shape: (bsz, 1, seq_len) -> (bsz, seq_len*N, seq_len)
                # (where N = vocab_size).
                xt_expand = xt.unsqueeze(1).repeat(
                    1, seq_len * self.vocab_size, 1)
                # Flatten the batch and transition dimensions.
                # Shape: (bsz, seq_len*N, seq_len) -> (bsz*seq_len*N, seq_len)
                xt_expand = xt_expand.view(-1, seq_len)

                # Create indices for all possible transitions.
                # Shape: (seq_len*N,) -> (bsz, seq_len*N) -> (bsz*seq_len*N,)
                jump_idx = torch.arange(seq_len * self.vocab_size).to(xt.device)
                jump_idx = jump_idx.repeat(bsz, 1).flatten()

                # Create a tensor for states after one transition.
                xt_jumps = xt_expand.clone()

                # Calculate which dimension changes for each transition.
                # Shape: (bsz*seq_len*N,)
                jump_dims = jump_idx // self.vocab_size

                # Calculate the new value for the changed dimension.
                # Shape: (bsz*seq_len*N,)
                jump_states = jump_idx % self.vocab_size

                # Apply transitions by assigning new values at the
                # transition dimensions.
                # Shape: (bsz*seq_len*N, seq_len)
                xt_jumps[
                    torch.arange(jump_idx.size(0), device=xt.device),
                    jump_dims,  # Index the transitioned dimension.
                ] = jump_states  # Assign the new state.

                # classifier_log_prob = (classifier_model.get_log_probs(
                #     xt_jumps, time_conditioning.repeat(seq_len * self.vocab_size)
                # ))[..., conditioning_class].reshape(bsz, seq_len, self.vocab_size)

                target_sequence = target_sequence.to(self.device)
                mask_vec = torch.tensor(
                    [1 if i - 1 in target_motifs else 0
                     for i in range(target_sequence.shape[1])]).to(self.device)

                bindevaluator_probs = classifier_model.get_probs(
                    xt_jumps, target_sequence.repeat(xt_jumps.shape[0], 1))

                # Guard against log(0).
                bindevaluator_probs = torch.where(
                    bindevaluator_probs == 0,
                    torch.tensor(1e-8, dtype=bindevaluator_probs.dtype),
                    bindevaluator_probs)
                classifier_log_prob = torch.log(bindevaluator_probs) * mask_vec

                # Average the log-probabilities over the motif positions.
                classifier_log_prob = classifier_log_prob.sum(dim=-1) / mask_vec.sum()
                classifier_log_prob = classifier_log_prob.reshape(
                    bsz, seq_len, self.vocab_size)

                # classifier_log_prob = (torch.exp(classifier_model.get_log_probs(
                #     xt_jumps, target_sequence.repeat(xt_jumps.shape[0], 1)
                # )) * mask_vec).sum(dim=-1).log().reshape(bsz, seq_len, self.vocab_size)

                # (bsz, seq_len, N) / (bsz, seq_len, N, tgt_len)

        # Compute the unguided posterior.
        if self.diffusion == 'absorbing_state':
            diffusion_log_probs = log_x_theta + torch.log(
                1. - (move_chance_s / move_chance_t))
            diffusion_log_probs[..., self.mask_index] = torch.log(
                move_chance_s / move_chance_t)[:, :, 0]
            diffusion_log_probs = diffusion_log_probs.detach()
        elif self.diffusion == 'uniform':
            diffusion_log_probs = self._compute_posterior(
                x=log_x_theta.exp(),
                xt=xt,
                alpha_s=1 - move_chance_s,
                alpha_t=1 - move_chance_t).log()
        else:
            raise NotImplementedError(
                f"Diffusion type {self.diffusion} not implemented.")

        # Apply guidance.
        with torch.no_grad():
            if self.diffusion == 'absorbing_state':
                guided_log_probs = (gamma * classifier_log_prob) + diffusion_log_probs
                copy_flag = (xt != self.mask_index)
                guided_log_probs[copy_flag] = self.neg_infinity
                guided_log_probs[copy_flag, xt[copy_flag]] = 0.0
            elif self.diffusion == 'uniform':
                guided_log_probs = (gamma * classifier_log_prob) + diffusion_log_probs
            else:
                raise NotImplementedError(
                    f"Diffusion type {self.diffusion} not implemented.")

            guided_probs = guided_log_probs.softmax(dim=-1)
        # Sample from the guided posterior.
        xs = _sample_categorical(guided_probs)
        if self.diffusion == 'absorbing_state':
            xs = torch.where(copy_flag.to(bool), xt, xs)
        return xs, guided_probs, {'log_x_theta': log_x_theta,
                                  'classifier_log_prob': classifier_log_prob}

    def _nos_denoise(
            self,
            classifier_model: classifier.Classifier,
            num_nos_steps: int,
            nos_step_size: float,
            nos_stability_coef: float,
            conditioning_class: int,
            xt: torch.Tensor,
            time_conditioning: torch.Tensor,
            move_chance_t: torch.Tensor,
            move_chance_s: torch.Tensor,
    ) -> typing.Tuple[torch.Tensor, torch.Tensor, None]:
        # Compute the original diffusion_log_probs and hidden states.
        copy_flag = (xt != self.mask_index).to(torch.bool)
        with torch.no_grad():
            time_conditioning = self._process_sigma(time_conditioning)
            with torch.cuda.amp.autocast(dtype=torch.float32):
                logits, hidden_states = self.backbone(
                    xt, time_conditioning, cond=None,
                    return_hidden_states=True)
            if self.parameterization == 'subs':
                log_x_theta = self._subs_parameterization(
                    logits=logits, xt=xt)
            elif self.parameterization == 'd3pm':
                # Returns log_probs.
                if self.subs_masking:  # Can use "zero masking prob".
                    logits[:, :, self.mask_index] += self.neg_infinity
                log_x_theta = logits.log_softmax(dim=-1)
            else:
                raise NotImplementedError(
                    f"Parameterization {self.parameterization} "
                    "not implemented for NOS guidance.")
            if self.diffusion == 'absorbing_state':
                diffusion_log_probs = log_x_theta + torch.log(
                    1. - (move_chance_s / move_chance_t))
                diffusion_log_probs[..., self.mask_index] = torch.log(
                    move_chance_s / move_chance_t)[:, :, 0]
                diffusion_log_probs[copy_flag] = self.neg_infinity
                diffusion_log_probs[copy_flag, xt[copy_flag]] = 0.0
            elif self.diffusion == 'uniform':
                diffusion_log_probs = self._compute_posterior(
                    x=log_x_theta.exp(),
                    xt=xt,
                    alpha_s=1 - move_chance_s,
                    alpha_t=1 - move_chance_t).log()

        # Perform NOS steps.
        kl_loss = torch.nn.KLDivLoss(reduction='batchmean',
                                     log_target=True)
        delta = torch.nn.Parameter(
            torch.zeros_like(hidden_states[-1]),
            requires_grad=True)
        optimizer = torch.optim.Adagrad([delta], lr=nos_step_size)
        with torch.enable_grad():
            for _ in tqdm(range(num_nos_steps),
                          desc='NOS', leave=False):
                h_current = hidden_states[-1] + delta
                target_loss = classifier_model.get_log_probs(
                    xt, time_conditioning,
                    x_emb=h_current)[..., conditioning_class].sum()
                with torch.cuda.amp.autocast(dtype=torch.float32):
                    new_logits = self.forward(xt, time_conditioning,
                                              cond=None,
                                              x_emb=h_current)
                if self.diffusion == 'absorbing_state':
                    adjusted_log_probs = new_logits + torch.log(
                        1. - (move_chance_s / move_chance_t))
                    adjusted_log_probs[
                        ..., self.mask_index] = torch.log(
                        move_chance_s / move_chance_t)[:, :, 0]
                    adjusted_log_probs[copy_flag] = self.neg_infinity
                    adjusted_log_probs[copy_flag, xt[copy_flag]] = 0.0
                elif self.diffusion == 'uniform':
                    adjusted_log_probs = self._compute_posterior(
                        x=new_logits.exp(),
                        xt=xt,
                        alpha_s=1 - move_chance_s,
                        alpha_t=1 - move_chance_t).log()
                kl = kl_loss(adjusted_log_probs, diffusion_log_probs)
                loss = -target_loss + nos_stability_coef * kl
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        with torch.cuda.amp.autocast(dtype=torch.float32):
            guided_logits = self.forward(
                xt, time_conditioning,
                cond=None,
                x_emb=hidden_states[-1] + delta.data)
        if self.diffusion == 'absorbing_state':
            diffusion_log_probs = guided_logits + torch.log(
                1. - (move_chance_s / move_chance_t))
            diffusion_log_probs[
                ..., self.mask_index] = torch.log(
                move_chance_s / move_chance_t)[:, :, 0]
            diffusion_log_probs = diffusion_log_probs.detach()
            guided_probs = diffusion_log_probs.exp()
        elif self.diffusion == 'uniform':
            guided_probs = self._compute_posterior(
                x=guided_logits.exp(),
                xt=xt,
                alpha_s=1 - move_chance_s,
                alpha_t=1 - move_chance_t).detach()
        else:
            raise NotImplementedError(
                f"Diffusion type {self.diffusion} not implemented.")

        xs = _sample_categorical(guided_probs)
        if self.diffusion == 'absorbing_state':
            xs = torch.where(copy_flag, xt, xs)

        return xs, guided_probs, None
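
    # NOS in brief: instead of editing tokens, it optimizes a
    # perturbation `delta` on the final hidden states so that the
    # classifier's log-probability of the target class increases,
    # while the KL term keeps the perturbed denoising distribution
    # close to the unguided one; nos_stability_coef trades off the
    # two objectives.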
eval_utils.py ADDED
@@ -0,0 +1,90 @@
import os

import torch
import transformers
from tqdm import tqdm

import diffusion


def compute_ppl(
        pretrained_model,
        val_ds
):
    ppl_metrics = diffusion.Perplexity().to('cuda')
    pbar = tqdm(val_ds, desc='PPL')
    for batch in pbar:
        input_ids = batch['input_ids'].to('cuda')
        if 'attention_mask' in batch:
            attention_mask = batch['attention_mask'].to('cuda')
        else:
            attention_mask = None
        losses = pretrained_model._loss(input_ids, attention_mask)
        ppl_metrics.update(losses.nlls, losses.token_mask)
        pbar.set_postfix({'ppl': ppl_metrics.compute().item()})
    return ppl_metrics.compute().item()


def compute_generative_ppl(
        sentences,
        eval_model_name_or_path,
        gen_ppl_eval_batch_size=8,
        max_length=128):
    gen_ppl_metric = diffusion.Perplexity().to('cuda')
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    eval_model_tokenizer = transformers.AutoTokenizer.from_pretrained(
        eval_model_name_or_path)
    if eval_model_tokenizer.pad_token is None:
        eval_model_tokenizer.pad_token = \
            eval_model_tokenizer.eos_token
        eval_model_tokenizer.pad_token_id = \
            eval_model_tokenizer.eos_token_id
    eval_model = transformers.AutoModelForCausalLM.from_pretrained(
        eval_model_name_or_path).eval()
    eval_model = eval_model.to('cuda')
    # Re-tokenize using the eval model's tokenizer.
    tokenizer_kwargs = {
        'return_tensors': 'pt',
        'return_token_type_ids': False,
        'return_attention_mask': True,
        'truncation': True,
        'padding': True,
        'max_length': max_length,
    }
    eval_context_size = 1024
    samples = eval_model_tokenizer(
        sentences, **tokenizer_kwargs)
    attn_mask = samples['attention_mask']
    samples = samples['input_ids']
    attn_mask = attn_mask.to('cuda')
    samples = samples.to('cuda')
    num_batches = samples.shape[0] // gen_ppl_eval_batch_size
    for i in tqdm(range(num_batches),
                  desc='Gen. PPL', leave=False):
        _samples = torch.split(
            samples[i * gen_ppl_eval_batch_size:
                    (i + 1) * gen_ppl_eval_batch_size],
            eval_context_size,
            dim=-1)
        _attn_mask = torch.split(
            attn_mask[i * gen_ppl_eval_batch_size:
                      (i + 1) * gen_ppl_eval_batch_size],
            eval_context_size,
            dim=-1)
        for (sample_chunk, attn_mask_chunk) in zip(
                _samples, _attn_mask):
            logits = eval_model(
                sample_chunk, attention_mask=attn_mask_chunk)[0]
            logits = logits.transpose(-1, -2)

            nlls = torch.nn.functional.cross_entropy(
                logits[..., :-1],
                sample_chunk[..., 1:],
                reduction='none')
            # first_eos = (sample_chunk == eval_model_tokenizer.eos_token_id).cumsum(-1) == 1
            # token_mask = (sample_chunk != eval_model_tokenizer.eos_token_id)
            # gen_ppl_metric.update(
            #     nlls, first_eos[..., 1:] + token_mask[..., 1:])
            gen_ppl_metric.update(
                nlls, attn_mask_chunk[..., 1:])
    return gen_ppl_metric.compute().item()
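
# Hypothetical usage sketch ('gpt2-large' is just an example
# checkpoint name, not one mandated by this repo):
#
#   sentences = ['a generated sample', 'another generated sample']
#   gen_ppl = compute_generative_ppl(
#       sentences, eval_model_name_or_path='gpt2-large',
#       gen_ppl_eval_batch_size=2, max_length=128)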
noise_schedule.py ADDED
@@ -0,0 +1,160 @@
import abc

import torch
import torch.nn as nn

# Flags required to enable jit fusion kernels
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)


def get_noise(config, dtype=torch.float32):
    if config.noise.type == 'geometric':
        return GeometricNoise(config.noise.sigma_min,
                              config.noise.sigma_max)
    elif config.noise.type == 'loglinear':
        return LogLinearNoise()
    elif config.noise.type == 'cosine':
        return CosineNoise()
    elif config.noise.type == 'cosinesqr':
        return CosineSqrNoise()
    elif config.noise.type == 'linear':
        return Linear(config.noise.sigma_min,
                      config.noise.sigma_max,
                      dtype)
    else:
        raise NotImplementedError(
            f'{config.noise.type} noise schedule is not '
            f'implemented.')


def binary_discretization(z):
    z_hard = torch.sign(z)
    z_soft = z / torch.norm(z, dim=-1, keepdim=True)
    return z_soft + (z_hard - z_soft).detach()


class Noise(abc.ABC, nn.Module):
    """Base Noise class.

    Defines the forward signature, which returns the
    total noise and the rate of noise for a given timestep.
    """
    def forward(self, t):
        # Assume time goes from 0 to 1
        return self.total_noise(t), self.rate_noise(t)

    @abc.abstractmethod
    def rate_noise(self, t):
        """Rate of change of noise, i.e. g(t)."""
        pass

    @abc.abstractmethod
    def total_noise(self, t):
        r"""Total noise, i.e. \int_0^t g(s) ds + g(0)."""
        pass


class CosineNoise(Noise):
    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps

    def rate_noise(self, t):
        cos = (1 - self.eps) * torch.cos(t * torch.pi / 2)
        sin = (1 - self.eps) * torch.sin(t * torch.pi / 2)
        scale = torch.pi / 2
        return scale * sin / (cos + self.eps)

    def total_noise(self, t):
        cos = torch.cos(t * torch.pi / 2)
        return - torch.log(self.eps + (1 - self.eps) * cos)


class CosineSqrNoise(Noise):
    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps

    def rate_noise(self, t):
        cos = (1 - self.eps) * (
            torch.cos(t * torch.pi / 2) ** 2)
        sin = (1 - self.eps) * torch.sin(t * torch.pi)
        scale = torch.pi / 2
        return scale * sin / (cos + self.eps)

    def total_noise(self, t):
        cos = torch.cos(t * torch.pi / 2) ** 2
        return - torch.log(self.eps + (1 - self.eps) * cos)


class Linear(Noise):
    def __init__(self, sigma_min=0, sigma_max=10,
                 dtype=torch.float32):
        super().__init__()
        self.sigma_min = torch.tensor(sigma_min, dtype=dtype)
        self.sigma_max = torch.tensor(sigma_max, dtype=dtype)

    def rate_noise(self, t):
        return self.sigma_max - self.sigma_min

    def total_noise(self, t):
        return (self.sigma_min + t *
                (self.sigma_max - self.sigma_min))

    def importance_sampling_transformation(self, t):
        f_T = torch.log1p(- torch.exp(- self.sigma_max))
        f_0 = torch.log1p(- torch.exp(- self.sigma_min))
        sigma_t = - torch.log1p(
            - torch.exp(t * f_T + (1 - t) * f_0))
        return (sigma_t - self.sigma_min) / (
            self.sigma_max - self.sigma_min)


class GeometricNoise(Noise):
    def __init__(self, sigma_min=1e-3, sigma_max=1):
        super().__init__()
        self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])

    def rate_noise(self, t):
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (
            self.sigmas[1].log() - self.sigmas[0].log())

    def total_noise(self, t):
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t


class LogLinearNoise(Noise):
    """Log-linear noise schedule.

    Built such that 1 - 1/e^(n(t)) interpolates between 0 and
    ~1 when t varies from 0 to 1. Total noise is
    -log(1 - (1 - eps) * t), so sigma will be
    (1 - eps) * t.
    """
    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps
        self.sigma_max = self.total_noise(torch.tensor(1.0))
        self.sigma_min = self.eps + self.total_noise(
            torch.tensor(0.0))

    def rate_noise(self, t):
        return (1 - self.eps) / (1 - (1 - self.eps) * t)

    def total_noise(self, t):
        return -torch.log1p(-(1 - self.eps) * t)

    def importance_sampling_transformation(self, t):
        f_T = torch.log1p(- torch.exp(- self.sigma_max))
        f_0 = torch.log1p(- torch.exp(- self.sigma_min))
        sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
        t = - torch.expm1(- sigma_t) / (1 - self.eps)
        return t
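As a quick sanity check (not part of the repo), each schedule's `rate_noise` should be the time derivative of its `total_noise`; a finite-difference probe confirms this for `LogLinearNoise`:

```python
import torch
from noise_schedule import LogLinearNoise

noise = LogLinearNoise(eps=1e-3)
t = torch.linspace(0.01, 0.99, 50)
total, rate = noise(t)

# Central finite difference of total_noise should match rate_noise.
h = 1e-4
fd = (noise.total_noise(t + h) - noise.total_noise(t - h)) / (2 * h)
print(torch.allclose(fd, rate, rtol=1e-3))  # True
```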
requirements.yaml ADDED
@@ -0,0 +1,49 @@
name: ct_udlm
channels:
  - pytorch
  - nvidia
  - anaconda
  - defaults
dependencies:
  - cuda-nvcc=12.4.99
  - ipykernel=6.29.5
  - ipython=8.15.0
  - ipywidgets=8.1.2
  - pip=23.3.1
  - python=3.9.20
  - pip:
    - biopython==1.84
    - causal-conv1d==1.4.0
    - datasets==2.18.0
    - einops==0.8.0
    - flash-attn==2.7.2.post1
    - fsspec==2024.2.0
    - git-lfs==1.6
    - h5py==3.10.0
    - huggingface-hub==0.26.2
    - hydra-core==1.3.2
    - ipdb==0.13.13
    - jupyter==1.1.1
    - jupyterlab==4.1.8
    - lightning==2.2.1
    - lightning-utilities==0.11.9
    - mamba-ssm==1.2.0.post1
    - matplotlib==3.9.2
    - notebook==7.1.1
    - numpy==1.26.4
    - omegaconf==2.3.0
    - pandas==2.2.1
    - pytorch-image-generation-metrics==0.6.1
    - rdkit==2024.3.6
    - regex==2024.11.6
    - rich==13.7.1
    - safetensors==0.4.5
    - scikit-learn==1.4.0
    - scipy==1.13.1
    - seaborn==0.13.2
    - timm==0.9.16
    - tokenizers==0.15.2
    - torchmetrics==1.6.0
    - tqdm==4.67.0
    - transformers==4.38.2
    - wandb==0.13.5
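Assuming a standard conda setup, this environment can be created with `conda env create -f requirements.yaml` and activated with `conda activate ct_udlm`. Note that `flash-attn` and `mamba-ssm` typically need a CUDA toolchain at install time, which is presumably why `cuda-nvcc` is pinned above.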
sample.py ADDED
@@ -0,0 +1,124 @@
import os
import hydra
import lightning as L
import numpy as np
import omegaconf
import pandas as pd
import rdkit
import rich.syntax
import rich.tree
import torch
from tqdm.auto import tqdm
import pdb

import dataloader
import diffusion
from models.bindevaluator import BindEvaluator

rdkit.rdBase.DisableLog('rdApp.error')

omegaconf.OmegaConf.register_new_resolver(
    'cwd', os.getcwd)
omegaconf.OmegaConf.register_new_resolver(
    'device_count', torch.cuda.device_count)
omegaconf.OmegaConf.register_new_resolver(
    'eval', eval)
omegaconf.OmegaConf.register_new_resolver(
    'div_up', lambda x, y: (x + y - 1) // y)
omegaconf.OmegaConf.register_new_resolver(
    'if_then_else',
    lambda condition, x, y: x if condition else y)


def _print_config(
        config: omegaconf.DictConfig,
        resolve: bool = True) -> None:
    """Prints the content of a DictConfig as a tree structure using the Rich library.

    Args:
        config (DictConfig): Configuration composed by Hydra.
        resolve (bool): Whether to resolve reference fields of DictConfig.
    """
    style = 'dim'
    tree = rich.tree.Tree('CONFIG', style=style,
                          guide_style=style)

    fields = config.keys()
    for field in fields:
        branch = tree.add(field, style=style, guide_style=style)

        config_section = config.get(field)
        branch_content = str(config_section)
        if isinstance(config_section, omegaconf.DictConfig):
            branch_content = omegaconf.OmegaConf.to_yaml(
                config_section, resolve=resolve)

        branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
    rich.print(tree)


def parse_motif(motif: str) -> torch.Tensor:
    """Expands a comma-separated string of indices and inclusive
    ranges (e.g. '3-5, 8') into a flat tensor of indices."""
    parts = motif.split(',')
    result = []

    for part in parts:
        part = part.strip()
        if '-' in part:
            start, end = map(int, part.split('-'))
            result.extend(range(start, end + 1))
        else:
            result.append(int(part))

    return torch.tensor(result)


@hydra.main(version_base=None, config_path='./configs',
            config_name='config')
def main(config: omegaconf.DictConfig) -> None:
    # Reproducibility
    L.seed_everything(config.seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False

    # _print_config(config, resolve=True)
    print(f"Checkpoint: {config.eval.checkpoint_path}")

    tokenizer = dataloader.get_tokenizer(config)
    target_sequence = tokenizer(
        config.eval.target_sequence, return_tensors='pt')['input_ids']

    pretrained = diffusion.Diffusion.load_from_checkpoint(
        config.eval.checkpoint_path,
        tokenizer=tokenizer,
        config=config, logger=False)
    pretrained.eval()

    bindevaluator = BindEvaluator.load_from_checkpoint(
        config.guidance.classifier_checkpoint_path,
        n_layers=8,
        d_model=128,
        d_hidden=128,
        n_head=8,
        d_k=64,
        d_v=128,
        d_inner=64)

    samples = []
    for _ in tqdm(
            range(config.sampling.num_sample_batches),
            desc='Gen. batches', leave=False):
        sample = pretrained.sample(
            target_sequence=target_sequence,
            target_motifs=parse_motif(config.eval.target_motifs),
            classifier_model=bindevaluator)
        # print(f"Batch took {time.time() - start:.2f} seconds.")
        samples.extend(
            pretrained.tokenizer.batch_decode(sample))

    # Strip whitespace and the 5-character <bos>/<eos> special tokens.
    samples = [sample.replace(' ', '')[5:-5] for sample in samples]
    print(samples)


if __name__ == '__main__':
    main()
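For reference, `parse_motif` accepts both single indices and inclusive ranges:

```python
parse_motif('3-5, 8, 12')
# tensor([ 3,  4,  5,  8, 12])
```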
tokenizer.py ADDED
@@ -0,0 +1,279 @@
"""Custom Tokenization classes."""

import collections
import json
import os
import re
from typing import List, Optional, Tuple, Union

from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json'}
PRETRAINED_VOCAB_FILES_MAP = {
    'qm9': {
        'vocab_file': {
            'yairschiff/qm9-tokenizer': 'https://huggingface.co/yairschiff/qm9-tokenizer/resolve/main/vocab.json'
        }
    },
    'zinc250k': {
        'vocab_file': {
            'yairschiff/zinc250k-tokenizer': 'https://huggingface.co/yairschiff/zinc250k-tokenizer/resolve/main/vocab.json'
        }
    }
}


class SMILESTokenizer(PreTrainedTokenizer):
    r"""
    Construct a regex-based tokenizer for SMILES datasets.

    This tokenizer inherits from [`PreTrainedTokenizer`],
    which contains most of the main methods. Users should
    refer to this superclass for more information regarding
    those methods.

    Adapted from:
    https://huggingface.co/ibm/MoLFormer-XL-both-10pct

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token not in the vocabulary
            cannot be converted to an ID and is set to be
            this token instead.
        sep_token (`str`, *optional*, defaults to `"<eos>"`):
            The separator token, which is used when building
            a sequence from multiple sequences, e.g., two
            sequences for sequence classification or a
            text and a question for question answering.
            It is also used as the last token of a sequence
            built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example, when
            batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"<bos>"`):
            The classifier token, which is used when doing
            sequence classification (classification of the
            whole sequence instead of per-token
            classification). It is the first token of the
            sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the
            token used when training this model with masked
            language modeling; it is the token the model
            will try to predict.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token='<unk>',
        sep_token='<eos>',
        pad_token='<pad>',
        cls_token='<bos>',
        mask_token='<mask>',
        **kwargs,
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path "
                f"'{vocab_file}'.")
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            vocab_from_file = json.load(vocab_handle)
        # Re-index to account for special tokens
        self.vocab = {
            cls_token: 0,
            sep_token: 1,
            mask_token: 2,
            pad_token: 3,
            unk_token: 4,
            **{k: v + 5 for k, v in vocab_from_file.items()}
        }

        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        # Regex pattern taken from:
        # https://github.com/pschwllr/MolecularTransformer
        self.pattern = (
            r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
        )
        self.regex_tokenizer = re.compile(self.pattern)

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, **kwargs):
        split_tokens = self.regex_tokenizer.findall(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens into a single string."""
        out_string = "".join(tokens).strip()
        return out_string

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of
        sequences for sequence classification tasks by
        concatenating and adding special tokens.
        A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will
                be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence
                pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids)
            with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no
        special tokens added. This method is called when
        adding special tokens using the tokenizer
        `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether the token list is already formatted
                with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range
            [0, 1]: 1 for a special token, 0 for a sequence
            token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True)

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be
        used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns
        the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence
                pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(
        self, save_directory: str,
        filename_prefix: Optional[str] = None
    ) -> Union[Tuple[str], None]:
        if not os.path.isdir(save_directory):
            logger.error(
                f"Vocabulary path ({save_directory}) should "
                "be a directory.")
            return None
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(
                json.dumps(
                    self.vocab,
                    indent=2,
                    sort_keys=True,
                    ensure_ascii=False
                ) + "\n")
        return (vocab_file,)


class QM9Tokenizer(SMILESTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP['qm9']


class Zinc250kTokenizer(SMILESTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP['zinc250k']
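A minimal usage sketch, assuming a local `vocab.json` compatible with this class (e.g. the file hosted in the `yairschiff/qm9-tokenizer` repo):

```python
tokenizer = SMILESTokenizer(vocab_file='vocab.json')

# The regex keeps bracket atoms, two-letter halogens (Cl, Br), and
# ring-closure digits as single tokens.
print(tokenizer.tokenize('CC(=O)Oc1ccccc1C(=O)O'))
# ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c',
#  '1', 'C', '(', '=', 'O', ')', 'O']

ids = tokenizer('CC(=O)O')['input_ids']
print(ids[0], ids[-1])  # 0 1  -- wrapped in <bos> (id 0) and <eos> (id 1)
```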
uncond_sample.py ADDED
@@ -0,0 +1,116 @@
import os
import hydra
import lightning as L
import numpy as np
import omegaconf
import pandas as pd
import rdkit
import rich.syntax
import rich.tree
import torch
from tqdm.auto import tqdm
import pdb
import csv

import dataloader
import diffusion

rdkit.rdBase.DisableLog('rdApp.error')

omegaconf.OmegaConf.register_new_resolver(
    'cwd', os.getcwd)
omegaconf.OmegaConf.register_new_resolver(
    'device_count', torch.cuda.device_count)
omegaconf.OmegaConf.register_new_resolver(
    'eval', eval)
omegaconf.OmegaConf.register_new_resolver(
    'div_up', lambda x, y: (x + y - 1) // y)
omegaconf.OmegaConf.register_new_resolver(
    'if_then_else',
    lambda condition, x, y: x if condition else y)


def _print_config(
        config: omegaconf.DictConfig,
        resolve: bool = True) -> None:
    """Prints the content of a DictConfig as a tree structure using the Rich library.

    Args:
        config (DictConfig): Configuration composed by Hydra.
        resolve (bool): Whether to resolve reference fields of DictConfig.
    """
    style = 'dim'
    tree = rich.tree.Tree('CONFIG', style=style,
                          guide_style=style)

    fields = config.keys()
    for field in fields:
        branch = tree.add(field, style=style, guide_style=style)

        config_section = config.get(field)
        branch_content = str(config_section)
        if isinstance(config_section, omegaconf.DictConfig):
            branch_content = omegaconf.OmegaConf.to_yaml(
                config_section, resolve=resolve)

        branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
    rich.print(tree)


def parse_range(tgt_range: str) -> list:
    """Expands a comma-separated string of integers and inclusive
    ranges (e.g. '90-95, 100') into a flat list of integers."""
    parts = tgt_range.split(',')
    result = []

    for part in parts:
        part = part.strip()
        if '-' in part:
            start, end = map(int, part.split('-'))
            result.extend(range(start, end + 1))
        else:
            result.append(int(part))

    return result


@hydra.main(version_base=None, config_path='./configs',
            config_name='config')
def main(config: omegaconf.DictConfig) -> None:
    # Reproducibility
    L.seed_everything(config.seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False

    # _print_config(config, resolve=True)
    print(f"Checkpoint: {config.eval.checkpoint_path}")

    tokenizer = dataloader.get_tokenizer(config)

    pretrained = diffusion.Diffusion.load_from_checkpoint(
        config.eval.checkpoint_path,
        tokenizer=tokenizer,
        config=config, logger=False)
    pretrained.eval()

    target_lengths = parse_range(config.model.length_range)

    for length in target_lengths:
        config.model.length = length + 2
        samples = []
        for _ in tqdm(
                range(config.sampling.num_sample_batches),
                desc='Gen. batches', leave=False):
            sample = pretrained.sample()
            # print(f"Batch took {time.time() - start:.2f} seconds.")
            samples.extend(
                pretrained.tokenizer.batch_decode(sample))

        # print([sample.replace(' ', '')[5:-5] for sample in samples])

        samples = [sample.replace(' ', '')[5:-5] for sample in samples]
        print(samples)

        # df = pd.DataFrame(samples, columns=['sequence'])
        # df.to_csv(f'/home/tc415/discrete-diffusion-guidance/samples/{length}.csv', index=False)


if __name__ == '__main__':
    main()
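A note on the loop above: `config.model.length = length + 2` presumably reserves two positions for the `<bos>` and `<eos>` special tokens, which the `[5:-5]` slice then strips from each decoded string (both special tokens decode to five-character strings).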
utils.py ADDED
@@ -0,0 +1,86 @@
"""Console logger utilities.

Copied from https://github.com/HazyResearch/transformers/blob/master/src/utils/utils.py
Copied from https://docs.python.org/3/howto/logging-cookbook.html#using-a-context-manager-for-selective-logging
"""

import logging

import fsspec
import lightning
import torch
from timm.scheduler import CosineLRScheduler


def fsspec_exists(filename):
    """Check if a file exists using fsspec."""
    fs, _ = fsspec.core.url_to_fs(filename)
    return fs.exists(filename)


def fsspec_listdir(dirname):
    """Listdir in a manner compatible with fsspec."""
    fs, _ = fsspec.core.url_to_fs(dirname)
    return fs.ls(dirname)


def fsspec_mkdirs(dirname, exist_ok=True):
    """Mkdirs in a manner compatible with fsspec."""
    fs, _ = fsspec.core.url_to_fs(dirname)
    fs.makedirs(dirname, exist_ok=exist_ok)


def print_nans(tensor, name):
    if torch.isnan(tensor).any():
        print(name, tensor)


class CosineDecayWarmupLRScheduler(
        CosineLRScheduler,
        torch.optim.lr_scheduler._LRScheduler):
    """Wraps timm.scheduler.CosineLRScheduler.

    Enables calling scheduler.step() without passing in an epoch.
    Supports resuming as well.
    Adapted from:
    https://github.com/HazyResearch/hyena-dna/blob/main/src/utils/optim/schedulers.py
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._last_epoch = -1
        self.step(epoch=0)

    def step(self, epoch=None):
        if epoch is None:
            self._last_epoch += 1
        else:
            self._last_epoch = epoch
        # We call either step or step_update, depending on
        # whether we're using the scheduler every epoch or
        # every step. Otherwise, Lightning will always call
        # step (i.e., meant for each epoch), and if we set the
        # scheduler interval to "step", then the learning rate
        # update will be wrong.
        if self.t_in_epochs:
            super().step(epoch=self._last_epoch)
        else:
            super().step_update(num_updates=self._last_epoch)


def get_logger(name=__name__, level=logging.INFO) -> logging.Logger:
    """Initializes a multi-GPU-friendly python logger."""
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # This ensures all logging levels get marked with the rank-zero
    # decorator; otherwise logs would get multiplied for each GPU
    # process in a multi-GPU setup.
    for level_name in ('debug', 'info', 'warning', 'error',
                       'exception', 'fatal', 'critical'):
        setattr(logger,
                level_name,
                lightning.pytorch.utilities.rank_zero_only(
                    getattr(logger, level_name)))

    return logger
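A minimal sketch of how the scheduler wrapper above might be wired up outside of Lightning (all hyperparameters here are illustrative assumptions, not values used by the repo):

```python
import torch
from utils import CosineDecayWarmupLRScheduler

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = CosineDecayWarmupLRScheduler(
    optimizer,
    t_initial=10_000,    # total steps for the cosine decay
    warmup_t=500,        # linear warmup steps
    warmup_lr_init=1e-6,
    lr_min=1e-6,
    t_in_epochs=False)   # step-based: step() routes to step_update()

for _ in range(1_000):
    optimizer.step()
    scheduler.step()     # no epoch argument needed
```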