herilalaina committed
Commit b62776c
1 Parent(s): 3866017

update lcpfn

lcpfn/__init__.py CHANGED
@@ -1,53 +1,80 @@
 import os, sys
 
 sys.path.insert(0, os.path.dirname(__file__))
 
 
-model_path = 'trained_models'
 
 
 def prepare_models():
     pfns4bo_dir = os.path.dirname(__file__)
-    model_names = ['pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt',
-                   'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt']
 
     for name in model_names:
         weights_path = os.path.join(pfns4bo_dir, model_path, name)
-        compressed_weights_path = os.path.join(pfns4bo_dir, model_path, name + '.gz')
         if not os.path.exists(weights_path):
             if not os.path.exists(compressed_weights_path):
                 print("Downloading", os.path.abspath(compressed_weights_path))
                 import requests
-                url = f'https://github.com/automl/lcpfn/raw/main/lcpfn/trained_models/{name + ".gz"}'
                 r = requests.get(url, allow_redirects=True)
                 os.makedirs(os.path.dirname(compressed_weights_path), exist_ok=True)
-                with open(compressed_weights_path, 'wb') as f:
                     f.write(r.content)
             if os.path.exists(compressed_weights_path):
                 print("Unzipping", name)
                 os.system(f"gzip -dk {compressed_weights_path}")
             else:
                 print("Failed to find", compressed_weights_path)
-                print("Make sure you have an internet connection to download the model automatically..")
         if os.path.exists(weights_path):
             print("Successfully located model at", weights_path)
 
 
 model_dict = {
-    'EMSIZE512_NLAYERS12_NBUCKETS1000': os.path.join(os.path.dirname(__file__), model_path,
-                                                     'pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt'),
-    'EMSIZE512_NLAYERS6_NBUCKETS1000': os.path.join(os.path.dirname(__file__), model_path,
-                                                    'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt'),
 }
 
 
 def __getattr__(name):
     if name in model_dict:
         if not os.path.exists(model_dict[name]):
-            print("Can't find", os.path.abspath(model_dict[name]), "thus unzipping/downloading models now.")
             print("This might take a while..")
             prepare_models()
         return model_dict[name]
     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
 from lcpfn.model import LCPFN
 from lcpfn.train_lcpfn import train_lcpfn
-from lcpfn.domhan_prior import sample_from_prior, create_get_batch_func
 
 import os, sys
+
 sys.path.insert(0, os.path.dirname(__file__))
 
 
+model_path = "trained_models"
+
 
 def prepare_models():
     pfns4bo_dir = os.path.dirname(__file__)
+    model_names = [
+        "pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt",
+        "pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt",
+    ]
 
     for name in model_names:
         weights_path = os.path.join(pfns4bo_dir, model_path, name)
+        compressed_weights_path = os.path.join(pfns4bo_dir, model_path, name + ".gz")
         if not os.path.exists(weights_path):
             if not os.path.exists(compressed_weights_path):
                 print("Downloading", os.path.abspath(compressed_weights_path))
                 import requests
+
+                url = f'https://ml.informatik.uni-freiburg.de/research-artifacts/lcpfn/{name + ".gz"}'
                 r = requests.get(url, allow_redirects=True)
                 os.makedirs(os.path.dirname(compressed_weights_path), exist_ok=True)
+                with open(compressed_weights_path, "wb") as f:
                     f.write(r.content)
             if os.path.exists(compressed_weights_path):
                 print("Unzipping", name)
                 os.system(f"gzip -dk {compressed_weights_path}")
             else:
                 print("Failed to find", compressed_weights_path)
+                print(
+                    "Make sure you have an internet connection to download the model automatically.."
+                )
         if os.path.exists(weights_path):
             print("Successfully located model at", weights_path)
 
 
 model_dict = {
+    "EMSIZE512_NLAYERS12_NBUCKETS1000": os.path.join(
+        os.path.dirname(__file__),
+        model_path,
+        "pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt",
+    ),
+    "EMSIZE512_NLAYERS6_NBUCKETS1000": os.path.join(
+        os.path.dirname(__file__),
+        model_path,
+        "pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt",
+    ),
 }
 
 
 def __getattr__(name):
     if name in model_dict:
         if not os.path.exists(model_dict[name]):
+            print(
+                "Can't find",
+                os.path.abspath(model_dict[name]),
+                "thus unzipping/downloading models now.",
+            )
             print("This might take a while..")
             prepare_models()
         return model_dict[name]
     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
+
+from .version import __version__
 from lcpfn.model import LCPFN
 from lcpfn.train_lcpfn import train_lcpfn
+from lcpfn.domhan_prior import sample_from_prior, create_get_batch_func
+
+__all__ = [
+    "LCPFN",
+    "train_lcpfn",
+    "sample_from_prior",
+    "create_get_batch_func",
+    "__version__",
+]
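Review note: the `__getattr__` hook makes each checkpoint path a lazy module attribute, so the first access triggers `prepare_models()`. A minimal sketch, assuming the package is importable and the new download URL is reachable:

import lcpfn

# First access downloads and unzips the checkpoint if it is not cached yet,
# then returns the absolute path of the .pt file.
weights_path = lcpfn.EMSIZE512_NLAYERS12_NBUCKETS1000
print(weights_path)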
lcpfn/bar_distribution.py CHANGED
@@ -3,19 +3,25 @@ from torch import nn
 
 
 class BarDistribution(nn.Module):
-    def __init__(self, borders: torch.Tensor, smoothing=.0): # here borders should start with min and end with max, where all values lie in (min,max) and are sorted
         # sorted list of borders
         super().__init__()
         assert len(borders.shape) == 1
-        #self.borders = borders
-        self.register_buffer('borders', borders)
-        self.register_buffer('smoothing', torch.tensor(smoothing))
-        #self.bucket_widths = self.borders[1:] - self.borders[:-1]
-        self.register_buffer('bucket_widths', self.borders[1:] - self.borders[:-1])
         full_width = self.bucket_widths.sum()
         border_order = torch.argsort(borders)
-        assert (full_width - (self.borders[-1] - self.borders[0])).abs() < 1e-4, f'diff: {full_width - (self.borders[-1] - self.borders[0])}'
-        assert (border_order == torch.arange(len(borders)).to(border_order.device)).all(), "Please provide sorted borders!"
         self.num_bars = len(borders) - 1
 
     def map_to_bucket_idx(self, y):
@@ -24,28 +30,35 @@ class BarDistribution(nn.Module):
         target_sample[y == self.borders[-1]] = self.num_bars - 1
         return target_sample
 
-    def forward(self, logits, y): # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
         target_sample = self.map_to_bucket_idx(y)
-        assert (target_sample >= 0).all() and (target_sample < self.num_bars).all(), f'y {y} not in support set for borders (min_y, max_y) {self.borders}'
-        assert logits.shape[-1] == self.num_bars, f'{logits.shape[-1]} vs {self.num_bars}'
 
         bucket_log_probs = torch.log_softmax(logits, -1)
         scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)
-        #print(bucket_log_probs, logits.shape)
 
-        nll_loss = -scaled_bucket_log_probs.gather(-1,target_sample.unsqueeze(-1)).squeeze(-1)
 
         smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
-        smoothing = self.smoothing if self.training else 0.
-        loss = (1. - smoothing) * nll_loss + smoothing * smooth_loss
         return loss
 
     def mean(self, logits):
-        bucket_means = self.borders[:-1] + self.bucket_widths/2
         p = torch.softmax(logits, -1)
         return p @ bucket_means
 
-
     def icdf(self, logits, left_prob):
         """
         Implementation of the quantile function
@@ -55,22 +68,32 @@ class BarDistribution(nn.Module):
         """
         probs = logits.softmax(-1)
         cumprobs = torch.cumsum(probs, -1)
-        idx = torch.searchsorted(cumprobs, left_prob * torch.ones(*cumprobs.shape[:-1], 1, device = probs.device))\
-            .squeeze(-1).clamp(0, cumprobs.shape[-1] - 1) # this might not do the right for outliers
         cumprobs = torch.cat(
             [torch.zeros(*cumprobs.shape[:-1], 1, device=logits.device), cumprobs], -1
         )
 
         rest_prob = left_prob - cumprobs.gather(-1, idx[..., None]).squeeze(-1)
         left_border = self.borders[idx]
-        right_border = self.borders[idx+1]
-        return left_border + (right_border - left_border) * rest_prob / probs.gather(-1, idx[..., None]).squeeze(-1)
-
-    def quantile(self, logits, center_prob=.682):
-        side_probs = (1.-center_prob)/2
-        return torch.stack((self.icdf(logits, side_probs), self.icdf(logits, 1.-side_probs)),-1)
 
-    def ucb(self, logits, best_f, rest_prob=(1-.682)/2, maximize=True):
         """
         UCB utility. Rest Prob is the amount of utility above (below) the confidence interval that is ignored.
         Higher rest_prob is equivalent to lower beta in the standard GP-UCB formulation.
@@ -90,23 +113,41 @@ class BarDistribution(nn.Module):
 
     def mode(self, logits):
         mode_inds = logits.argmax(-1)
-        bucket_means = self.borders[:-1] + self.bucket_widths/2
         return bucket_means[mode_inds]
 
-    def ei(self, logits, best_f, maximize=True): # logits: evaluation_points x batch x feature_dim
-        bucket_means = self.borders[:-1] + self.bucket_widths/2
         if maximize:
             bucket_contributions = torch.tensor(
-                [max((bucket_max + max(bucket_min, best_f)) / 2 - best_f,0) for
-                 bucket_min, bucket_max, bucket_mean in zip(self.borders[:-1], self.borders[1:], bucket_means)], dtype=logits.dtype, device=logits.device)
         else:
             bucket_contributions = torch.tensor(
-                [-min((min(bucket_max,best_f) + bucket_min) / 2 - best_f,0) for # min on max instead of max on min, and compare min < instead of max >
-                 bucket_min, bucket_max, bucket_mean in zip(self.borders[:-1], self.borders[1:], bucket_means)], dtype=logits.dtype, device=logits.device)
         p = torch.softmax(logits, -1)
         return p @ bucket_contributions
 
-    def pi(self, logits, best_f, maximize=True):# logits: evaluation_points x batch x feature_dim
         """
         Acquisition Function: Probability of Improvement
         :param logits: as returned by Transformer
@@ -117,10 +158,9 @@ class BarDistribution(nn.Module):
         assert maximize is True
         p = torch.softmax(logits, -1)
         border_widths = self.borders[1:] - self.borders[:-1]
-        factor = 1. - ((best_f - self.borders[:-1]) / border_widths).clamp(0., 1.)
         return (p * factor).sum(-1)
 
-
     def mean_of_square(self, logits):
         """
         Computes E[x^2].
@@ -128,7 +168,11 @@ class BarDistribution(nn.Module):
         """
         left_borders = self.borders[:-1]
         right_borders = self.borders[1:]
-        bucket_mean_of_square = (left_borders.square() + right_borders.square() + left_borders*right_borders)/3.
         p = torch.softmax(logits, -1)
         return p @ bucket_mean_of_square
 
@@ -138,54 +182,74 @@ class BarDistribution(nn.Module):
 
 class FullSupportBarDistribution(BarDistribution):
     @staticmethod
-    def halfnormal_with_p_weight_before(range_max,p=.5):
-        s = range_max / torch.distributions.HalfNormal(torch.tensor(1.)).icdf(torch.tensor(p))
         return torch.distributions.HalfNormal(s)
 
-    def forward(self, logits, y): # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
         assert self.num_bars > 1
         target_sample = self.map_to_bucket_idx(y)
-        target_sample.clamp_(0,self.num_bars-1)
         assert logits.shape[-1] == self.num_bars
 
         bucket_log_probs = torch.log_softmax(logits, -1)
         scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)
-        #print(bucket_log_probs, logits.shape)
-        log_probs = scaled_bucket_log_probs.gather(-1,target_sample.unsqueeze(-1)).squeeze(-1)
-
-        side_normals = (self.halfnormal_with_p_weight_before(self.bucket_widths[0]), self.halfnormal_with_p_weight_before(self.bucket_widths[-1]))
-
 
         # TODO look over it again
-        log_probs[target_sample == 0] += side_normals[0].log_prob((self.borders[1]-y[target_sample == 0]).clamp(min=.00000001)) + torch.log(self.bucket_widths[0])
-        log_probs[target_sample == self.num_bars-1] += side_normals[1].log_prob(y[target_sample == self.num_bars-1]-self.borders[-2]) + torch.log(self.bucket_widths[-1])
 
         nll_loss = -log_probs
 
         smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
-        smoothing = self.smoothing if self.training else 0.
-        loss = (1. - smoothing) * nll_loss + smoothing * smooth_loss
-
         return loss
 
     def mean(self, logits):
         bucket_means = self.borders[:-1] + self.bucket_widths / 2
         p = torch.softmax(logits, -1)
-        side_normals = (self.halfnormal_with_p_weight_before(self.bucket_widths[0]),
-                        self.halfnormal_with_p_weight_before(self.bucket_widths[-1]))
         bucket_means[0] = -side_normals[0].mean + self.borders[1]
         bucket_means[-1] = side_normals[1].mean + self.borders[-2]
         return p @ bucket_means
 
 
-
-def get_bucket_limits_(num_outputs:int, full_range:tuple=None, ys:torch.Tensor=None, verbose:bool=False):
     assert (ys is not None) or (full_range is not None)
     if ys is not None:
         ys = ys.flatten()
-        if len(ys) % num_outputs: ys = ys[:-(len(ys) % num_outputs)]
-        print(f'Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys.')
         ys_per_bucket = len(ys) // num_outputs
         if full_range is None:
             full_range = (ys.min(), ys.max())
@@ -193,17 +257,34 @@ def get_bucket_limits_(num_outputs:int, full_range:tuple=None, ys:torch.Tensor=N
         assert full_range[0] <= ys.min() and full_range[1] >= ys.max()
         full_range = torch.tensor(full_range)
         ys_sorted, ys_order = ys.sort(0)
-        bucket_limits = (ys_sorted[ys_per_bucket-1::ys_per_bucket][:-1]+ys_sorted[ys_per_bucket::ys_per_bucket])/2
         if verbose:
-            print(f'Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys.')
             print(full_range)
-        bucket_limits = torch.cat([full_range[0].unsqueeze(0), bucket_limits, full_range[1].unsqueeze(0)],0)
 
     else:
         class_width = (full_range[1] - full_range[0]) / num_outputs
-        bucket_limits = torch.cat([full_range[0] + torch.arange(num_outputs).float()*class_width, torch.tensor(full_range[1]).unsqueeze(0)], 0)
 
-    assert len(bucket_limits) - 1 == num_outputs and full_range[0] == bucket_limits[0] and full_range[-1] == bucket_limits[-1]
     return bucket_limits
 
@@ -266,4 +347,3 @@ def get_bucket_limits(
     ), f"{full_range[-1]} != {bucket_limits[-1]}"
 
     return bucket_limits
-
 
 
 
 class BarDistribution(nn.Module):
+    def __init__(
+        self, borders: torch.Tensor, smoothing=0.0
+    ):  # here borders should start with min and end with max, where all values lie in (min,max) and are sorted
         # sorted list of borders
         super().__init__()
         assert len(borders.shape) == 1
+        # self.borders = borders
+        self.register_buffer("borders", borders)
+        self.register_buffer("smoothing", torch.tensor(smoothing))
+        # self.bucket_widths = self.borders[1:] - self.borders[:-1]
+        self.register_buffer("bucket_widths", self.borders[1:] - self.borders[:-1])
         full_width = self.bucket_widths.sum()
         border_order = torch.argsort(borders)
+        assert (
+            full_width - (self.borders[-1] - self.borders[0])
+        ).abs() < 1e-4, f"diff: {full_width - (self.borders[-1] - self.borders[0])}"
+        assert (
+            border_order == torch.arange(len(borders)).to(border_order.device)
+        ).all(), "Please provide sorted borders!"
         self.num_bars = len(borders) - 1
 
     def map_to_bucket_idx(self, y):
 
         target_sample[y == self.borders[-1]] = self.num_bars - 1
         return target_sample
 
+    def forward(
+        self, logits, y
+    ):  # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
         target_sample = self.map_to_bucket_idx(y)
+        assert (target_sample >= 0).all() and (
+            target_sample < self.num_bars
+        ).all(), f"y {y} not in support set for borders (min_y, max_y) {self.borders}"
+        assert (
+            logits.shape[-1] == self.num_bars
+        ), f"{logits.shape[-1]} vs {self.num_bars}"
 
         bucket_log_probs = torch.log_softmax(logits, -1)
         scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)
+        # print(bucket_log_probs, logits.shape)
 
+        nll_loss = -scaled_bucket_log_probs.gather(
+            -1, target_sample.unsqueeze(-1)
+        ).squeeze(-1)
 
         smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
+        smoothing = self.smoothing if self.training else 0.0
+        loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
         return loss
 
     def mean(self, logits):
+        bucket_means = self.borders[:-1] + self.bucket_widths / 2
         p = torch.softmax(logits, -1)
         return p @ bucket_means
 
     def icdf(self, logits, left_prob):
         """
         Implementation of the quantile function
 
         """
         probs = logits.softmax(-1)
         cumprobs = torch.cumsum(probs, -1)
+        idx = (
+            torch.searchsorted(
+                cumprobs,
+                left_prob * torch.ones(*cumprobs.shape[:-1], 1, device=probs.device),
+            )
+            .squeeze(-1)
+            .clamp(0, cumprobs.shape[-1] - 1)
+        )  # this might not do the right for outliers
         cumprobs = torch.cat(
             [torch.zeros(*cumprobs.shape[:-1], 1, device=logits.device), cumprobs], -1
         )
 
         rest_prob = left_prob - cumprobs.gather(-1, idx[..., None]).squeeze(-1)
         left_border = self.borders[idx]
+        right_border = self.borders[idx + 1]
+        return left_border + (right_border - left_border) * rest_prob / probs.gather(
+            -1, idx[..., None]
+        ).squeeze(-1)
+
+    def quantile(self, logits, center_prob=0.682):
+        side_probs = (1.0 - center_prob) / 2
+        return torch.stack(
+            (self.icdf(logits, side_probs), self.icdf(logits, 1.0 - side_probs)), -1
+        )
 
+    def ucb(self, logits, best_f, rest_prob=(1 - 0.682) / 2, maximize=True):
         """
         UCB utility. Rest Prob is the amount of utility above (below) the confidence interval that is ignored.
         Higher rest_prob is equivalent to lower beta in the standard GP-UCB formulation.
 
     def mode(self, logits):
         mode_inds = logits.argmax(-1)
+        bucket_means = self.borders[:-1] + self.bucket_widths / 2
         return bucket_means[mode_inds]
 
+    def ei(
+        self, logits, best_f, maximize=True
+    ):  # logits: evaluation_points x batch x feature_dim
+        bucket_means = self.borders[:-1] + self.bucket_widths / 2
         if maximize:
             bucket_contributions = torch.tensor(
+                [
+                    max((bucket_max + max(bucket_min, best_f)) / 2 - best_f, 0)
+                    for bucket_min, bucket_max, bucket_mean in zip(
+                        self.borders[:-1], self.borders[1:], bucket_means
+                    )
+                ],
+                dtype=logits.dtype,
+                device=logits.device,
+            )
         else:
             bucket_contributions = torch.tensor(
+                [
+                    -min((min(bucket_max, best_f) + bucket_min) / 2 - best_f, 0)
+                    for bucket_min, bucket_max, bucket_mean in zip(  # min on max instead of max on min, and compare min < instead of max >
+                        self.borders[:-1], self.borders[1:], bucket_means
+                    )
+                ],
+                dtype=logits.dtype,
+                device=logits.device,
+            )
         p = torch.softmax(logits, -1)
         return p @ bucket_contributions
 
+    def pi(
+        self, logits, best_f, maximize=True
+    ):  # logits: evaluation_points x batch x feature_dim
         """
         Acquisition Function: Probability of Improvement
         :param logits: as returned by Transformer
 
         assert maximize is True
         p = torch.softmax(logits, -1)
         border_widths = self.borders[1:] - self.borders[:-1]
+        factor = 1.0 - ((best_f - self.borders[:-1]) / border_widths).clamp(0.0, 1.0)
         return (p * factor).sum(-1)
 
     def mean_of_square(self, logits):
         """
         Computes E[x^2].
 
         """
         left_borders = self.borders[:-1]
         right_borders = self.borders[1:]
+        bucket_mean_of_square = (
+            left_borders.square()
+            + right_borders.square()
+            + left_borders * right_borders
+        ) / 3.0
         p = torch.softmax(logits, -1)
         return p @ bucket_mean_of_square
 
 
 class FullSupportBarDistribution(BarDistribution):
     @staticmethod
+    def halfnormal_with_p_weight_before(range_max, p=0.5):
+        s = range_max / torch.distributions.HalfNormal(torch.tensor(1.0)).icdf(
+            torch.tensor(p)
+        )
         return torch.distributions.HalfNormal(s)
 
+    def forward(
+        self, logits, y
+    ):  # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
         assert self.num_bars > 1
         target_sample = self.map_to_bucket_idx(y)
+        target_sample.clamp_(0, self.num_bars - 1)
         assert logits.shape[-1] == self.num_bars
 
         bucket_log_probs = torch.log_softmax(logits, -1)
         scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)
+        # print(bucket_log_probs, logits.shape)
+        log_probs = scaled_bucket_log_probs.gather(
+            -1, target_sample.unsqueeze(-1)
+        ).squeeze(-1)
+
+        side_normals = (
+            self.halfnormal_with_p_weight_before(self.bucket_widths[0]),
+            self.halfnormal_with_p_weight_before(self.bucket_widths[-1]),
+        )
 
         # TODO look over it again
+        log_probs[target_sample == 0] += side_normals[0].log_prob(
+            (self.borders[1] - y[target_sample == 0]).clamp(min=0.00000001)
+        ) + torch.log(self.bucket_widths[0])
+        log_probs[target_sample == self.num_bars - 1] += side_normals[1].log_prob(
+            y[target_sample == self.num_bars - 1] - self.borders[-2]
+        ) + torch.log(self.bucket_widths[-1])
 
         nll_loss = -log_probs
 
         smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
+        smoothing = self.smoothing if self.training else 0.0
+        loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
 
         return loss
 
     def mean(self, logits):
         bucket_means = self.borders[:-1] + self.bucket_widths / 2
         p = torch.softmax(logits, -1)
+        side_normals = (
+            self.halfnormal_with_p_weight_before(self.bucket_widths[0]),
+            self.halfnormal_with_p_weight_before(self.bucket_widths[-1]),
+        )
         bucket_means[0] = -side_normals[0].mean + self.borders[1]
         bucket_means[-1] = side_normals[1].mean + self.borders[-2]
         return p @ bucket_means
 
 
+def get_bucket_limits_(
+    num_outputs: int,
+    full_range: tuple = None,
+    ys: torch.Tensor = None,
+    verbose: bool = False,
+):
     assert (ys is not None) or (full_range is not None)
     if ys is not None:
         ys = ys.flatten()
+        if len(ys) % num_outputs:
+            ys = ys[: -(len(ys) % num_outputs)]
+            print(
+                f"Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys."
+            )
         ys_per_bucket = len(ys) // num_outputs
         if full_range is None:
             full_range = (ys.min(), ys.max())
 
         assert full_range[0] <= ys.min() and full_range[1] >= ys.max()
         full_range = torch.tensor(full_range)
         ys_sorted, ys_order = ys.sort(0)
+        bucket_limits = (
+            ys_sorted[ys_per_bucket - 1 :: ys_per_bucket][:-1]
+            + ys_sorted[ys_per_bucket::ys_per_bucket]
+        ) / 2
         if verbose:
+            print(
+                f"Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys."
+            )
             print(full_range)
+        bucket_limits = torch.cat(
+            [full_range[0].unsqueeze(0), bucket_limits, full_range[1].unsqueeze(0)], 0
+        )
 
     else:
         class_width = (full_range[1] - full_range[0]) / num_outputs
+        bucket_limits = torch.cat(
+            [
+                full_range[0] + torch.arange(num_outputs).float() * class_width,
+                torch.tensor(full_range[1]).unsqueeze(0),
+            ],
+            0,
+        )
 
+    assert (
+        len(bucket_limits) - 1 == num_outputs
+        and full_range[0] == bucket_limits[0]
+        and full_range[-1] == bucket_limits[-1]
+    )
     return bucket_limits
 
 
     ), f"{full_range[-1]} != {bucket_limits[-1]}"
 
     return bucket_limits
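For orientation, a small sketch of driving `BarDistribution` by hand; the borders, shapes, and values below are illustrative assumptions, not part of this commit:

import torch
from lcpfn.bar_distribution import BarDistribution

borders = torch.linspace(0.0, 1.0, 11)  # 10 equal-width buckets on [0, 1]
dist = BarDistribution(borders)

logits = torch.randn(5, 2, 10)  # T x B x num_bars, matching the forward() comment
y = torch.rand(5, 2)            # targets inside (0, 1)

nll = dist(logits, y)           # per-position negative log density
mean = dist.mean(logits)        # T x B expected values
bands = dist.quantile(logits)   # T x B x 2 central ~68% interval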
 
lcpfn/decoders.py CHANGED
@@ -2,6 +2,14 @@ import torch
 from torch import nn
 import random
 
 
 class ScaledDecoder(nn.Module):
     def __init__(self, ninp, nhid, nout):
@@ -11,20 +19,24 @@ class ScaledDecoder(nn.Module):
         self.linear2 = nn.Linear(nhid, 10)
 
     def forward(self, x):
-        #return torch.cat([self.linear1(x), self.linear2(x)], -1)
         x = self.linear(x)
-        x = nn.GELU()(x)
-        temps = self.linear2(x).softmax(-1) @ torch.tensor([1.,1.4,1.7,2.,5.,10.,20.,40.,80.,160.], device=x.device)
-        if random.random() > .99:
-            print(temps.shape,temps[:,:2])
         return self.linear1(x) / temps.unsqueeze(-1)
 
 class FixedScaledDecoder(nn.Module):
     def __init__(self, ninp, nhid, nout):
         super().__init__()
-        self.mapper = nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, nout))
-        self.T = nn.Parameter(torch.ones(10000)/10000)
 
     def forward(self, x):
-        return self.mapper(x)/self.T.sum()
-
 
 from torch import nn
 import random
 
+from torch import Tensor
+import torch.nn.functional as F
+
+
+class GELU(nn.Module):
+    def forward(self, input: Tensor) -> Tensor:
+        return F.gelu(input)
+
 
 class ScaledDecoder(nn.Module):
     def __init__(self, ninp, nhid, nout):
 
         self.linear2 = nn.Linear(nhid, 10)
 
     def forward(self, x):
+        # return torch.cat([self.linear1(x), self.linear2(x)], -1)
         x = self.linear(x)
+        x = GELU()(x)
+        temps = self.linear2(x).softmax(-1) @ torch.tensor(
+            [1.0, 1.4, 1.7, 2.0, 5.0, 10.0, 20.0, 40.0, 80.0, 160.0], device=x.device
+        )
+        if random.random() > 0.99:
+            print(temps.shape, temps[:, :2])
         return self.linear1(x) / temps.unsqueeze(-1)
 
+
 class FixedScaledDecoder(nn.Module):
     def __init__(self, ninp, nhid, nout):
         super().__init__()
+        self.mapper = nn.Sequential(
+            nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, nout)
+        )
+        self.T = nn.Parameter(torch.ones(10000) / 10000)
 
     def forward(self, x):
+        return self.mapper(x) / self.T.sum()
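A quick shape check for the reformatted decoder; the sizes are arbitrary and only assume the constructor signature shown above:

import torch
from lcpfn.decoders import FixedScaledDecoder

dec = FixedScaledDecoder(ninp=512, nhid=1024, nout=1000)
out = dec(torch.randn(8, 512))  # logits divided by the learned temperature sum
print(out.shape)                # torch.Size([8, 1000])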
 
lcpfn/domhan_prior.py CHANGED
@@ -58,7 +58,10 @@ def prior_weights(
 
 def sample_from_prior(rng, seq_len=100):
     return sample_prior_comb(
-        rng=rng, seq_len=seq_len, components=["pow3", "ilog2", "janoschek"], distribution="peaked"
     )
 
 
@@ -103,7 +106,7 @@ def sample_prior_comb(
         f_priors = {
             "pow3": uniform_prior_pow3,
             "ilog2": uniform_prior_ilog2,
-            "janoschek": uniform_prior_janoschek
         }
     else:
         raise NotImplemented()
@@ -153,6 +156,7 @@ def generate_prior_dataset(n, prior=sample_prior_comb, seed=42):
 def create_get_batch_func(prior):
     return partial(get_batch_domhan, prior=prior)
 
 # function producing batches for PFN training
 def get_batch_domhan(
     batch_size,
@@ -192,4 +196,4 @@ def get_batch_domhan(
     y_target = y_target.float()
     y_noisy = y_noisy.float()
 
-    return x, y_noisy, y_target
 
 def sample_from_prior(rng, seq_len=100):
     return sample_prior_comb(
+        rng=rng,
+        seq_len=seq_len,
+        components=["pow3", "ilog2", "janoschek"],
+        distribution="peaked",
     )
 
 
         f_priors = {
             "pow3": uniform_prior_pow3,
             "ilog2": uniform_prior_ilog2,
+            "janoschek": uniform_prior_janoschek,
         }
     else:
         raise NotImplemented()
 
 def create_get_batch_func(prior):
     return partial(get_batch_domhan, prior=prior)
 
+
 # function producing batches for PFN training
 def get_batch_domhan(
     batch_size,
 
     y_target = y_target.float()
     y_noisy = y_noisy.float()
 
+    return x, y_noisy, y_target
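For context, `sample_from_prior` draws a synthetic learning curve from the pow3/ilog2/janoschek mixture. A hedged sketch (the seed is arbitrary, and the exact return structure should be checked against `sample_prior_comb` before unpacking it):

import numpy as np
from lcpfn.domhan_prior import sample_from_prior

rng = np.random.RandomState(0)
sample = sample_from_prior(rng, seq_len=100)
print(type(sample))  # inspect before assuming a particular (curve, params) layout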
lcpfn/encoders.py CHANGED
@@ -18,34 +18,45 @@ class StyleEncoder(nn.Module):
 
 
 class _PositionalEncoding(nn.Module):
-    def __init__(self, d_model, dropout=0.):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
         self.d_model = d_model
-        self.device_test_tensor = nn.Parameter(torch.tensor(1.))
 
-    def forward(self, x):# T x B x num_features
-        assert self.d_model % x.shape[-1]*2 == 0
         d_per_feature = self.d_model // x.shape[-1]
         pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
-        #position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
         interval_size = 10
-        div_term = (1./interval_size) * 2*math.pi*torch.exp(torch.arange(0, d_per_feature, 2, device=self.device_test_tensor.device).float()*math.log(math.sqrt(2)))
-        #print(div_term/2/math.pi)
         pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
         pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
-        return self.dropout(pe).view(x.shape[0],x.shape[1],self.d_model)
 
 
 Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)
 
 class EmbeddingEncoder(nn.Module):
     def __init__(self, num_features, em_size, num_embs=100):
         super().__init__()
         self.num_embs = num_embs
         self.embeddings = nn.Embedding(num_embs * num_features, em_size, max_norm=True)
-        self.init_weights(.1)
-        self.min_max = (-2,+2)
 
     @property
     def width(self):
@@ -60,7 +71,9 @@ class EmbeddingEncoder(nn.Module):
 
     def forward(self, x):  # T x B x num_features
         x_idxs = self.discretize(x)
-        x_idxs += torch.arange(x.shape[-1], device=x.device).view(1, 1, -1) * self.num_embs
         # print(x_idxs,self.embeddings.weight.shape)
         return self.embeddings(x_idxs).mean(-2)
 
@@ -72,7 +85,7 @@ class Normalize(nn.Module):
         self.std = std
 
     def forward(self, x):
-        return (x-self.mean)/self.std
 
 
 def get_normalized_uniform_encoder(encoder_creator):
@@ -83,13 +96,16 @@ def get_normalized_uniform_encoder(encoder_creator):
     :param encoder:
     :return:
     """
-    return lambda in_dim, out_dim: nn.Sequential(Normalize(.5, math.sqrt(1/12)), encoder_creator(in_dim, out_dim))
 
 
 Linear = nn.Linear
-MLP = lambda num_features, emsize: nn.Sequential(nn.Linear(num_features+1,emsize*2),
-                                                 nn.ReLU(),
-                                                 nn.Linear(emsize*2,emsize))
 
 class NanHandlingEncoder(nn.Module):
     def __init__(self, num_features, emsize, keep_nans=True):
@@ -101,10 +117,17 @@ class NanHandlingEncoder(nn.Module):
 
     def forward(self, x):
         if self.keep_nans:
-            x = torch.cat([torch.nan_to_num(x, nan=0.0), normalize_data(torch.isnan(x) * -1
-                                                                        + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
-                                                                        + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2
-                                                                        )], -1)
         else:
             x = torch.nan_to_num(x, nan=0.0)
         return self.layer(x)
@@ -124,24 +147,28 @@ class Linear(nn.Linear):
 class Conv(nn.Module):
     def __init__(self, input_size, emsize):
         super().__init__()
-        self.convs = torch.nn.ModuleList([nn.Conv2d(64 if i else 1, 64, 3) for i in range(5)])
-        self.linear = nn.Linear(64,emsize)
 
     def forward(self, x):
         size = math.isqrt(x.shape[-1])
-        assert size*size == x.shape[-1]
         x = x.reshape(*x.shape[:-1], 1, size, size)
         for conv in self.convs:
             if x.shape[-1] < 4:
                 break
             x = conv(x)
             x.relu_()
-        x = nn.AdaptiveAvgPool2d((1,1))(x).squeeze(-1).squeeze(-1)
         return self.linear(x)
 
 
 class CanEmb(nn.Embedding):
-    def __init__(self, num_features, num_embeddings: int, embedding_dim: int, *args, **kwargs):
         assert embedding_dim % num_features == 0
         embedding_dim = embedding_dim // num_features
         super().__init__(num_embeddings, embedding_dim, *args, **kwargs)
@@ -158,4 +185,6 @@ def get_Canonical(num_classes):
 
 
 def get_Embedding(num_embs_per_feature=100):
-    return lambda num_features, emsize: EmbeddingEncoder(num_features, emsize, num_embs=num_embs_per_feature)
 
 
 class _PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.0):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
         self.d_model = d_model
+        self.device_test_tensor = nn.Parameter(torch.tensor(1.0))
 
+    def forward(self, x):  # T x B x num_features
+        assert self.d_model % x.shape[-1] * 2 == 0
         d_per_feature = self.d_model // x.shape[-1]
         pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
+        # position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
         interval_size = 10
+        div_term = (
+            (1.0 / interval_size)
+            * 2
+            * math.pi
+            * torch.exp(
+                torch.arange(
+                    0, d_per_feature, 2, device=self.device_test_tensor.device
+                ).float()
+                * math.log(math.sqrt(2))
+            )
+        )
+        # print(div_term/2/math.pi)
         pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
         pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
+        return self.dropout(pe).view(x.shape[0], x.shape[1], self.d_model)
 
 
 Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)
 
+
 class EmbeddingEncoder(nn.Module):
     def __init__(self, num_features, em_size, num_embs=100):
         super().__init__()
         self.num_embs = num_embs
         self.embeddings = nn.Embedding(num_embs * num_features, em_size, max_norm=True)
+        self.init_weights(0.1)
+        self.min_max = (-2, +2)
 
     @property
     def width(self):
 
     def forward(self, x):  # T x B x num_features
         x_idxs = self.discretize(x)
+        x_idxs += (
+            torch.arange(x.shape[-1], device=x.device).view(1, 1, -1) * self.num_embs
+        )
         # print(x_idxs,self.embeddings.weight.shape)
         return self.embeddings(x_idxs).mean(-2)
 
         self.std = std
 
     def forward(self, x):
+        return (x - self.mean) / self.std
 
 
 def get_normalized_uniform_encoder(encoder_creator):
 
     :param encoder:
     :return:
     """
+    return lambda in_dim, out_dim: nn.Sequential(
+        Normalize(0.5, math.sqrt(1 / 12)), encoder_creator(in_dim, out_dim)
+    )
 
 
 Linear = nn.Linear
+MLP = lambda num_features, emsize: nn.Sequential(
+    nn.Linear(num_features + 1, emsize * 2), nn.ReLU(), nn.Linear(emsize * 2, emsize)
+)
+
 
 class NanHandlingEncoder(nn.Module):
     def __init__(self, num_features, emsize, keep_nans=True):
 
     def forward(self, x):
         if self.keep_nans:
+            x = torch.cat(
+                [
+                    torch.nan_to_num(x, nan=0.0),
+                    normalize_data(
+                        torch.isnan(x) * -1
+                        + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
+                        + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2
+                    ),
+                ],
+                -1,
+            )
         else:
             x = torch.nan_to_num(x, nan=0.0)
         return self.layer(x)
 
 class Conv(nn.Module):
     def __init__(self, input_size, emsize):
         super().__init__()
+        self.convs = torch.nn.ModuleList(
+            [nn.Conv2d(64 if i else 1, 64, 3) for i in range(5)]
+        )
+        self.linear = nn.Linear(64, emsize)
 
     def forward(self, x):
         size = math.isqrt(x.shape[-1])
+        assert size * size == x.shape[-1]
         x = x.reshape(*x.shape[:-1], 1, size, size)
         for conv in self.convs:
             if x.shape[-1] < 4:
                 break
             x = conv(x)
             x.relu_()
+        x = nn.AdaptiveAvgPool2d((1, 1))(x).squeeze(-1).squeeze(-1)
         return self.linear(x)
 
 
 class CanEmb(nn.Embedding):
+    def __init__(
+        self, num_features, num_embeddings: int, embedding_dim: int, *args, **kwargs
+    ):
         assert embedding_dim % num_features == 0
         embedding_dim = embedding_dim // num_features
         super().__init__(num_embeddings, embedding_dim, *args, **kwargs)
 
 
 def get_Embedding(num_embs_per_feature=100):
+    return lambda num_features, emsize: EmbeddingEncoder(
+        num_features, emsize, num_embs=num_embs_per_feature
+    )
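As a usage sketch for the encoder factories (dimensions are illustrative assumptions):

import torch
from lcpfn import encoders

# Shift/scale uniform [0, 1] inputs to zero mean and unit variance before encoding.
make_encoder = encoders.get_normalized_uniform_encoder(encoders.Linear)
enc = make_encoder(1, 512)        # in_dim=1 feature, out_dim=512 embedding
out = enc(torch.rand(100, 8, 1))  # T x B x num_features -> T x B x 512
print(out.shape)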
lcpfn/initializers.py CHANGED
@@ -1,9 +1,11 @@
 import torch
 from torch import nn
 
 def get_NormalInitializer(std):
     def initializer(m):
         if isinstance(m, nn.Linear):
             nn.init.normal_(m.weight, 0, std)
             nn.init.normal_(m.bias, 0, std)
-    return initializer
 import torch
 from torch import nn
 
+
 def get_NormalInitializer(std):
     def initializer(m):
         if isinstance(m, nn.Linear):
             nn.init.normal_(m.weight, 0, std)
             nn.init.normal_(m.bias, 0, std)
+
+    return initializer
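The factory is meant for `nn.Module.apply`; a minimal sketch (the std value is an arbitrary choice):

import torch.nn as nn
from lcpfn.initializers import get_NormalInitializer

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
model.apply(get_NormalInitializer(std=0.02))  # re-draws every nn.Linear weight and bias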
lcpfn/layer.py CHANGED
@@ -36,15 +36,28 @@ class TransformerEncoderLayer(nn.Module):
     >>> src = torch.rand(32, 10, 512)
     >>> out = encoder_layer(src)
     """
-    __constants__ = ['batch_first']
 
-    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
-                 layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
-                 device=None, dtype=None, recompute_attn=False) -> None:
-        factory_kwargs = {'device': device, 'dtype': dtype}
         super().__init__()
-        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
-                                            **factory_kwargs)
         # Implementation of Feedforward model
         self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
         self.dropout = Dropout(dropout)
@@ -60,11 +73,16 @@ class TransformerEncoderLayer(nn.Module):
         self.activation = _get_activation_fn(activation)
 
     def __setstate__(self, state):
-        if 'activation' not in state:
-            state['activation'] = F.relu
         super().__setstate__(state)
 
-    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
         r"""Pass the input through the encoder layer.
 
         Args:
@@ -90,26 +108,61 @@ class TransformerEncoderLayer(nn.Module):
             num_train_tokens = trainset_src_mask.shape[0]
 
             global_tokens_src = src_[:num_global_tokens]
-            train_tokens_src = src_[num_global_tokens:num_global_tokens+num_train_tokens]
-            global_and_train_tokens_src = src_[:num_global_tokens+num_train_tokens]
-            eval_tokens_src = src_[num_global_tokens+num_train_tokens:]
-
-            attn = partial(checkpoint, self.self_attn) if self.recompute_attn else self.self_attn
-
-            global_tokens_src2 = attn(global_tokens_src, global_and_train_tokens_src, global_and_train_tokens_src, None, True, global_src_mask)[0]
-            train_tokens_src2 = attn(train_tokens_src, global_tokens_src, global_tokens_src, None, True, trainset_src_mask)[0]
-            eval_tokens_src2 = attn(eval_tokens_src, src_, src_,
-                                    None, True, valset_src_mask)[0]
-
-            src2 = torch.cat([global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0)
 
         else:
             if self.recompute_attn:
-                src2 = checkpoint(self.self_attn, src_, src_, src_, src_key_padding_mask, True, src_mask)[0]
             else:
-                src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask,
-                                      key_padding_mask=src_key_padding_mask)[0]
         src = src + self.dropout1(src2)
         if not self.pre_norm:
             src = self.norm1(src)
@@ -123,4 +176,4 @@ class TransformerEncoderLayer(nn.Module):
 
         if not self.pre_norm:
             src = self.norm2(src)
-        return src
     >>> src = torch.rand(32, 10, 512)
     >>> out = encoder_layer(src)
     """
 
+    __constants__ = ["batch_first"]
+
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        layer_norm_eps=1e-5,
+        batch_first=False,
+        pre_norm=False,
+        device=None,
+        dtype=None,
+        recompute_attn=False,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
+        self.self_attn = MultiheadAttention(
+            d_model, nhead, dropout=dropout, batch_first=batch_first, **factory_kwargs
+        )
         # Implementation of Feedforward model
         self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
         self.dropout = Dropout(dropout)
 
         self.activation = _get_activation_fn(activation)
 
     def __setstate__(self, state):
+        if "activation" not in state:
+            state["activation"] = F.relu
         super().__setstate__(state)
 
+    def forward(
+        self,
+        src: Tensor,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+    ) -> Tensor:
         r"""Pass the input through the encoder layer.
 
         Args:
 
             num_train_tokens = trainset_src_mask.shape[0]
 
             global_tokens_src = src_[:num_global_tokens]
+            train_tokens_src = src_[
+                num_global_tokens : num_global_tokens + num_train_tokens
+            ]
+            global_and_train_tokens_src = src_[: num_global_tokens + num_train_tokens]
+            eval_tokens_src = src_[num_global_tokens + num_train_tokens :]
+
+            attn = (
+                partial(checkpoint, self.self_attn)
+                if self.recompute_attn
+                else self.self_attn
+            )
+
+            global_tokens_src2 = attn(
+                global_tokens_src,
+                global_and_train_tokens_src,
+                global_and_train_tokens_src,
+                None,
+                True,
+                global_src_mask,
+            )[0]
+            train_tokens_src2 = attn(
+                train_tokens_src,
+                global_tokens_src,
+                global_tokens_src,
+                None,
+                True,
+                trainset_src_mask,
+            )[0]
+            eval_tokens_src2 = attn(
+                eval_tokens_src, src_, src_, None, True, valset_src_mask
+            )[0]
+
+            src2 = torch.cat(
+                [global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0
+            )
 
         else:
             if self.recompute_attn:
+                src2 = checkpoint(
+                    self.self_attn,
+                    src_,
+                    src_,
+                    src_,
+                    src_key_padding_mask,
+                    True,
+                    src_mask,
+                )[0]
             else:
+                src2 = self.self_attn(
+                    src_,
+                    src_,
+                    src_,
+                    attn_mask=src_mask,
+                    key_padding_mask=src_key_padding_mask,
+                )[0]
         src = src + self.dropout1(src2)
         if not self.pre_norm:
             src = self.norm1(src)
 
         if not self.pre_norm:
             src = self.norm2(src)
+        return src
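For the ordinary (non-tuple-mask) path, the layer behaves like the stock PyTorch encoder layer; a hedged smoke test, assuming a None mask is accepted as the else branch suggests:

import torch
from lcpfn.layer import TransformerEncoderLayer

layer = TransformerEncoderLayer(d_model=512, nhead=8)
src = torch.rand(10, 32, 512)  # seq_len x batch x d_model (batch_first=False)
out = layer(src)               # same shape as src
print(out.shape)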
lcpfn/model.py CHANGED
@@ -1,29 +1,56 @@
 import torch
 import lcpfn
 
 
 class LCPFN(torch.nn.Module):
     def __init__(self, model_name="EMSIZE512_NLAYERS12_NBUCKETS1000"):
         super(LCPFN, self).__init__()
-        self.model = torch.load(getattr(lcpfn, model_name) if model_name in lcpfn.model_dict else model_name)
         self.model.eval()
 
     @torch.no_grad()
-    def predict_mean(self, x_train, y_train, x_test):
-        logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
-        return self.model.criterion.mean(logits)
 
     @torch.no_grad()
-    def predict_quantiles(self, x_train, y_train, x_test, qs):
-        logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
-        return torch.cat([self.model.criterion.icdf(logits, q) for q in qs], dim=1)
 
     @torch.no_grad()
     def nll_loss(self, x_train, y_train, x_test, y_test):
         logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
         return self.model.criterion(logits, y_test)
 
     def forward(self, x_train, y_train, x_test):
         single_eval_pos = x_train.shape[0]
         x = torch.cat([x_train, x_test], dim=0).unsqueeze(1)
         y = y_train.unsqueeze(1)
-        return self.model((x, y), single_eval_pos=single_eval_pos)
 import torch
 import lcpfn
+import warnings
+from lcpfn import utils
+
 
 class LCPFN(torch.nn.Module):
     def __init__(self, model_name="EMSIZE512_NLAYERS12_NBUCKETS1000"):
         super(LCPFN, self).__init__()
+        self.model = torch.load(
+            getattr(lcpfn, model_name) if model_name in lcpfn.model_dict else model_name
+        )
         self.model.eval()
 
+    def check_input(self, x_train, x_test, y_train, y_test=None):
+        if torch.any(x_train < 0) or torch.any(x_test < 0):
+            # reject negative x values
+            raise Exception("x values should be non-negative")
+        if torch.any((0 > y_train) | (y_train > 1)) or (
+            y_test is not None and torch.any((0 > y_test) | (y_test > 1))
+        ):
+            # reject y values outside [0, 1] (mirrors the y_train check; a chained
+            # comparison such as "0 < y_test < 1" is not valid on tensors)
+            raise Exception(
+                "y values should be in the range [0,1]. Please set normalizer_kwargs accordingly."
+            )
+
     @torch.no_grad()
+    def predict_mean(
+        self, x_train, y_train, x_test, normalizer=utils.identity_normalizer()
+    ):
+        y_train_norm = normalizer[0](y_train)
+        logits = self(x_train=x_train, y_train=y_train_norm, x_test=x_test)
+        return normalizer[1](self.model.criterion.mean(logits))
 
     @torch.no_grad()
+    def predict_quantiles(
+        self, x_train, y_train, x_test, qs, normalizer=utils.identity_normalizer()
+    ):
+        y_train_norm = normalizer[0](y_train)
+        logits = self(x_train=x_train, y_train=y_train_norm, x_test=x_test)
+        return normalizer[1](
+            torch.cat([self.model.criterion.icdf(logits, q) for q in qs], dim=1)
+        )
 
     @torch.no_grad()
     def nll_loss(self, x_train, y_train, x_test, y_test):
+        # TODO add normalizer_kwargs
         logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
         return self.model.criterion(logits, y_test)
 
     def forward(self, x_train, y_train, x_test):
+        self.check_input(x_train, x_test, y_train)
         single_eval_pos = x_train.shape[0]
         x = torch.cat([x_train, x_test], dim=0).unsqueeze(1)
         y = y_train.unsqueeze(1)
+        return self.model((x, y), single_eval_pos=single_eval_pos)
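End to end, the new normalizer-aware API can be exercised as below; a sketch assuming the default checkpoint downloads successfully and following the T x 1 input convention of forward():

import torch
from lcpfn import LCPFN

model = LCPFN()  # loads EMSIZE512_NLAYERS12_NBUCKETS1000 by default

x_train = torch.arange(1, 11).unsqueeze(1).float()  # epochs 1..10 observed
y_train = torch.rand(10)                            # metric values already in [0, 1]
x_test = torch.arange(11, 51).unsqueeze(1).float()  # extrapolate to epoch 50

mean = model.predict_mean(x_train, y_train, x_test)
bands = model.predict_quantiles(x_train, y_train, x_test, qs=[0.1, 0.9])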
lcpfn/positional_encodings.py CHANGED
@@ -15,7 +15,7 @@ class NoPositionalEncoding(nn.Module):
         pass
 
     def forward(self, x):
-        return x #* math.sqrt(x.shape[-1])
 
 
 class PositionalEncoding(nn.Module):
@@ -23,14 +23,16 @@ class PositionalEncoding(nn.Module):
         super(PositionalEncoding, self).__init__()
         pe = torch.zeros(max_len, d_model)
         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
         pe = pe.unsqueeze(0).transpose(0, 1)
-        self.register_buffer('pe', pe)
 
     def forward(self, x):
-        x = self.pe[:x.size(0), :] + x  # * math.sqrt(x.shape[-1])
         return x
 
 
@@ -38,33 +40,39 @@ class LearnedPositionalEncoding(nn.Module):
     def __init__(self, d_model, max_len=5000):
         super(LearnedPositionalEncoding, self).__init__()
         self.max_seq_len = max_len
-        #self.positional_embeddings = nn.Embedding(max_len, d_model)
         self.positional_embeddings = nn.Parameter(torch.empty(max_len, d_model))
-        nn.init.normal_(self.positional_embeddings, mean=0, std=d_model ** -0.5)
 
     def forward(self, x):
         seq_len, bs, d_model = x.shape
-        assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
         pos_emb = self.positional_embeddings[:seq_len]
-        return pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x #* math.sqrt(x.shape[-1])
 
 
 class PairedScrambledPositionalEncodings(LearnedPositionalEncoding):
     # TODO check whether it is a problem to use the same perm. for full batch
     def forward(self, x):
         seq_len, bs, d_model = x.shape
-        assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
-        assert len(self.positional_embeddings) % 2 == 0, 'Please specify an even max_len.'
-
-        paired_embs = self.positional_embeddings.view(len(self.positional_embeddings), -1, 2)
-        pos_emb = paired_embs[torch.randperm(len(paired_embs))].view(*self.positional_embeddings.shape)[:seq_len]
-
-        return pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x #* math.sqrt(x.shape[-1])
         pass
 
     def forward(self, x):
+        return x  # * math.sqrt(x.shape[-1])
 
 
 class PositionalEncoding(nn.Module):
 
         super(PositionalEncoding, self).__init__()
         pe = torch.zeros(max_len, d_model)
         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
+        )
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
         pe = pe.unsqueeze(0).transpose(0, 1)
+        self.register_buffer("pe", pe)
 
     def forward(self, x):
+        x = self.pe[: x.size(0), :] + x  # * math.sqrt(x.shape[-1])
         return x
 
 
     def __init__(self, d_model, max_len=5000):
         super(LearnedPositionalEncoding, self).__init__()
         self.max_seq_len = max_len
+        # self.positional_embeddings = nn.Embedding(max_len, d_model)
         self.positional_embeddings = nn.Parameter(torch.empty(max_len, d_model))
+        nn.init.normal_(self.positional_embeddings, mean=0, std=d_model**-0.5)
 
     def forward(self, x):
         seq_len, bs, d_model = x.shape
+        assert seq_len <= len(
+            self.positional_embeddings
+        ), "seq_len can be at most max_len."
         pos_emb = self.positional_embeddings[:seq_len]
+        return (
+            pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x
+        )  # * math.sqrt(x.shape[-1])
 
 
 class PairedScrambledPositionalEncodings(LearnedPositionalEncoding):
     # TODO check whether it is a problem to use the same perm. for full batch
     def forward(self, x):
         seq_len, bs, d_model = x.shape
+        assert seq_len <= len(
+            self.positional_embeddings
+        ), "seq_len can be at most max_len."
+        assert (
+            len(self.positional_embeddings) % 2 == 0
+        ), "Please specify an even max_len."
+
+        paired_embs = self.positional_embeddings.view(
+            len(self.positional_embeddings), -1, 2
+        )
+        pos_emb = paired_embs[torch.randperm(len(paired_embs))].view(
+            *self.positional_embeddings.shape
+        )[:seq_len]
+
+        return (
+            pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x
+        )  # * math.sqrt(x.shape[-1])
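A small shape check for the sinusoidal variant, assuming the usual (d_model, max_len) constructor implied by the __init__ body above:

import torch
from lcpfn.positional_encodings import PositionalEncoding

pe = PositionalEncoding(d_model=512, max_len=5000)
x = torch.zeros(10, 32, 512)  # seq_len x batch x d_model
print(pe(x).shape)            # shape is unchanged; positions are added in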
lcpfn/train.py CHANGED
@@ -1,12 +1,7 @@
-import os
 import itertools
-import argparse
 import time
-import datetime
-import yaml
 from contextlib import nullcontext
 
-import pickle
 import torch
 from torch import nn
 
@@ -14,18 +9,11 @@ from lcpfn import utils
 from lcpfn.transformer import TransformerModel
 from lcpfn.bar_distribution import (
     BarDistribution,
-    FullSupportBarDistribution,
-    get_bucket_limits,
 )
 from lcpfn.utils import (
     get_cosine_schedule_with_warmup,
     get_openai_lr,
-    StoreDictKeyPair,
-    get_weighted_single_eval_pos_sampler,
-    get_uniform_single_eval_pos_sampler,
 )
-from lcpfn import priors
-from lcpfn import encoders
 from lcpfn import positional_encodings
 from lcpfn.utils import init_dist
 from torch.cuda.amp import autocast, GradScaler
@@ -294,7 +282,6 @@ def train(
     list_losses = []
     try:
         for epoch in range(1, epochs + 1) if epochs is not None else itertools.count(1):
-
             epoch_start_time = time.time()
             (
                 total_loss,
@@ -347,256 +334,3 @@ def train(
     torch.save(model.to("cpu"), output_path)
     print("Checkpoint stored at ", output_path)
     return total_loss, total_positional_losses, model.to("cpu"), dl
-
-
-def _parse_args(config_parser, parser):
-    # Do we have a config file to parse?
-    args_config, remaining = config_parser.parse_known_args()
-    if args_config.config:
-        with open(args_config.config, "r") as f:
-            cfg = yaml.safe_load(f)
-        parser.set_defaults(**cfg)
-
-    # The main arg parser parses the rest of the args, the usual
-    # defaults will have been overridden if config file specified.
-    args = parser.parse_args(remaining)
-
-    # Cache the args as a text string to save them in the output dir later
-    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
-    return args, args_text
-
-
-if __name__ == "__main__":
-    config_parser = argparse.ArgumentParser(
-        description="Only used as a first parser for the config file path."
-    )
-    config_parser.add_argument("--config")
-    parser = argparse.ArgumentParser()
-    parser.add_argument("prior")
-    parser.add_argument("--loss_function", default="barnll")
-    # Optional Arg's for `--loss_function barnll`
-    parser.add_argument(
-        "--min_y",
-        type=float,
-        help="barnll can only model y in strict ranges, this is the minimum y can take.",
-    )
-    parser.add_argument(
-        "--max_y",
-        type=float,
-        help="barnll can only model y in strict ranges, this is the maximum y can take.",
-    )
-    parser.add_argument("--num_buckets", default=100, type=int)
-    # parser.add_argument('--num_features', default=None, type=int, help='Specify depending on the prior.')
-    parser.add_argument(
-        "--extra_prior_kwargs_dict",
-        default={},
-        dest="extra_prior_kwargs_dict",
-        action=StoreDictKeyPair,
-        nargs="+",
-        metavar="KEY=VAL",
-        help="Specify depending on the prior.",
-    )
-    parser.add_argument(
-        "--encoder", default="linear", type=str, help="Specify depending on the prior."
-    )
-    parser.add_argument(
-        "--y_encoder",
-        default="linear",
-        type=str,
-        help="Specify depending on the prior. You should specify this if you do not fuse x and y.",
-    )
-    parser.add_argument(
-        "--pos_encoder",
-        default="none",
-        type=str,
-        help="Specify depending on the prior.",
-    )
-    parser.add_argument("--bptt", default=10, type=int)
-    parser.add_argument("--epochs", default=200, type=int)
-    parser.add_argument("--warmup_epochs", default=50, type=int)
-    parser.add_argument("--validation_period", default=10, type=int)
-    parser.add_argument(
-        "--permutation_invariant_max_eval_pos",
-        default=None,
-        type=int,
-        help="Set this to an int to ",
-    )
-    parser.add_argument(
-        "--permutation_invariant_sampling",
-        default="weighted",
-        help="Only relevant if --permutation_invariant_max_eval_pos is set.",
-    )
-    parser.add_argument("--train_mixed_precision", action="store_true")
-
-    # these can likely be mostly left at defaults
-    parser.add_argument(
-        "--emsize", default=512, type=int
-    )  # sometimes even larger is better e.g. 1024
-    parser.add_argument("--nlayers", default=6, type=int)
-    parser.add_argument("--nhid", default=None, type=int)  # 2*emsize is the default
-    parser.add_argument(
-        "--nhead", default=4, type=int
-    )  # nhead = emsize / 64 in the original paper
-    parser.add_argument("--dropout", default=0.0, type=float)
-    parser.add_argument("--steps_per_epoch", default=10, type=int)
-    parser.add_argument("--batch_size", default=1000, type=int)
-    parser.add_argument(
-        "--lr", "--learning_rate", default=0.001, type=float
-    )  # try also .0003, .0001, go lower with lower batch size
-    parser.add_argument("--gpu_device", default="cuda", type=str)
-
-    # for model checkpointing
-    parser.add_argument(
-        "--checkpoint_file",
-        help="absolute or relative-to-the-project-rootdir path to the file storing the state dicts.",
-        default=None,
-        type=str,
-    )
-    parser.add_argument("--saving_period", default=10, type=str)
-
-    args, _ = _parse_args(config_parser, parser)
-
-    if args.nhid is None:
-        args.nhid = 2 * args.emsize
-
-    prior = args.__dict__.pop("prior")
-
-    if prior == "gp":
-        prior = priors.fast_gp.DataLoader
-    elif prior == "ridge":
-        prior = priors.ridge.DataLoader
-    elif prior == "stroke":
-        prior = priors.stroke.DataLoader
-    elif prior == "mix_gp":
-        prior = priors.fast_gp_mix.DataLoader
-    else:
-        raise NotImplementedError(f"Prior == {prior}.")
-
-    loss_function = args.__dict__.pop("loss_function")
-
-    criterion = nn.GaussianNLLLoss(reduction="none", full=True)
-    classificiation_criterion = nn.CrossEntropyLoss(reduction="none")
-    num_buckets = args.__dict__.pop("num_buckets")
-    max_y = args.__dict__.pop("max_y")
-    min_y = args.__dict__.pop("min_y")
-    # criterion = nn.MSELoss(reduction='none')
-
-    device = args.gpu_device if torch.cuda.is_available() else "cpu:0"
-
-    def get_y_sample():
-        args.__dict__["extra_prior_kwargs_dict"]["eval_pos_seq_len_sampler"] = lambda: (
-            args.bptt,
-            args.bptt,
-        )
-        dl = prior(
-            num_steps=1,
-            batch_size=args.batch_size * args.steps_per_epoch,
-            seq_len=args.bptt,
-            device=device,
-            **args.extra_prior_kwargs_dict,
-        )
-        args.__dict__["extra_prior_kwargs_dict"].pop("eval_pos_seq_len_sampler")
-
-        y_sample = next(iter(dl))[-2]
-        print(
-            f"Creating Bar distribution with borders from y sample of size {y_sample.numel()}"
-        )
-        return y_sample
-
-    if loss_function == "ce":
-        criterion = nn.CrossEntropyLoss(reduction="none")
-    elif loss_function == "gaussnll":
-        criterion = nn.GaussianNLLLoss(reduction="none", full=True)
-    elif loss_function == "mse":
-        criterion = nn.MSELoss(reduction="none")
-    elif loss_function == "barnll":
-        criterion = BarDistribution(
-            borders=get_bucket_limits(num_buckets, full_range=(min_y, max_y))
-        )
-    elif loss_function == "adaptivebarnll":
-        borders = get_bucket_limits(
-            num_buckets, ys=get_y_sample(), full_range=(min_y, max_y)
-        )
-        criterion = BarDistribution(borders=borders)
-    elif loss_function == "adaptivefullsupportbarnll":
-        assert (
-            min_y is None and max_y is None
-        ), "Please do not specify `min_y` and `max_y` with `unboundedadaptivebarnll`."
-        borders = get_bucket_limits(num_buckets, ys=get_y_sample())
-        criterion = FullSupportBarDistribution(borders=borders)
-    else:
-        raise NotImplementedError(f"loss_function == {loss_function}.")
-
-    encoder = args.__dict__.pop("encoder")
-    y_encoder = args.__dict__.pop("y_encoder")
-
-    def get_encoder_generator(encoder):
-        if encoder == "linear":
-            encoder_generator = encoders.Linear
-        elif encoder == "mlp":
-            encoder_generator = encoders.MLP
-        elif encoder == "positional":
-            encoder_generator = encoders.Positional
-        else:
-            raise NotImplementedError(f"A {encoder} encoder is not valid.")
-        return encoder_generator
-
-    encoder_generator = get_encoder_generator(encoder)
-    y_encoder_generator = get_encoder_generator(y_encoder)
-
-    pos_encoder = args.__dict__.pop("pos_encoder")
-
-    if pos_encoder == "none":
-        pos_encoder_generator = None
-    elif pos_encoder == "sinus":
-        pos_encoder_generator = positional_encodings.PositionalEncoding
-    elif pos_encoder == "learned":
-        pos_encoder_generator = positional_encodings.LearnedPositionalEncoding
-    elif pos_encoder == "paired_scrambled_learned":
-        pos_encoder_generator = positional_encodings.PairedScrambledPositionalEncodings
-    else:
-        raise NotImplementedError(f"pos_encoer == {pos_encoder} is not valid.")
-
-    permutation_invariant_max_eval_pos = args.__dict__.pop(
-        "permutation_invariant_max_eval_pos"
-    )
-    permutation_invariant_sampling = args.__dict__.pop("permutation_invariant_sampling")
-    if permutation_invariant_max_eval_pos is not None:
-        if permutation_invariant_sampling == "weighted":
-            get_sampler = get_weighted_single_eval_pos_sampler
-        elif permutation_invariant_sampling == "uniform":
-            get_sampler = get_uniform_single_eval_pos_sampler
-        else:
-            raise ValueError()
-        args.__dict__["single_eval_pos_gen"] = get_sampler(
-            permutation_invariant_max_eval_pos
-        )
-
-    print("ARGS for `train`:", args.__dict__)
-
-    if args.__dict__["checkpoint_file"] is not None:
-        rootdir = os.path.dirname(os.path.realpath(__file__))
-        args.__dict__["checkpoint_file"] = os.path.join(
-            rootdir, args.__dict__["checkpoint_file"]
-        )
-
-        if os.path.exists(args.__dict__["checkpoint_file"]):
-            state_dicts = torch.load(args.__dict__["checkpoint_file"])
-            args.__dict__["load_weights_from_this_state_dict"] = state_dicts[
-                "model_state_dict"
-            ]
-            args.__dict__["load_optimizer_from_this_state_dict"] = state_dicts[
589
- "optimizer_state_dict"
590
- ]
591
- else:
592
- args.__dict__["load_weights_from_this_state_dict"] = None
593
- args.__dict__["load_optimizer_from_this_state_dict"] = None
594
-
595
- train(
596
- prior,
597
- criterion,
598
- encoder_generator,
599
- y_encoder_generator=y_encoder_generator,
600
- pos_encoder_generator=pos_encoder_generator,
601
- **args.__dict__,
602
- )
 
 
1
  import itertools
 
2
  import time
 
 
3
  from contextlib import nullcontext
4
 
 
5
  import torch
6
  from torch import nn
7
 
 
9
  from lcpfn.transformer import TransformerModel
10
  from lcpfn.bar_distribution import (
11
  BarDistribution,
 
 
12
  )
13
  from lcpfn.utils import (
14
  get_cosine_schedule_with_warmup,
15
  get_openai_lr,
 
 
 
16
  )
 
 
17
  from lcpfn import positional_encodings
18
  from lcpfn.utils import init_dist
19
  from torch.cuda.amp import autocast, GradScaler
 
282
  list_losses = []
283
  try:
284
  for epoch in range(1, epochs + 1) if epochs is not None else itertools.count(1):
 
285
  epoch_start_time = time.time()
286
  (
287
  total_loss,
 
334
  torch.save(model.to("cpu"), output_path)
335
  print("Checkpoint stored at ", output_path)
336
  return total_loss, total_positional_losses, model.to("cpu"), dl
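
Note: the `adaptivebarnll` branch above first draws a sample of targets from the prior and then places bucket borders over it. A minimal, self-contained sketch of that flow, assuming `get_bucket_limits` keeps the call signature used above (first positional argument is the number of buckets):

    import torch
    from lcpfn.bar_distribution import BarDistribution, get_bucket_limits

    # stand-in for a sample of prior targets (above: y_sample = next(iter(dl))[-2])
    y_sample = torch.rand(100_000)

    # borders adapted to the empirical distribution of y_sample, restricted to [0, 1]
    borders = get_bucket_limits(1000, ys=y_sample, full_range=(0.0, 1.0))
    criterion = BarDistribution(borders=borders)  # 1001 borders delimit 1000 buckets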
 
lcpfn/train_lcpfn.py CHANGED
@@ -2,9 +2,11 @@ import math
2
 
3
  from torch import nn
4
 
5
- from lcpfn import bar_distribution, encoders, priors, train
6
  from lcpfn import utils
7
 
 
 
8
 
9
  def train_lcpfn(
10
  get_batch_func,
@@ -12,7 +14,7 @@ def train_lcpfn(
12
  emsize: int = 512,
13
  nlayers: int = 12,
14
  num_borders: int = 1000,
15
- lr: float = 0.001,
16
  batch_size: int = 100,
17
  epochs: int = 1000,
18
  ):
@@ -25,7 +27,7 @@ def train_lcpfn(
25
  emsize (int, optional): The size of the embedding layer. Defaults to 512.
26
  nlayers (int, optional): The number of layers in the model. Defaults to 12.
27
  num_borders (int, optional): The number of borders to use. Defaults to 1000.
28
- lr (float, optional): The learning rate for the optimizer. Defaults to 0.001.
29
  batch_size (int, optional): The batch size for training. Defaults to 100.
30
  epochs (int, optional): The number of epochs to train for. Defaults to 1000.
31
 
@@ -36,7 +38,7 @@ def train_lcpfn(
36
  hps = {}
37
 
38
  # PFN training hyperparameters
39
- dataloader = priors.utils.get_batch_to_dataloader(get_batch_func) # type: ignore
40
 
41
  num_features = 1
42
 
@@ -82,7 +84,9 @@ def train_lcpfn(
82
  epochs=epochs,
83
  lr=lr,
84
  bptt=seq_len,
85
- single_eval_pos_gen=utils.get_uniform_single_eval_pos_sampler(seq_len, min_len=1),
 
 
86
  aggregate_k_gradients=1,
87
  nhid=(emsize * 2),
88
  steps_per_epoch=100,
 
2
 
3
  from torch import nn
4
 
5
+ from lcpfn import bar_distribution, encoders, train
6
  from lcpfn import utils
7
 
8
+ from lcpfn.priors import utils as putils
9
+
10
 
11
  def train_lcpfn(
12
  get_batch_func,
 
14
  emsize: int = 512,
15
  nlayers: int = 12,
16
  num_borders: int = 1000,
17
+ lr: float = 0.0001,
18
  batch_size: int = 100,
19
  epochs: int = 1000,
20
  ):
 
27
  emsize (int, optional): The size of the embedding layer. Defaults to 512.
28
  nlayers (int, optional): The number of layers in the model. Defaults to 12.
29
  num_borders (int, optional): The number of borders to use. Defaults to 1000.
30
+ lr (float, optional): The learning rate for the optimizer. Defaults to 0.0001.
31
  batch_size (int, optional): The batch size for training. Defaults to 100.
32
  epochs (int, optional): The number of epochs to train for. Defaults to 1000.
33
 
 
38
  hps = {}
39
 
40
  # PFN training hyperparameters
41
+ dataloader = putils.get_batch_to_dataloader(get_batch_func) # type: ignore
42
 
43
  num_features = 1
44
 
 
84
  epochs=epochs,
85
  lr=lr,
86
  bptt=seq_len,
87
+ single_eval_pos_gen=utils.get_uniform_single_eval_pos_sampler(
88
+ seq_len, min_len=1
89
+ ),
90
  aggregate_k_gradients=1,
91
  nhid=(emsize * 2),
92
  steps_per_epoch=100,
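
Note: `train_lcpfn` only needs a `get_batch_func`; everything else has defaults. A rough usage sketch with a toy stand-in prior — the (x, y, target_y) return convention follows the PFN `get_batch` interface wrapped by `get_batch_to_dataloader` above, while the exact keyword arguments passed through are an assumption:

    import torch
    from lcpfn.train_lcpfn import train_lcpfn

    def get_batch_func(batch_size, seq_len, num_features, device="cpu", **kwargs):
        # toy curve prior: saturating exponentials with a little noise
        t = torch.arange(1, seq_len + 1, dtype=torch.float).unsqueeze(1)  # (seq_len, 1)
        rate = 0.5 * torch.rand(batch_size)                               # one rate per curve
        y = 1 - torch.exp(-rate.unsqueeze(0) * t)                         # (seq_len, batch_size)
        y = (y + 0.01 * torch.randn_like(y)).clamp(0.0, 1.0)
        x = t.unsqueeze(-1).repeat(1, batch_size, 1)                      # (seq_len, batch_size, 1)
        return x.to(device), y.to(device), y.to(device)

    # tiny run for illustration; real training keeps the defaults above
    result = train_lcpfn(get_batch_func, emsize=256, nlayers=3, num_borders=100, epochs=2)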
lcpfn/transformer.py CHANGED
@@ -4,35 +4,74 @@ from typing import Optional
4
  import torch
5
  import torch.nn as nn
6
  from torch import Tensor
 
7
  from torch.nn import Module, TransformerEncoder
8
 
9
  from lcpfn.layer import TransformerEncoderLayer, _get_activation_fn
10
  from lcpfn.utils import SeqBN, bool_mask_to_att_mask
11
 
12
 
 
 
 
 
13
 
14
  class TransformerModel(nn.Module):
15
- def __init__(self, encoder, n_out, ninp, nhead, nhid, nlayers, dropout=0.0, style_encoder=None, y_encoder=None,
16
- pos_encoder=None, decoder=None, input_normalization=False, init_method=None, pre_norm=False,
17
- activation='gelu', recompute_attn=False, num_global_att_tokens=0, full_attention=False,
18
- all_layers_same_init=True):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  super().__init__()
20
- self.model_type = 'Transformer'
21
- encoder_layer_creator = lambda: TransformerEncoderLayer(ninp, nhead, nhid, dropout, activation=activation,
22
- pre_norm=pre_norm, recompute_attn=recompute_attn)
23
- self.transformer_encoder = TransformerEncoder(encoder_layer_creator(), nlayers)\
24
- if all_layers_same_init else TransformerEncoderDiffInit(encoder_layer_creator, nlayers)
 
 
 
 
 
 
 
 
 
 
25
  self.ninp = ninp
26
  self.encoder = encoder
27
  self.y_encoder = y_encoder
28
  self.pos_encoder = pos_encoder
29
- self.decoder = decoder(ninp, nhid, n_out) if decoder is not None else nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, n_out))
 
 
 
 
30
  self.input_ln = SeqBN(ninp) if input_normalization else None
31
  self.style_encoder = style_encoder
32
  self.init_method = init_method
33
  if num_global_att_tokens is not None:
34
  assert not full_attention
35
- self.global_att_embeddings = nn.Embedding(num_global_att_tokens, ninp) if num_global_att_tokens else None
 
 
36
  self.full_attention = full_attention
37
 
38
  self.n_out = n_out
@@ -47,37 +86,49 @@ class TransformerModel(nn.Module):
47
 
48
  @staticmethod
49
  def generate_D_q_matrix(sz, query_size):
50
- train_size = sz-query_size
51
- mask = torch.zeros(sz,sz) == 0
52
- mask[:,train_size:].zero_()
53
  mask |= torch.eye(sz) == 1
54
  return bool_mask_to_att_mask(mask)
55
 
56
  @staticmethod
57
- def generate_global_att_query_matrix(num_global_att_tokens, seq_len, num_query_tokens):
 
 
58
  train_size = seq_len + num_global_att_tokens - num_query_tokens
59
  sz = seq_len + num_global_att_tokens
60
  mask = torch.zeros(num_query_tokens, sz) == 0
61
- mask[:,train_size:].zero_()
62
- mask[:,train_size:] |= torch.eye(num_query_tokens) == 1
63
  return bool_mask_to_att_mask(mask)
64
 
65
  @staticmethod
66
- def generate_global_att_trainset_matrix(num_global_att_tokens, seq_len, num_query_tokens):
 
 
67
  train_size = seq_len + num_global_att_tokens - num_query_tokens
68
  trainset_size = seq_len - num_query_tokens
69
  mask = torch.zeros(trainset_size, num_global_att_tokens) == 0
70
- #mask[:,num_global_att_tokens:].zero_()
71
- #mask[:,num_global_att_tokens:] |= torch.eye(trainset_size) == 1
72
  return bool_mask_to_att_mask(mask)
73
 
74
  @staticmethod
75
- def generate_global_att_globaltokens_matrix(num_global_att_tokens, seq_len, num_query_tokens):
76
- mask = torch.zeros(num_global_att_tokens, num_global_att_tokens+seq_len-num_query_tokens) == 0
 
 
 
 
 
 
 
 
77
  return bool_mask_to_att_mask(mask)
78
 
79
  def init_weights(self):
80
- initrange = 1.
81
  # if isinstance(self.encoder,EmbeddingEncoder):
82
  # self.encoder.weight.data.uniform_(-initrange, initrange)
83
  # self.decoder.bias.data.zero_()
@@ -87,41 +138,74 @@ class TransformerModel(nn.Module):
87
  for layer in self.transformer_encoder.layers:
88
  nn.init.zeros_(layer.linear2.weight)
89
  nn.init.zeros_(layer.linear2.bias)
90
- attns = layer.self_attn if isinstance(layer.self_attn, nn.ModuleList) else [layer.self_attn]
 
 
 
 
91
  for attn in attns:
92
  nn.init.zeros_(attn.out_proj.weight)
93
  nn.init.zeros_(attn.out_proj.bias)
94
 
95
  def forward(self, src, src_mask=None, single_eval_pos=None):
96
- assert isinstance(src, tuple), 'inputs (src) have to be given as (x,y) or (style,x,y) tuple'
 
 
97
 
98
- if len(src) == 2: # (x,y) and no style
99
  src = (None,) + src
100
 
101
  style_src, style_src_size = (src[0], (0 if (src[0] is None) else 1))
102
- if src_mask is not None: assert self.global_att_embeddings is None or isinstance(src_mask, tuple)
 
103
  if src_mask is None:
104
  x_src = src[1]
105
  if self.global_att_embeddings is None:
106
  full_len = len(x_src) + style_src_size
107
  if self.full_attention:
108
- src_mask = bool_mask_to_att_mask(torch.ones((full_len, full_len), dtype=torch.bool)).to(x_src.device)
 
 
109
  else:
110
- src_mask = self.generate_D_q_matrix(len(x_src) + style_src_size, len(x_src) + style_src_size -single_eval_pos).to(x_src.device)
 
 
 
111
  else:
112
- src_mask_args = (self.global_att_embeddings.num_embeddings,
113
- len(x_src) + style_src_size,
114
- len(x_src) + style_src_size - single_eval_pos)
115
- src_mask = (self.generate_global_att_globaltokens_matrix(*src_mask_args).to(x_src.device),
116
- self.generate_global_att_trainset_matrix(*src_mask_args).to(x_src.device),
117
- self.generate_global_att_query_matrix(*src_mask_args).to(x_src.device))
 
 
 
 
 
 
 
 
 
 
118
 
119
  style_src, x_src, y_src = src
120
  x_src = self.encoder(x_src)
121
- y_src = self.y_encoder(y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src)
122
- style_src = self.style_encoder(style_src).unsqueeze(0) if self.style_encoder else torch.tensor([], device=x_src.device)
123
- global_src = torch.tensor([], device=x_src.device) if self.global_att_embeddings is None else \
124
- self.global_att_embeddings.weight.unsqueeze(1).repeat(1, x_src.shape[1], 1)
 
 
 
 
 
 
 
 
 
 
 
125
  train_x = x_src[:single_eval_pos] + y_src[:single_eval_pos]
126
  src = torch.cat([global_src, style_src, train_x, x_src[single_eval_pos:]], 0)
127
 
@@ -134,16 +218,29 @@ class TransformerModel(nn.Module):
134
  # If we have style input, drop its output
135
  output = self.transformer_encoder(src, src_mask)[style_src_size:]
136
  output = self.decoder(output)
137
- return output[single_eval_pos+(self.global_att_embeddings.num_embeddings if self.global_att_embeddings else 0):]
 
 
 
 
 
 
 
138
 
139
  @torch.no_grad()
140
  def init_from_small_model(self, small_model):
141
- assert isinstance(self.decoder, nn.Linear) and isinstance(self.encoder, (nn.Linear, nn.Sequential)) \
142
- and isinstance(self.y_encoder, (nn.Linear, nn.Sequential))
 
 
 
143
 
144
  def set_encoder_weights(my_encoder, small_model_encoder):
145
- my_encoder_linear, small_encoder_linear = (my_encoder, small_model_encoder) \
146
- if isinstance(my_encoder, nn.Linear) else (my_encoder[-1], small_model_encoder[-1])
 
 
 
147
  small_in_dim = small_encoder_linear.out_features
148
  my_encoder_linear.weight.zero_()
149
  my_encoder_linear.bias.zero_()
@@ -158,7 +255,9 @@ class TransformerModel(nn.Module):
158
  self.decoder.weight[:, :small_in_dim] = small_model.decoder.weight
159
  self.decoder.bias = small_model.decoder.bias
160
 
161
- for my_layer, small_layer in zip(self.transformer_encoder.layers, small_model.transformer_encoder.layers):
 
 
162
  small_hid_dim = small_layer.linear1.out_features
163
  my_in_dim = my_layer.linear1.in_features
164
 
@@ -166,23 +265,36 @@ class TransformerModel(nn.Module):
166
  my_in_proj_w = my_layer.self_attn.in_proj_weight
167
  small_in_proj_w = small_layer.self_attn.in_proj_weight
168
 
169
- my_in_proj_w.view(3, my_in_dim, my_in_dim)[:, :small_in_dim, :small_in_dim] = small_in_proj_w.view(3,
170
- small_in_dim,
171
- small_in_dim)
172
- my_layer.self_attn.in_proj_bias.view(3, my_in_dim)[:,
173
- :small_in_dim] = small_layer.self_attn.in_proj_bias.view(3, small_in_dim)
174
-
175
- my_layer.self_attn.out_proj.weight[:small_in_dim, :small_in_dim] = small_layer.self_attn.out_proj.weight
176
- my_layer.self_attn.out_proj.bias[:small_in_dim] = small_layer.self_attn.out_proj.bias
177
-
178
- my_layer.linear1.weight[:small_hid_dim, :small_in_dim] = small_layer.linear1.weight
 
 
 
 
 
 
 
179
  my_layer.linear1.bias[:small_hid_dim] = small_layer.linear1.bias
180
 
181
- my_layer.linear2.weight[:small_in_dim, :small_hid_dim] = small_layer.linear2.weight
 
 
182
  my_layer.linear2.bias[:small_in_dim] = small_layer.linear2.bias
183
 
184
- my_layer.norm1.weight[:small_in_dim] = math.sqrt(small_in_dim / my_in_dim) * small_layer.norm1.weight
185
- my_layer.norm2.weight[:small_in_dim] = math.sqrt(small_in_dim / my_in_dim) * small_layer.norm2.weight
 
 
 
 
186
 
187
  my_layer.norm1.bias[:small_in_dim] = small_layer.norm1.bias
188
  my_layer.norm2.bias[:small_in_dim] = small_layer.norm2.bias
@@ -196,15 +308,23 @@ class TransformerEncoderDiffInit(Module):
196
  num_layers: the number of sub-encoder-layers in the encoder (required).
197
  norm: the layer normalization component (optional).
198
  """
199
- __constants__ = ['norm']
 
200
 
201
  def __init__(self, encoder_layer_creator, num_layers, norm=None):
202
  super().__init__()
203
- self.layers = nn.ModuleList([encoder_layer_creator() for _ in range(num_layers)])
 
 
204
  self.num_layers = num_layers
205
  self.norm = norm
206
 
207
- def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
 
 
 
 
 
208
  r"""Pass the input through the encoder layers in turn.
209
 
210
  Args:
@@ -218,7 +338,9 @@ class TransformerEncoderDiffInit(Module):
218
  output = src
219
 
220
  for mod in self.layers:
221
- output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
 
 
222
 
223
  if self.norm is not None:
224
  output = self.norm(output)
 
4
  import torch
5
  import torch.nn as nn
6
  from torch import Tensor
7
+ import torch.nn.functional as F
8
  from torch.nn import Module, TransformerEncoder
9
 
10
  from lcpfn.layer import TransformerEncoderLayer, _get_activation_fn
11
  from lcpfn.utils import SeqBN, bool_mask_to_att_mask
12
 
13
 
14
+ class GELU(nn.Module):
15
+ def forward(self, input: Tensor) -> Tensor:
16
+ return F.gelu(input)
17
+
18
 
19
  class TransformerModel(nn.Module):
20
+ def __init__(
21
+ self,
22
+ encoder,
23
+ n_out,
24
+ ninp,
25
+ nhead,
26
+ nhid,
27
+ nlayers,
28
+ dropout=0.0,
29
+ style_encoder=None,
30
+ y_encoder=None,
31
+ pos_encoder=None,
32
+ decoder=None,
33
+ input_normalization=False,
34
+ init_method=None,
35
+ pre_norm=False,
36
+ activation="gelu",
37
+ recompute_attn=False,
38
+ num_global_att_tokens=0,
39
+ full_attention=False,
40
+ all_layers_same_init=True,
41
+ ):
42
  super().__init__()
43
+ self.model_type = "Transformer"
44
+ encoder_layer_creator = lambda: TransformerEncoderLayer(
45
+ ninp,
46
+ nhead,
47
+ nhid,
48
+ dropout,
49
+ activation=activation,
50
+ pre_norm=pre_norm,
51
+ recompute_attn=recompute_attn,
52
+ )
53
+ self.transformer_encoder = (
54
+ TransformerEncoder(encoder_layer_creator(), nlayers)
55
+ if all_layers_same_init
56
+ else TransformerEncoderDiffInit(encoder_layer_creator, nlayers)
57
+ )
58
  self.ninp = ninp
59
  self.encoder = encoder
60
  self.y_encoder = y_encoder
61
  self.pos_encoder = pos_encoder
62
+ self.decoder = (
63
+ decoder(ninp, nhid, n_out)
64
+ if decoder is not None
65
+ else nn.Sequential(nn.Linear(ninp, nhid), GELU(), nn.Linear(nhid, n_out))
66
+ )
67
  self.input_ln = SeqBN(ninp) if input_normalization else None
68
  self.style_encoder = style_encoder
69
  self.init_method = init_method
70
  if num_global_att_tokens is not None:
71
  assert not full_attention
72
+ self.global_att_embeddings = (
73
+ nn.Embedding(num_global_att_tokens, ninp) if num_global_att_tokens else None
74
+ )
75
  self.full_attention = full_attention
76
 
77
  self.n_out = n_out
 
86
 
87
  @staticmethod
88
  def generate_D_q_matrix(sz, query_size):
89
+ train_size = sz - query_size
90
+ mask = torch.zeros(sz, sz) == 0
91
+ mask[:, train_size:].zero_()
92
  mask |= torch.eye(sz) == 1
93
  return bool_mask_to_att_mask(mask)
94
 
95
  @staticmethod
96
+ def generate_global_att_query_matrix(
97
+ num_global_att_tokens, seq_len, num_query_tokens
98
+ ):
99
  train_size = seq_len + num_global_att_tokens - num_query_tokens
100
  sz = seq_len + num_global_att_tokens
101
  mask = torch.zeros(num_query_tokens, sz) == 0
102
+ mask[:, train_size:].zero_()
103
+ mask[:, train_size:] |= torch.eye(num_query_tokens) == 1
104
  return bool_mask_to_att_mask(mask)
105
 
106
  @staticmethod
107
+ def generate_global_att_trainset_matrix(
108
+ num_global_att_tokens, seq_len, num_query_tokens
109
+ ):
110
  train_size = seq_len + num_global_att_tokens - num_query_tokens
111
  trainset_size = seq_len - num_query_tokens
112
  mask = torch.zeros(trainset_size, num_global_att_tokens) == 0
113
+ # mask[:,num_global_att_tokens:].zero_()
114
+ # mask[:,num_global_att_tokens:] |= torch.eye(trainset_size) == 1
115
  return bool_mask_to_att_mask(mask)
116
 
117
  @staticmethod
118
+ def generate_global_att_globaltokens_matrix(
119
+ num_global_att_tokens, seq_len, num_query_tokens
120
+ ):
121
+ mask = (
122
+ torch.zeros(
123
+ num_global_att_tokens,
124
+ num_global_att_tokens + seq_len - num_query_tokens,
125
+ )
126
+ == 0
127
+ )
128
  return bool_mask_to_att_mask(mask)
129
 
130
  def init_weights(self):
131
+ initrange = 1.0
132
  # if isinstance(self.encoder,EmbeddingEncoder):
133
  # self.encoder.weight.data.uniform_(-initrange, initrange)
134
  # self.decoder.bias.data.zero_()
 
138
  for layer in self.transformer_encoder.layers:
139
  nn.init.zeros_(layer.linear2.weight)
140
  nn.init.zeros_(layer.linear2.bias)
141
+ attns = (
142
+ layer.self_attn
143
+ if isinstance(layer.self_attn, nn.ModuleList)
144
+ else [layer.self_attn]
145
+ )
146
  for attn in attns:
147
  nn.init.zeros_(attn.out_proj.weight)
148
  nn.init.zeros_(attn.out_proj.bias)
149
 
150
  def forward(self, src, src_mask=None, single_eval_pos=None):
151
+ assert isinstance(
152
+ src, tuple
153
+ ), "inputs (src) have to be given as (x,y) or (style,x,y) tuple"
154
 
155
+ if len(src) == 2: # (x,y) and no style
156
  src = (None,) + src
157
 
158
  style_src, style_src_size = (src[0], (0 if (src[0] is None) else 1))
159
+ if src_mask is not None:
160
+ assert self.global_att_embeddings is None or isinstance(src_mask, tuple)
161
  if src_mask is None:
162
  x_src = src[1]
163
  if self.global_att_embeddings is None:
164
  full_len = len(x_src) + style_src_size
165
  if self.full_attention:
166
+ src_mask = bool_mask_to_att_mask(
167
+ torch.ones((full_len, full_len), dtype=torch.bool)
168
+ ).to(x_src.device)
169
  else:
170
+ src_mask = self.generate_D_q_matrix(
171
+ len(x_src) + style_src_size,
172
+ len(x_src) + style_src_size - single_eval_pos,
173
+ ).to(x_src.device)
174
  else:
175
+ src_mask_args = (
176
+ self.global_att_embeddings.num_embeddings,
177
+ len(x_src) + style_src_size,
178
+ len(x_src) + style_src_size - single_eval_pos,
179
+ )
180
+ src_mask = (
181
+ self.generate_global_att_globaltokens_matrix(*src_mask_args).to(
182
+ x_src.device
183
+ ),
184
+ self.generate_global_att_trainset_matrix(*src_mask_args).to(
185
+ x_src.device
186
+ ),
187
+ self.generate_global_att_query_matrix(*src_mask_args).to(
188
+ x_src.device
189
+ ),
190
+ )
191
 
192
  style_src, x_src, y_src = src
193
  x_src = self.encoder(x_src)
194
+ y_src = self.y_encoder(
195
+ y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src
196
+ )
197
+ style_src = (
198
+ self.style_encoder(style_src).unsqueeze(0)
199
+ if self.style_encoder
200
+ else torch.tensor([], device=x_src.device)
201
+ )
202
+ global_src = (
203
+ torch.tensor([], device=x_src.device)
204
+ if self.global_att_embeddings is None
205
+ else self.global_att_embeddings.weight.unsqueeze(1).repeat(
206
+ 1, x_src.shape[1], 1
207
+ )
208
+ )
209
  train_x = x_src[:single_eval_pos] + y_src[:single_eval_pos]
210
  src = torch.cat([global_src, style_src, train_x, x_src[single_eval_pos:]], 0)
211
 
 
218
  # If we have style input, drop its output
219
  output = self.transformer_encoder(src, src_mask)[style_src_size:]
220
  output = self.decoder(output)
221
+ return output[
222
+ single_eval_pos
223
+ + (
224
+ self.global_att_embeddings.num_embeddings
225
+ if self.global_att_embeddings
226
+ else 0
227
+ ) :
228
+ ]
229
 
230
  @torch.no_grad()
231
  def init_from_small_model(self, small_model):
232
+ assert (
233
+ isinstance(self.decoder, nn.Linear)
234
+ and isinstance(self.encoder, (nn.Linear, nn.Sequential))
235
+ and isinstance(self.y_encoder, (nn.Linear, nn.Sequential))
236
+ )
237
 
238
  def set_encoder_weights(my_encoder, small_model_encoder):
239
+ my_encoder_linear, small_encoder_linear = (
240
+ (my_encoder, small_model_encoder)
241
+ if isinstance(my_encoder, nn.Linear)
242
+ else (my_encoder[-1], small_model_encoder[-1])
243
+ )
244
  small_in_dim = small_encoder_linear.out_features
245
  my_encoder_linear.weight.zero_()
246
  my_encoder_linear.bias.zero_()
 
255
  self.decoder.weight[:, :small_in_dim] = small_model.decoder.weight
256
  self.decoder.bias = small_model.decoder.bias
257
 
258
+ for my_layer, small_layer in zip(
259
+ self.transformer_encoder.layers, small_model.transformer_encoder.layers
260
+ ):
261
  small_hid_dim = small_layer.linear1.out_features
262
  my_in_dim = my_layer.linear1.in_features
263
 
 
265
  my_in_proj_w = my_layer.self_attn.in_proj_weight
266
  small_in_proj_w = small_layer.self_attn.in_proj_weight
267
 
268
+ my_in_proj_w.view(3, my_in_dim, my_in_dim)[
269
+ :, :small_in_dim, :small_in_dim
270
+ ] = small_in_proj_w.view(3, small_in_dim, small_in_dim)
271
+ my_layer.self_attn.in_proj_bias.view(3, my_in_dim)[:, :small_in_dim] = (
272
+ small_layer.self_attn.in_proj_bias.view(3, small_in_dim)
273
+ )
274
+
275
+ my_layer.self_attn.out_proj.weight[:small_in_dim, :small_in_dim] = (
276
+ small_layer.self_attn.out_proj.weight
277
+ )
278
+ my_layer.self_attn.out_proj.bias[:small_in_dim] = (
279
+ small_layer.self_attn.out_proj.bias
280
+ )
281
+
282
+ my_layer.linear1.weight[:small_hid_dim, :small_in_dim] = (
283
+ small_layer.linear1.weight
284
+ )
285
  my_layer.linear1.bias[:small_hid_dim] = small_layer.linear1.bias
286
 
287
+ my_layer.linear2.weight[:small_in_dim, :small_hid_dim] = (
288
+ small_layer.linear2.weight
289
+ )
290
  my_layer.linear2.bias[:small_in_dim] = small_layer.linear2.bias
291
 
292
+ my_layer.norm1.weight[:small_in_dim] = (
293
+ math.sqrt(small_in_dim / my_in_dim) * small_layer.norm1.weight
294
+ )
295
+ my_layer.norm2.weight[:small_in_dim] = (
296
+ math.sqrt(small_in_dim / my_in_dim) * small_layer.norm2.weight
297
+ )
298
 
299
  my_layer.norm1.bias[:small_in_dim] = small_layer.norm1.bias
300
  my_layer.norm2.bias[:small_in_dim] = small_layer.norm2.bias
 
308
  num_layers: the number of sub-encoder-layers in the encoder (required).
309
  norm: the layer normalization component (optional).
310
  """
311
+
312
+ __constants__ = ["norm"]
313
 
314
  def __init__(self, encoder_layer_creator, num_layers, norm=None):
315
  super().__init__()
316
+ self.layers = nn.ModuleList(
317
+ [encoder_layer_creator() for _ in range(num_layers)]
318
+ )
319
  self.num_layers = num_layers
320
  self.norm = norm
321
 
322
+ def forward(
323
+ self,
324
+ src: Tensor,
325
+ mask: Optional[Tensor] = None,
326
+ src_key_padding_mask: Optional[Tensor] = None,
327
+ ) -> Tensor:
328
  r"""Pass the input through the encoder layers in turn.
329
 
330
  Args:
 
338
  output = src
339
 
340
  for mod in self.layers:
341
+ output = mod(
342
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
343
+ )
344
 
345
  if self.norm is not None:
346
  output = self.norm(output)
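
Note: with the constructor reformatted above, a hedged instantiation sketch may help. `encoders.Linear` is used here with the (num_features, emsize) signature implied by the encoder generators in train.py, and the (x, y) tuple plus `single_eval_pos` follow the contract asserted in `forward`:

    import torch
    from lcpfn import encoders
    from lcpfn.transformer import TransformerModel

    emsize, n_out = 512, 1000
    model = TransformerModel(
        encoder=encoders.Linear(1, emsize),    # encodes x (1 feature) to emsize
        n_out=n_out,                           # e.g. one logit per BarDistribution bucket
        ninp=emsize,
        nhead=4,
        nhid=2 * emsize,
        nlayers=6,
        y_encoder=encoders.Linear(1, emsize),  # y is unsqueezed to (..., 1) in forward
    )
    x = torch.rand(20, 8, 1)                   # (seq_len, batch, features)
    y = torch.rand(20, 8)
    out = model((x, y), single_eval_pos=10)    # predicts positions 10..19
    print(out.shape)                           # expected: (10, 8, n_out)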
lcpfn/utils.py CHANGED
@@ -9,9 +9,12 @@ from torch import nn
9
  from torch.optim.lr_scheduler import LambdaLR
10
  import numpy as np
11
 
 
12
  # copied from huggingface
13
- def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
14
- """ Create a schedule with a learning rate that decreases following the
 
 
15
  values of the cosine function between 0 and `pi * cycles` after a warmup
16
  period during which it increases linearly between 0 and 1.
17
  """
@@ -19,13 +22,20 @@ def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_st
19
  def lr_lambda(current_step):
20
  if current_step < num_warmup_steps:
21
  return float(current_step) / float(max(1, num_warmup_steps))
22
- progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
23
- return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
 
 
 
 
24
 
25
  return LambdaLR(optimizer, lr_lambda, last_epoch)
26
 
 
27
  # copied from huggingface
28
- def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
 
 
29
  """
30
  Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
31
  a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
@@ -48,7 +58,9 @@ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_st
48
  if current_step < num_warmup_steps:
49
  return float(current_step) / float(max(1, num_warmup_steps))
50
  return max(
51
- 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
 
 
52
  )
53
 
54
  return LambdaLR(optimizer, lr_lambda, last_epoch)
@@ -65,7 +77,9 @@ def get_weighted_single_eval_pos_sampler(max_len):
65
  where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
66
  :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
67
  """
68
- return lambda: random.choices(range(max_len), [1 / (max_len - i) for i in range(max_len)])[0]
 
 
69
 
70
 
71
  def get_uniform_single_eval_pos_sampler(max_len, min_len=0):
@@ -95,19 +109,22 @@ def set_locals_in_self(locals):
95
  Especially useful right at the beginning of `__init__`.
96
  :param locals: `locals()`
97
  """
98
- self = locals['self']
99
  for var_name, val in locals.items():
100
- if var_name != 'self': setattr(self, var_name, val)
 
101
 
102
 
103
- default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'
104
 
105
 
106
  # Copied from StackOverflow, but we do an eval on the values additionally
107
  class StoreDictKeyPair(argparse.Action):
108
  def __init__(self, option_strings, dest, nargs=None, **kwargs):
109
  self._nargs = nargs
110
- super(StoreDictKeyPair, self).__init__(option_strings, dest, nargs=nargs, **kwargs)
 
 
111
 
112
  def __call__(self, parser, namespace, values, option_string=None):
113
  my_dict = {}
@@ -120,16 +137,20 @@ class StoreDictKeyPair(argparse.Action):
120
  setattr(namespace, self.dest, my_dict)
121
  print("dict values: {}".format(my_dict))
122
 
 
123
  def get_nan_value(v, set_value_to_nan=0.0):
124
  if random.random() < set_value_to_nan:
125
  return v
126
  else:
127
  return random.choice([-999, 0, 1, 999])
128
 
 
129
  def to_ranking(data):
130
- x = (data >= data.unsqueeze(-3))
131
  x = x.sum(0)
132
  return x
 
 
133
  # TODO: Is there a better way to do this?
134
  # 1. Cmparing to unique elements: When all values are different we still get quadratic blowup
135
  # 2. Argsort(Argsort()) returns ranking, but with duplicate values there is an ordering which is problematic
@@ -137,49 +158,64 @@ def to_ranking(data):
137
  def to_ranking_low_mem(data):
138
  x = torch.zeros_like(data)
139
  for col in range(data.shape[-1]):
140
- x_ = (data[:, :, col] >= data[:, :, col].unsqueeze(-2))
141
  x_ = x_.sum(0)
142
  x[:, :, col] = x_
143
  return x
144
 
 
145
  def nan_handling_missing_for_unknown_reason_value(set_value_to_nan=0.0):
146
- return get_nan_value(float('nan'), set_value_to_nan)
 
147
 
148
  def nan_handling_missing_for_no_reason_value(set_value_to_nan=0.0):
149
- return get_nan_value(float('-inf'), set_value_to_nan)
 
150
 
151
  def nan_handling_missing_for_a_reason_value(set_value_to_nan=0.0):
152
- return get_nan_value(float('inf'), set_value_to_nan)
 
153
 
154
  def torch_nanmean(x, axis=0):
155
- num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(axis=axis)
 
 
156
  value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
157
  return value / num
158
 
 
159
  def torch_nanstd(x, axis=0):
160
- num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(axis=axis)
 
 
161
  value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
162
  mean = value / num
163
- mean_broadcast = torch.repeat_interleave(mean.unsqueeze(axis), x.shape[axis], dim=axis)
164
- return torch.sqrt(torch.nansum(torch.square(mean_broadcast - x), axis=axis) / (num - 1))
 
 
 
 
 
165
 
166
  def normalize_data(data, normalize_positions=-1):
167
  if normalize_positions > 0:
168
  mean = torch_nanmean(data[:normalize_positions], axis=0)
169
- std = torch_nanstd(data[:normalize_positions], axis=0) + .000001
170
  else:
171
  mean = torch_nanmean(data, axis=0)
172
- std = torch_nanstd(data, axis=0) + .000001
173
  data = (data - mean) / std
174
  data = torch.clip(data, min=-100, max=100)
175
 
176
  return data
177
 
 
178
  def remove_outliers(X, n_sigma=4):
179
  # Expects T, B, H
180
  assert len(X.shape) == 3, "X must be T,B,H"
181
- #for b in range(X.shape[1]):
182
- #for col in range(X.shape[2]):
183
  data = X
184
  data_mean, data_std = torch_nanmean(data, axis=0), torch_nanstd(data, axis=0)
185
  cut_off = data_std * n_sigma
@@ -187,17 +223,26 @@ def remove_outliers(X, n_sigma=4):
187
 
188
  data_clean = X[:].clone()
189
  data_clean[torch.logical_or(data > upper, data < lower)] = np.nan
190
- data_mean, data_std = torch_nanmean(data_clean, axis=0), torch_nanstd(data_clean, axis=0)
 
 
 
191
  cut_off = data_std * n_sigma
192
  lower, upper = data_mean - cut_off, data_mean + cut_off
193
 
194
- X = torch.maximum(-torch.log(1+torch.abs(X)) + lower, X)
195
- X = torch.minimum(torch.log(1+torch.abs(X)) + upper, X)
196
- # print(ds[1][data < lower, col], ds[1][data > upper, col], ds[1][~np.isnan(data), col].shape, data_mean, data_std)
197
  return X
198
 
 
199
  def bool_mask_to_att_mask(mask):
200
- return mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
 
 
 
 
 
201
 
202
  def print_on_master_only(is_master):
203
  import builtins as __builtin__
@@ -213,46 +258,152 @@ def print_on_master_only(is_master):
213
 
214
 
215
  def init_dist(device):
216
- print('init dist')
217
- if 'LOCAL_RANK' in os.environ:
218
  # launched with torch.distributed.launch
219
  rank = int(os.environ["LOCAL_RANK"])
220
- print('torch.distributed.launch and my rank is', rank)
221
  torch.cuda.set_device(rank)
222
- os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
223
- torch.distributed.init_process_group(backend="nccl", init_method="env://", timeout=datetime.timedelta(seconds=20),
224
- world_size=torch.cuda.device_count(), rank=rank)
 
 
 
 
 
225
  torch.distributed.barrier()
226
  print_on_master_only(rank == 0)
227
- print(f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
228
- "only I can print, but when using print(..., force=True) it will print on all ranks.")
229
- return True, rank, f'cuda:{rank}'
230
- elif 'SLURM_PROCID' in os.environ and torch.cuda.device_count() > 1:
 
 
231
  # this is for multi gpu when starting with submitit
232
- assert device != 'cpu:0'
233
- rank = int(os.environ['SLURM_PROCID'])
234
- os.environ['MASTER_ADDR'] = 'localhost'
235
- os.environ['MASTER_PORT'] = '12355'
236
  torch.cuda.set_device(rank)
237
- os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
238
- print('distributed submitit launch and my rank is', rank)
239
- torch.distributed.init_process_group(backend="nccl", init_method="env://", timeout=datetime.timedelta(seconds=20),
240
- world_size=torch.cuda.device_count(), rank=rank)
 
 
 
 
 
241
  torch.distributed.barrier()
242
  print_on_master_only(rank == 0)
243
- print(f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
244
- "only I can print, but when using print(..., force=True) it will print on all ranks.")
 
 
245
 
246
- return True, rank, f'cuda:{rank}'
247
  else:
248
- print('Not using distributed')
249
  # will not change any of the behavior of print, but allows putting the force=True in the print calls
250
  print_on_master_only(True)
251
  return False, 0, device
252
 
253
 
254
  def check_compatibility(dl):
255
- if hasattr(dl, 'num_outputs'):
256
- print('`num_outputs` for the DataLoader is deprecated. It is assumed to be 1 from now on.')
257
- assert dl.num_outputs != 1, "We assume num_outputs to be 1. Instead of the num_ouputs change your loss." \
258
- "We specify the number of classes in the CE loss."
 
 
 
 
 
 
 
 
 
 
 
 
9
  from torch.optim.lr_scheduler import LambdaLR
10
  import numpy as np
11
 
12
+
13
  # copied from huggingface
14
+ def get_cosine_schedule_with_warmup(
15
+ optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1
16
+ ):
17
+ """Create a schedule with a learning rate that decreases following the
18
  values of the cosine function between 0 and `pi * cycles` after a warmup
19
  period during which it increases linearly between 0 and 1.
20
  """
 
22
  def lr_lambda(current_step):
23
  if current_step < num_warmup_steps:
24
  return float(current_step) / float(max(1, num_warmup_steps))
25
+ progress = float(current_step - num_warmup_steps) / float(
26
+ max(1, num_training_steps - num_warmup_steps)
27
+ )
28
+ return max(
29
+ 0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
30
+ )
31
 
32
  return LambdaLR(optimizer, lr_lambda, last_epoch)
33
 
34
+
35
  # copied from huggingface
36
+ def get_linear_schedule_with_warmup(
37
+ optimizer, num_warmup_steps, num_training_steps, last_epoch=-1
38
+ ):
39
  """
40
  Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
41
  a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
 
58
  if current_step < num_warmup_steps:
59
  return float(current_step) / float(max(1, num_warmup_steps))
60
  return max(
61
+ 0.0,
62
+ float(num_training_steps - current_step)
63
+ / float(max(1, num_training_steps - num_warmup_steps)),
64
  )
65
 
66
  return LambdaLR(optimizer, lr_lambda, last_epoch)
 
77
  where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
78
  :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
79
  """
80
+ return lambda: random.choices(
81
+ range(max_len), [1 / (max_len - i) for i in range(max_len)]
82
+ )[0]
83
 
84
 
85
  def get_uniform_single_eval_pos_sampler(max_len, min_len=0):
 
109
  Especially useful right at the beginning of `__init__`.
110
  :param locals: `locals()`
111
  """
112
+ self = locals["self"]
113
  for var_name, val in locals.items():
114
+ if var_name != "self":
115
+ setattr(self, var_name, val)
116
 
117
 
118
+ default_device = "cuda:0" if torch.cuda.is_available() else "cpu:0"
119
 
120
 
121
  # Copied from StackOverflow, but we do an eval on the values additionally
122
  class StoreDictKeyPair(argparse.Action):
123
  def __init__(self, option_strings, dest, nargs=None, **kwargs):
124
  self._nargs = nargs
125
+ super(StoreDictKeyPair, self).__init__(
126
+ option_strings, dest, nargs=nargs, **kwargs
127
+ )
128
 
129
  def __call__(self, parser, namespace, values, option_string=None):
130
  my_dict = {}
 
137
  setattr(namespace, self.dest, my_dict)
138
  print("dict values: {}".format(my_dict))
139
 
140
+
141
  def get_nan_value(v, set_value_to_nan=0.0):
142
  if random.random() < set_value_to_nan:
143
  return v
144
  else:
145
  return random.choice([-999, 0, 1, 999])
146
 
147
+
148
  def to_ranking(data):
149
+ x = data >= data.unsqueeze(-3)
150
  x = x.sum(0)
151
  return x
152
+
153
+
154
  # TODO: Is there a better way to do this?
155
  # 1. Comparing to unique elements: When all values are different we still get quadratic blowup
156
  # 2. Argsort(Argsort()) returns ranking, but with duplicate values there is an ordering which is problematic
 
158
  def to_ranking_low_mem(data):
159
  x = torch.zeros_like(data)
160
  for col in range(data.shape[-1]):
161
+ x_ = data[:, :, col] >= data[:, :, col].unsqueeze(-2)
162
  x_ = x_.sum(0)
163
  x[:, :, col] = x_
164
  return x
165
 
166
+
167
  def nan_handling_missing_for_unknown_reason_value(set_value_to_nan=0.0):
168
+ return get_nan_value(float("nan"), set_value_to_nan)
169
+
170
 
171
  def nan_handling_missing_for_no_reason_value(set_value_to_nan=0.0):
172
+ return get_nan_value(float("-inf"), set_value_to_nan)
173
+
174
 
175
  def nan_handling_missing_for_a_reason_value(set_value_to_nan=0.0):
176
+ return get_nan_value(float("inf"), set_value_to_nan)
177
+
178
 
179
  def torch_nanmean(x, axis=0):
180
+ num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(
181
+ axis=axis
182
+ )
183
  value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
184
  return value / num
185
 
186
+
187
  def torch_nanstd(x, axis=0):
188
+ num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(
189
+ axis=axis
190
+ )
191
  value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
192
  mean = value / num
193
+ mean_broadcast = torch.repeat_interleave(
194
+ mean.unsqueeze(axis), x.shape[axis], dim=axis
195
+ )
196
+ return torch.sqrt(
197
+ torch.nansum(torch.square(mean_broadcast - x), axis=axis) / (num - 1)
198
+ )
199
+
200
 
201
  def normalize_data(data, normalize_positions=-1):
202
  if normalize_positions > 0:
203
  mean = torch_nanmean(data[:normalize_positions], axis=0)
204
+ std = torch_nanstd(data[:normalize_positions], axis=0) + 0.000001
205
  else:
206
  mean = torch_nanmean(data, axis=0)
207
+ std = torch_nanstd(data, axis=0) + 0.000001
208
  data = (data - mean) / std
209
  data = torch.clip(data, min=-100, max=100)
210
 
211
  return data
212
 
213
+
214
  def remove_outliers(X, n_sigma=4):
215
  # Expects T, B, H
216
  assert len(X.shape) == 3, "X must be T,B,H"
217
+ # for b in range(X.shape[1]):
218
+ # for col in range(X.shape[2]):
219
  data = X
220
  data_mean, data_std = torch_nanmean(data, axis=0), torch_nanstd(data, axis=0)
221
  cut_off = data_std * n_sigma
 
223
 
224
  data_clean = X[:].clone()
225
  data_clean[torch.logical_or(data > upper, data < lower)] = np.nan
226
+ data_mean, data_std = (
227
+ torch_nanmean(data_clean, axis=0),
228
+ torch_nanstd(data_clean, axis=0),
229
+ )
230
  cut_off = data_std * n_sigma
231
  lower, upper = data_mean - cut_off, data_mean + cut_off
232
 
233
+ X = torch.maximum(-torch.log(1 + torch.abs(X)) + lower, X)
234
+ X = torch.minimum(torch.log(1 + torch.abs(X)) + upper, X)
235
+ # print(ds[1][data < lower, col], ds[1][data > upper, col], ds[1][~np.isnan(data), col].shape, data_mean, data_std)
236
  return X
237
 
238
+
239
  def bool_mask_to_att_mask(mask):
240
+ return (
241
+ mask.float()
242
+ .masked_fill(mask == 0, float("-inf"))
243
+ .masked_fill(mask == 1, float(0.0))
244
+ )
245
+
246
 
247
  def print_on_master_only(is_master):
248
  import builtins as __builtin__
 
258
 
259
 
260
  def init_dist(device):
261
+ print("init dist")
262
+ if "LOCAL_RANK" in os.environ:
263
  # launched with torch.distributed.launch
264
  rank = int(os.environ["LOCAL_RANK"])
265
+ print("torch.distributed.launch and my rank is", rank)
266
  torch.cuda.set_device(rank)
267
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
268
+ torch.distributed.init_process_group(
269
+ backend="nccl",
270
+ init_method="env://",
271
+ timeout=datetime.timedelta(seconds=20),
272
+ world_size=torch.cuda.device_count(),
273
+ rank=rank,
274
+ )
275
  torch.distributed.barrier()
276
  print_on_master_only(rank == 0)
277
+ print(
278
+ f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
279
+ "only I can print, but when using print(..., force=True) it will print on all ranks."
280
+ )
281
+ return True, rank, f"cuda:{rank}"
282
+ elif "SLURM_PROCID" in os.environ and torch.cuda.device_count() > 1:
283
  # this is for multi gpu when starting with submitit
284
+ assert device != "cpu:0"
285
+ rank = int(os.environ["SLURM_PROCID"])
286
+ os.environ["MASTER_ADDR"] = "localhost"
287
+ os.environ["MASTER_PORT"] = "12355"
288
  torch.cuda.set_device(rank)
289
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
290
+ print("distributed submitit launch and my rank is", rank)
291
+ torch.distributed.init_process_group(
292
+ backend="nccl",
293
+ init_method="env://",
294
+ timeout=datetime.timedelta(seconds=20),
295
+ world_size=torch.cuda.device_count(),
296
+ rank=rank,
297
+ )
298
  torch.distributed.barrier()
299
  print_on_master_only(rank == 0)
300
+ print(
301
+ f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
302
+ "only I can print, but when using print(..., force=True) it will print on all ranks."
303
+ )
304
 
305
+ return True, rank, f"cuda:{rank}"
306
  else:
307
+ print("Not using distributed")
308
  # will not change any of the behavior of print, but allows putting the force=True in the print calls
309
  print_on_master_only(True)
310
  return False, 0, device
311
 
312
 
313
  def check_compatibility(dl):
314
+ if hasattr(dl, "num_outputs"):
315
+ print(
316
+ "`num_outputs` for the DataLoader is deprecated. It is assumed to be 1 from now on."
317
+ )
318
+ assert dl.num_outputs != 1, (
319
+ "We assume num_outputs to be 1. Instead of the num_ouputs change your loss."
320
+ "We specify the number of classes in the CE loss."
321
+ )
322
+
323
+
324
+ def pfn_normalize(
325
+ lb=torch.tensor(float("-inf")),
326
+ ub=torch.tensor(float("inf")),
327
+ soft_lb=0.0,
328
+ soft_ub=1.0,
329
+ minimize=False,
330
+ ):
331
+ """
332
+ The LC-PFN curve prior assumes curves that are normalized to the range [0,1] and meant to be maximized.
333
+ This function returns a pair of functions to normalize and denormalize data to fit this assumption.
334
+
335
+ Parameters:
336
+ lb (torch.Tensor): Lower bound of the data.
337
+ ub (torch.Tensor): Upper bound of the data.
338
+ soft_lb (float): Soft lower bound for normalization. Default is 0.0.
339
+ soft_ub (float): Soft upper bound for normalization. Default is 1.0.
340
+ minimize (bool): If True, the original curve is a minimization (lower is better). Default is False.
341
+
342
+ Returns: Two functions for normalizing and denormalizing the data.
343
+ """
344
+ assert lb <= soft_lb and soft_lb < soft_ub and soft_ub <= ub
345
+ # step 1: linearly transform [soft_lb,soft_ub] --> [-1,1] (where the sigmoid behaves approx. linearly)
346
+ # 2.0/(soft_ub - soft_lb)*(x - soft_lb) - 1.0
347
+ # step 2: apply a vertically scaled/shifted sigmoid such that [lb,ub] --> [0,1]
348
+
349
+ def cinv(x):
350
+ return 1 - x if minimize else x
351
+
352
+ def lin_soft(x):
353
+ return 2 / (soft_ub - soft_lb) * (x - soft_lb) - 1
354
+
355
+ def lin_soft_inv(y):
356
+ return (y + 1) / 2 * (soft_ub - soft_lb) + soft_lb
357
+
358
+ try:
359
+ if torch.exp(-lin_soft(lb)) > 1e300:
360
+ raise RuntimeError
361
+ # otherwise overflow causes issues, treat these cases as if the lower bound was -infinite
362
+ # print(f"WARNING: {lb} --> NINF to avoid overflows ({np.exp(-lin_soft(lb))})")
363
+ except RuntimeError:
364
+ lb = torch.tensor(float("-inf"))
365
+ if torch.isinf(lb) and torch.isinf(ub):
366
+ return lambda x: cinv(
367
+ 1 / (1 + torch.exp(-lin_soft(x)))
368
+ ), lambda y: lin_soft_inv(torch.log(cinv(y) / (1 - cinv(y))))
369
+ elif torch.isinf(lb):
370
+ a = 1 + torch.exp(-lin_soft(ub))
371
+ return lambda x: cinv(
372
+ a / (1 + torch.exp(-lin_soft(x)))
373
+ ), lambda y: lin_soft_inv(torch.log((cinv(y) / a) / (1 - (cinv(y) / a))))
374
+ elif torch.isinf(ub):
375
+ a = 1 / (1 - 1 / (1 + torch.exp(-lin_soft(lb))))
376
+ b = 1 - a
377
+ return lambda x: cinv(
378
+ a / (1 + torch.exp(-lin_soft(x))) + b
379
+ ), lambda y: lin_soft_inv(
380
+ torch.log(((cinv(y) - b) / a) / (1 - ((cinv(y) - b) / a)))
381
+ )
382
+ else:
383
+ a = (
384
+ 1
385
+ + torch.exp(-lin_soft(ub))
386
+ + torch.exp(-lin_soft(lb))
387
+ + torch.exp(-lin_soft(ub) - lin_soft(lb))
388
+ ) / (torch.exp(-lin_soft(lb)) - torch.exp(-lin_soft(ub)))
389
+ b = -a / (1 + torch.exp(-lin_soft(lb)))
390
+ return lambda x: cinv(
391
+ a / (1 + torch.exp(-lin_soft(x))) + b
392
+ ), lambda y: lin_soft_inv(
393
+ torch.log(((cinv(y) - b) / a) / (1 - ((cinv(y) - b) / a)))
394
+ )
395
+
396
+
397
+ def get_default_normalizer():
398
+ default_normalizer_kwargs = {
399
+ "lb": torch.tensor(0.0),
400
+ "ub": torch.tensor(1.0),
401
+ "soft_lb": 0.0,
402
+ "soft_ub": 1.0,
403
+ "minimize": False,
404
+ }
405
+ return pfn_normalize(**default_normalizer_kwargs)
406
+
407
+
408
+ def identity_normalizer():
409
+ return lambda x: x, lambda x: x
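
Note: `pfn_normalize` returns a (normalize, denormalize) pair. A short round-trip sketch for an unbounded loss curve that is to be minimized (the bounds here are illustrative):

    import torch
    from lcpfn.utils import pfn_normalize

    # losses live in [0, inf) and lower is better, so flip and squash them into [0, 1]
    normalize, denormalize = pfn_normalize(
        lb=torch.tensor(0.0), ub=torch.tensor(float("inf")), minimize=True
    )
    losses = torch.tensor([2.3, 1.1, 0.4, 0.2])
    curve = normalize(losses)       # increasing, inside [0, 1], as the prior assumes
    recovered = denormalize(curve)  # round-trips back to the original losses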
lcpfn/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "0.1.3"
pyproject.toml ADDED
@@ -0,0 +1,42 @@
1
+ [project]
2
+ name = "lcpfn"
3
+ description = "In-context Bayesian Learning Curve Extrapolation"
4
+ readme = {file = "readme.md", content-type = 'text/markdown'}
5
+ license = {file = "LICENSE"}
6
+ authors = [
7
+ {name = "Steven Adriaensen", email= "adriaens@cs.uni-freiburg.de"},
8
+ {name = "Herilalaina Rakotoarison", email = "rakotoah@cs.uni-freiburg.de"},
9
+ {name = "Samuel Müller", email = "muellesa@cs.uni-freiburg.de"},
10
+ {name = "Frank Hutter", email = "fh@cs.uni-freiburg.de"},
11
+ ]
12
+ requires-python = ">=3.9,<3.12"
13
+ dependencies = [
14
+ "torch<=1.11.0",
15
+ "numpy>=1.21.2,<2",
16
+ "requests>=2.23.0"
17
+ ]
18
+ dynamic = ["version"]
19
+ classifiers = [
20
+ 'Intended Audience :: Science/Research',
21
+ 'License :: OSI Approved :: MIT License',
22
+ 'Programming Language :: Python',
23
+ 'Topic :: Software Development',
24
+ 'Topic :: Scientific/Engineering',
25
+ 'Operating System :: Unix',
26
+ 'Operating System :: MacOS',
27
+ 'Programming Language :: Python :: 3',
28
+ 'Programming Language :: Python :: 3.9',
29
+ 'Programming Language :: Python :: 3.10',
30
+ 'Programming Language :: Python :: 3.11',
31
+ ]
32
+
33
+ [project.urls]
34
+ homepage = "https://github.com/automl/lcpfn"
35
+ repository = "https://github.com/automl/lcpfn"
36
+ bugtracker = "https://github.com/automl/lcpfn/issues"
37
+
38
+ [tool.setuptools.packages.find]
39
+ include = ["lcpfn*"]
40
+
41
+ [tool.setuptools.dynamic]
42
+ version = {attr = "lcpfn.version.__version__"}
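
Note: with `dynamic = ["version"]` resolved from `lcpfn.version.__version__`, the installed distribution and the module agree on a single version string. A quick sanity check after a local `pip install .`:

    from lcpfn.version import __version__
    print(__version__)  # "0.1.3"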
requirements.txt DELETED
@@ -1,4 +0,0 @@
1
- torch==1.11.0
2
- numpy>=1.21.2
3
- scikit-learn
4
- # lcpfn @ git+https://github.com/automl/lcpfn.git