Spaces:

herilalaina
/

lcpfn

Sleeping

App Files Files Community

herilalaina committed on Oct 15, 2024

Commit

328c052

1 Parent(s): b62776c

update

Browse files

Files changed (30) hide show

lcpfn/.ipynb_checkpoints/__init__-checkpoint.py +0 -53
lcpfn/.ipynb_checkpoints/curves-checkpoint.py +0 -277
lcpfn/.ipynb_checkpoints/domhan_prior-checkpoint.py +0 -195
lcpfn/__init__.py +0 -80
lcpfn/bar_distribution.py +0 -349
lcpfn/curves.py +0 -277
lcpfn/decoders.py +0 -42
lcpfn/domhan_prior.py +0 -199
lcpfn/encoders.py +0 -190
lcpfn/initializers.py +0 -11
lcpfn/layer.py +0 -179
lcpfn/model.py +0 -56
lcpfn/positional_encodings.py +0 -78
lcpfn/priors/__init__.py +0 -1
lcpfn/priors/binarized_regression.py +0 -19
lcpfn/priors/fast_gp.py +0 -143
lcpfn/priors/fast_gp_mix.py +0 -394
lcpfn/priors/gp.py +0 -69
lcpfn/priors/prior.py +0 -25
lcpfn/priors/pyro.py +0 -41
lcpfn/priors/ridge.py +0 -37
lcpfn/priors/stroke.py +0 -143
lcpfn/priors/utils.py +0 -151
lcpfn/train.py +0 -336
lcpfn/train_lcpfn.py +0 -96
lcpfn/transformer.py +0 -348
lcpfn/utils.py +0 -409
lcpfn/version.py +0 -1
pyproject.toml +0 -42
requirements.txt +3 -0

lcpfn/.ipynb_checkpoints/__init__-checkpoint.py DELETED Viewed

@@ -1,53 +0,0 @@
-import os, sys
-sys.path.insert(0, os.path.dirname(__file__))
-model_path = 'trained_models'
-def prepare_models():
-    """Make sure both pretrained checkpoints exist next to this module.
-    For each known checkpoint that is missing, download its ``.gz`` archive
-    (unless one is already on disk) and decompress it in place, keeping the
-    archive. Progress and failures are reported with ``print``; nothing is
-    returned. Connection errors raised by ``requests`` propagate to the caller.
-    """
-    import gzip
-    import shutil
-    pfns4bo_dir = os.path.dirname(__file__)
-    model_names = ['pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt',
-                   'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt']
-    for name in model_names:
-        weights_path = os.path.join(pfns4bo_dir, model_path, name)
-        compressed_weights_path = os.path.join(pfns4bo_dir, model_path, name + '.gz')
-        if not os.path.exists(weights_path):
-            if not os.path.exists(compressed_weights_path):
-                print("Downloading", os.path.abspath(compressed_weights_path))
-                import requests
-                url = f'https://github.com/automl/lcpfn/raw/main/lcpfn/trained_models/{name + ".gz"}'
-                r = requests.get(url, allow_redirects=True)
-                # Only persist a successful response; an HTTP error page saved as
-                # ``.gz`` would otherwise be mistaken for a valid archive later.
-                if r.ok:
-                    os.makedirs(os.path.dirname(compressed_weights_path), exist_ok=True)
-                    with open(compressed_weights_path, 'wb') as f:
-                        f.write(r.content)
-                else:
-                    print("Download failed with HTTP status", r.status_code)
-            if os.path.exists(compressed_weights_path):
-                print("Unzipping", name)
-                # Decompress in-process instead of shelling out to ``gzip -dk``:
-                # no shell quoting problems and no external binary required.
-                try:
-                    with gzip.open(compressed_weights_path, 'rb') as src:
-                        with open(weights_path, 'wb') as dst:
-                            shutil.copyfileobj(src, dst)
-                except (OSError, EOFError) as err:
-                    # Do not leave a truncated checkpoint behind.
-                    if os.path.exists(weights_path):
-                        os.remove(weights_path)
-                    print("Failed to unzip", compressed_weights_path, "-", err)
-            else:
-                print("Failed to find", compressed_weights_path)
-                print("Make sure you have an internet connection to download the model automatically..")
-        if os.path.exists(weights_path):
-            print("Successfully located model at", weights_path)
-# Maps a public model tag to the expected on-disk path of its checkpoint,
-# located in the ``model_path`` directory next to this module.
-model_dict = {
-    'EMSIZE512_NLAYERS12_NBUCKETS1000': os.path.join(os.path.dirname(__file__),model_path,
-                                              'pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt'),
-    'EMSIZE512_NLAYERS6_NBUCKETS1000': os.path.join(os.path.dirname(__file__),model_path,
-                                    'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt'),
-}
-def __getattr__(name):
-    """Module-level attribute hook that resolves model tags to checkpoint paths.
-    For a tag listed in ``model_dict`` the checkpoint path is returned; if the
-    file is not on disk yet, ``prepare_models()`` is run first. Any other name
-    raises ``AttributeError``.
-    """
-    if name not in model_dict:
-        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
-    checkpoint = model_dict[name]
-    if not os.path.exists(checkpoint):
-        print("Can't find", os.path.abspath(checkpoint), "thus unzipping/downloading models now.")
-        print("This might take a while..")
-        prepare_models()
-    return checkpoint
-from lcpfn.model import LCPFN
-from lcpfn.train_lcpfn import train_lcpfn
-from lcpfn.domhan_prior import sample_from_prior, create_get_batch_func

lcpfn/.ipynb_checkpoints/curves-checkpoint.py DELETED Viewed

@@ -1,277 +0,0 @@
-import numpy as np
-from collections import OrderedDict
-# Sampling specification for the parameters of the pow3, ilog2 and janoschek
-# curves. Layout: prior[curve][variant][parameter] -> {"type", "param1", "param2"},
-# where "type" is one of the families understood by prior_sampler and
-# param1/param2 are passed to it positionally (bounds for "uniform",
-# the two lognormal arguments for "log_normal").
-prior = {
-    "pow3": {
-        "uniform": OrderedDict(
-            a={"type": "uniform", "param1": -1, "param2": 1},
-            c={"type": "uniform", "param1": 0, "param2": 1},
-            alpha={"type": "uniform", "param1": 0, "param2": 1},
-        ),
-        "peaked": OrderedDict(
-            a={"type": "uniform", "param1": -0.6, "param2": 0.6},
-            c={"type": "uniform", "param1": 0, "param2": 1.25},
-            alpha={"type": "log_normal", "param1": 0, "param2": 2},
-        ),
-    },
-    "ilog2": {
-        "uniform": OrderedDict(
-            c={"type": "uniform", "param1": 0, "param2": 1},
-            a={"type": "uniform", "param1": -1, "param2": 1},
-        ),
-        "peaked": OrderedDict(
-            c={"type": "uniform", "param1": 0, "param2": 1},
-            a={"type": "uniform", "param1": -0.5, "param2": 0.5},
-        ),
-    },
-    "janoschek": {
-        "uniform": OrderedDict(
-            a={"type": "uniform", "param1": 0, "param2": 1},
-            beta={"type": "uniform", "param1": 0, "param2": 2},
-            k={"type": "uniform", "param1": 0, "param2": 1},
-            delta={"type": "uniform", "param1": -5, "param2": 5},
-        ),
-        "peaked": OrderedDict(
-            a={"type": "uniform", "param1": 0, "param2": 1},
-            beta={"type": "uniform", "param1": 0, "param2": 2},
-            k={"type": "log_normal", "param1": -2, "param2": 1},
-            delta={"type": "log_normal", "param1": 0, "param2": 0.5},
-        ),
-    },
-}
-def prior_sampler(rng, type, param1, param2):
-    """Draw one value from ``rng`` using the distribution family named by ``type``.
-    ``"uniform"`` forwards to ``rng.uniform(param1, param2)`` and ``"log_normal"``
-    to ``rng.lognormal(param1, param2)``; any other name raises ``Exception``.
-    """
-    if type == "log_normal":
-        return rng.lognormal(param1, param2)
-    if type == "uniform":
-        return rng.uniform(param1, param2)
-    raise Exception("Unknown prior type: {}".format(type))
-def pow3(x, c, a, alpha):
-    """Three-parameter power law: ``c - a * x ** (-alpha)``."""
-    decay = x ** (-alpha)
-    return c - a * decay
-def prior_pow3(rng):
-    """Sample the pow3 parameters ``a``, ``c``, ``alpha`` (drawn in that order)
-    from the "peaked" specification in ``prior``."""
-    sampled = {}
-    for p in ["a", "c", "alpha"]:
-        spec = prior["pow3"]["peaked"][p]
-        sampled[p] = prior_sampler(
-            rng, spec["type"], param1=spec["param1"], param2=spec["param2"]
-        )
-    return sampled
-def uniform_prior_pow3(rng):
-    """Sample the pow3 parameters ``a``, ``c``, ``alpha`` (drawn in that order)
-    from the "uniform" specification in ``prior``."""
-    draws = {}
-    for key in ["a", "c", "alpha"]:
-        entry = prior["pow3"]["uniform"][key]
-        draws[key] = prior_sampler(
-            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
-        )
-    return draws
-def ilog2(x, c, a):
-    """Inverse-log curve: ``c - a / log(x + 1)``."""
-    log_term = np.log(x + 1)
-    return c - a / log_term
-def prior_ilog2(rng):
-    """Sample the ilog2 parameters ``a`` then ``c`` from the "peaked"
-    specification in ``prior``."""
-    sampled = {}
-    for p in ["a", "c"]:
-        spec = prior["ilog2"]["peaked"][p]
-        sampled[p] = prior_sampler(
-            rng, spec["type"], param1=spec["param1"], param2=spec["param2"]
-        )
-    return sampled
-def uniform_prior_ilog2(rng):
-    """Sample the ilog2 parameters ``a`` then ``c`` from the "uniform"
-    specification in ``prior``."""
-    draws = {}
-    for key in ["a", "c"]:
-        entry = prior["ilog2"]["uniform"][key]
-        draws[key] = prior_sampler(
-            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
-        )
-    return draws
-def janoschek(x, a, beta, k, delta):
-    """
-    Janoschek growth curve: ``a - (a - beta) * exp(-k * x**delta)``.
-    Reference: http://www.pisces-conservation.com/growthhelp/janoschek.htm
-    """
-    damping = np.exp(-k * x**delta)
-    gap = a - beta
-    return a - gap * damping
-def prior_janoschek(rng):
-    """Sample the janoschek parameters ``a``, ``beta``, ``k``, ``delta`` (drawn
-    in that order) from the "peaked" specification in ``prior``."""
-    sampled = {}
-    for p in ["a", "beta", "k", "delta"]:
-        spec = prior["janoschek"]["peaked"][p]
-        sampled[p] = prior_sampler(
-            rng, spec["type"], param1=spec["param1"], param2=spec["param2"]
-        )
-    return sampled
-def uniform_prior_janoschek(rng):
-    """Sample the janoschek parameters ``a``, ``beta``, ``k``, ``delta`` (drawn
-    in that order) from the "uniform" specification in ``prior``."""
-    draws = {}
-    for key in ["a", "beta", "k", "delta"]:
-        entry = prior["janoschek"]["uniform"][key]
-        draws[key] = prior_sampler(
-            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
-        )
-    return draws
-def log_power(x, a, b, c):
-    """Log-power curve: ``a / (1 + (x / exp(b)) ** c)``.
-    Per the original notes, ``a`` is the upper bound and ``c`` the growth rate.
-    """
-    scaled = x / np.exp(b)
-    denominator = 1.0 + scaled ** c
-    return a / denominator
-def prior_log_power(rng):
-    """Sample log_power parameters: a ~ N(0.8, 0.1), b ~ N(1, 1), c ~ U(-3, 0)."""
-    # Dict values are evaluated top to bottom, so the draw order is a, b, c.
-    return {
-        "a": rng.normal(0.8, 0.1),
-        "b": rng.normal(1.0, 1.0),
-        "c": rng.uniform(-3.0, 0.0),
-    }
-def weibull(x, alpha, beta, kappa, delta):
-    """
-    Weibull growth model: ``alpha - (alpha - beta) * exp(-(kappa * x) ** delta)``.
-    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
-    alpha: upper asymptote
-    beta: lower asymptote
-    kappa: growth rate
-    delta: controls the x-ordinate for the point of inflection
-    """
-    exponent = (kappa * x) ** delta
-    span = alpha - beta
-    return alpha - span * np.exp(-exponent)
-def prior_weibull(rng):
-    """Sample weibull parameters: alpha ~ U(0, 1.5), beta ~ U(0, 1),
-    ln(kappa) ~ N(-2, 1), ln(delta) ~ N(0, 0.5)."""
-    # Dict values are evaluated in listed order, which fixes the draw order.
-    return {
-        "alpha": rng.uniform(0.0, 1.5),
-        "beta": rng.uniform(0.0, 1),
-        "kappa": np.exp(rng.normal(-2.0, 1.0)),
-        "delta": np.exp(rng.normal(0, 0.5)),
-    }
-def mmf(x, alpha, beta, kappa, delta):
-    """
-    Morgan-Mercer-Flodin curve:
-    ``alpha - (alpha - beta) / (1 + (kappa * x) ** delta)``.
-    References:
-    Nonlinear Regression page 342
-    http://bit.ly/1jodG17
-    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
-    alpha: upper asymptote
-    kappa: growth rate
-    beta: initial value
-    delta: controls the point of inflection
-    """
-    denominator = 1.0 + (kappa * x) ** delta
-    span = alpha - beta
-    return alpha - span / denominator
-def prior_mmf(rng):
-    """Sample mmf parameters: alpha ~ N(0.8, 0.1), beta ~ N(0.2, 0.1),
-    ln(kappa) ~ N(0, 2), ln(delta) ~ N(0, 1)."""
-    # Dict values are evaluated in listed order, which fixes the draw order.
-    return {
-        "alpha": rng.normal(0.8, 0.1),
-        "beta": rng.normal(0.2, 0.1),
-        "kappa": np.exp(rng.normal(0, 2)),
-        "delta": np.exp(rng.normal(0, 1)),
-    }
-def vap(x, a, b, c):
-    """Vapor pressure model: ``exp(a + b / x + c * log(x))``."""
-    # Notes carried over from the original:
-    #   no upper bound if c > 0
-    #   a = ln(upper bound) for c=0
-    #   a+b = ln(initial)
-    log_value = a + b / x + c * np.log(x)
-    return np.exp(log_value)
-def prior_vap(rng):
-    """Sample vap parameters: a ~ U(-2, 0), b ~ U(-4, 0), ln(c) ~ U(-8, 0)."""
-    # The a/b ranges carry an open "@heri: range check" note in the original;
-    # c uses the same form as the weights ("@heri: same as weights").
-    return {
-        "a": rng.uniform(-2.0, 0.0),
-        "b": rng.uniform(-4.0, 0.0),
-        "c": np.exp(rng.uniform(-8.0, 0.0)),
-    }
-def loglog_linear(x, a, b):
-    """Log of a linear function of ``log(x)``: ``log(a * log(x) + b)``."""
-    log_x = np.log(x)
-    return np.log(a * log_x + b)
-def prior_loglog_linear(rng):
-    """Sample loglog_linear parameters: ln(a) ~ N(-2, 1), ln(b) ~ U(0, 1)."""
-    # Dict values are evaluated in listed order: a is drawn before b.
-    return {
-        "a": np.exp(rng.normal(-2.0, 1.0)),
-        "b": np.exp(rng.uniform(0.0, 1.0)),
-    }
-def exp4(x, c, a, b, alpha):
-    """Four-parameter exponential: ``c - exp(-a * x**alpha + b)``."""
-    exponent = -a * (x**alpha) + b
-    return c - np.exp(exponent)
-def prior_exp4(rng):
-    """Sample exp4 parameters: c ~ N(0.8, 0.1), ln(a) ~ N(-2, 1),
-    ln(alpha) ~ N(0, 1), ln(b) ~ N(0, 0.5)."""
-    # The draw order (c, a, alpha, b) differs from the key order of the result,
-    # so the draws are made explicitly before the dict is assembled.
-    c = rng.normal(0.8, 0.1)
-    log_a = rng.normal(-2, 1)
-    log_alpha = rng.normal(0, 1)
-    log_b = rng.normal(0, 0.5)
-    return {"a": np.exp(log_a), "b": np.exp(log_b), "c": c, "alpha": np.exp(log_alpha)}
-def pow4(x, c, a, b, alpha):
-    """Four-parameter power law: ``c - (a * x + b) ** (-alpha)``."""
-    base = a * x + b
-    return c - base ** -alpha
-def prior_pow4(rng):
-    """Sample pow4 parameters: ln(1 - c) ~ U(-5, 0), ln(a) ~ N(-3, 2),
-    ln(alpha) ~ N(0, 1), ln(b) ~ U(0, 1)."""
-    # The draw order (c, a, alpha, b) differs from the key order of the result,
-    # so the draws are made explicitly before the dict is assembled.
-    log_one_minus_c = rng.uniform(-5.0, 0)
-    log_a = rng.normal(-3.0, 2)
-    log_alpha = rng.normal(0, 1)
-    log_b = rng.uniform(0, 1)
-    return {
-        "a": np.exp(log_a),
-        "b": np.exp(log_b),
-        "c": 1 - np.exp(log_one_minus_c),
-        "alpha": np.exp(log_alpha),
-    }
-def dr_hill_zero_background(x, theta, eta, kappa):
-    """Hill curve with zero background:
-    ``theta * x**eta / (kappa**eta + x**eta)``.
-    Per the original notes, ``theta`` is the upper bound, ``eta`` the growth
-    rate, and the initial value is ``theta / (kappa**eta + 1)``.
-    """
-    x_pow = x**eta
-    return (theta * x_pow) / (kappa**eta + x_pow)
-def prior_dr_hill_zero_background(rng):
-    """Sample Hill-curve parameters: theta ~ N(0.8, 0.1), ln(eta) ~ N(1, 1),
-    ln(kappa) ~ N(1, 2)."""
-    # Dict values are evaluated in listed order: theta, eta, kappa.
-    return {
-        "theta": rng.normal(0.8, 0.1),
-        "eta": np.exp(rng.normal(1.0, 1.0)),
-        "kappa": np.exp(rng.normal(1.0, 2.0)),
-    }

lcpfn/.ipynb_checkpoints/domhan_prior-checkpoint.py DELETED Viewed

@@ -1,195 +0,0 @@
-from functools import partial
-import torch
-import numpy as np
-from lcpfn.curves import (
-    pow3,
-    ilog2,
-    janoschek,
-    log_power,
-    prior_ilog2,
-    uniform_prior_pow3,
-    weibull,
-    mmf,
-    vap,
-    loglog_linear,
-    exp4,
-    pow4,
-    dr_hill_zero_background,
-)
-from lcpfn.curves import (
-    prior_pow3,
-    prior_janoschek,
-    prior_log_power,
-    prior_weibull,
-    prior_mmf,
-    prior_vap,
-    prior_loglog_linear,
-    prior_exp4,
-    prior_pow4,
-    prior_dr_hill_zero_background,
-)
-from lcpfn.curves import (
-    uniform_prior_pow3,
-    uniform_prior_ilog2,
-    uniform_prior_janoschek,
-)
-def prior_weights(
-    rng,
-    components=(
-        "pow3",
-        "ilog2",
-        "janoschek",
-        "log_power",
-        "weibull",
-        "mmf",
-        "vap",
-        "loglog_linear",
-        "exp4",
-        "pow4",
-        "dr_hill_zero_background",
-    ),
-):
-    """Draw one independent U(0, 1) weight per curve component.
-    :param rng: object exposing ``uniform(low, high, size=...)``.
-    :param components: sequence of component names; the default is an
-        immutable tuple so it cannot be altered between calls.
-    :return: dict mapping each component name to its weight, in input order.
-    """
-    K = len(components)
-    weights = rng.uniform(0.0, 1, size=(K,))
-    return {f: weights[i] for i, f in enumerate(components)}
-def sample_from_prior(rng, seq_len=100):
-    """Sample a curve generator from the "peaked" combination of the pow3,
-    ilog2 and janoschek components (see ``sample_prior_comb``)."""
-    curve_sampler = sample_prior_comb(
-        rng=rng,
-        seq_len=seq_len,
-        components=["pow3", "ilog2", "janoschek"],
-        distribution="peaked",
-    )
-    return curve_sampler
-def sample_prior_comb(
-    rng,
-    components,
-    distribution,
-    var_lnloc=-4,
-    var_lnscale=1,
-    range_constraint=True,
-    seq_len=100,
-):
-    """Rejection-sample a weighted sum of parametric curves plus a noise level.
-    :param rng: random source passed to the per-component priors; must also
-        provide ``uniform`` and ``normal``.
-    :param components: names of the curve components to combine.
-    :param distribution: ``"peaked"`` (all components available) or
-        ``"uniform"`` (only pow3, ilog2 and janoschek have priors).
-    :param var_lnloc: location of the normal from which ln(noise scale) is drawn.
-    :param var_lnscale: scale of that normal.
-    :param range_constraint: if true, reject curves leaving ``[0, 1]``.
-    :param seq_len: number of points; the curve is evaluated at ``1..seq_len``.
-    :return: zero-argument callable returning ``(y, y_noisy)``; each call adds
-        fresh independent noise to the same accepted curve ``y``.
-    :raises NotImplementedError: for an unknown ``distribution``.
-    """
-    f_components = {
-        "pow3": pow3,
-        "ilog2": ilog2,
-        "janoschek": janoschek,
-        "log_power": log_power,
-        "weibull": weibull,
-        "mmf": mmf,
-        "vap": vap,
-        "loglog_linear": loglog_linear,
-        "exp4": exp4,
-        "pow4": pow4,
-        "dr_hill_zero_background": dr_hill_zero_background,
-    }
-    if distribution == "peaked":
-        f_priors = {
-            "pow3": prior_pow3,
-            "ilog2": prior_ilog2,
-            "janoschek": prior_janoschek,
-            "log_power": prior_log_power,
-            "weibull": prior_weibull,
-            "mmf": prior_mmf,
-            "vap": prior_vap,
-            "loglog_linear": prior_loglog_linear,
-            "exp4": prior_exp4,
-            "pow4": prior_pow4,
-            "dr_hill_zero_background": prior_dr_hill_zero_background,
-        }
-    elif distribution == "uniform":
-        f_priors = {
-            "pow3": uniform_prior_pow3,
-            "ilog2": uniform_prior_ilog2,
-            "janoschek": uniform_prior_janoschek
-        }
-    else:
-        # ``NotImplemented`` is a comparison sentinel and is not callable; the
-        # exception type is ``NotImplementedError``.
-        raise NotImplementedError(f"Unknown distribution: {distribution!r}")
-    x = np.arange(1, seq_len + 1)
-    while True:
-        # sample the noiseless curve
-        weights = prior_weights(rng, components=components)
-        y = np.zeros(x.shape, dtype="float")
-        for f, w in weights.items():
-            kwargs = f_priors[f](rng)
-            y += w * f_components[f](x, **kwargs)
-        # add noise (can exceed [0,1], but afaik no way to implement this prior in Tobis work)
-        # NOTE: drawn inside the loop even for rejected curves; moving it would
-        # change the random stream consumed from ``rng``.
-        var = np.exp(
-            rng.normal(var_lnloc, var_lnscale)
-        )  # @heri: ln_prob =+ log(normal.pdf(log(var), loc=var_lnloc, scale=var_lnscale))
-        # reject any curves that are non-increasing, exceed the [0,1] range
-        if (
-            y[-1] <= y[0]
-            or (range_constraint and (np.any(y < 0) or np.any(y > 1)))
-            or np.isnan(y).any()
-        ):
-            continue
-        else:
-            break
-    def curve():  # generates a sample from the same model, but with independent noise
-        y_noisy = y + rng.normal(np.zeros_like(y), var)
-        return y, y_noisy
-    return curve
-def generate_prior_dataset(n, prior=None, seed=42):
-    """
-    Returns a fixed sample from the prior (with fixed seq_len) as an n x seq_len np.ndarray
-    :param n: number of curves to draw.
-    :param prior: callable ``prior(rng)`` returning a zero-argument curve
-        function whose result is ``(y, y_noisy)``. Defaults to
-        ``sample_from_prior``; the former default ``sample_prior_comb`` could
-        not be called with ``rng`` alone because it requires ``components``
-        and ``distribution``.
-    :param seed: seed for the ``np.random.RandomState`` used for all draws.
-    :return: array stacking the noisy curve of each of the ``n`` samples.
-    """
-    if prior is None:
-        prior = sample_from_prior
-    rng = np.random.RandomState(seed)
-    prior_data = np.stack([prior(rng)()[1] for _ in range(n)])
-    return prior_data
-def create_get_batch_func(prior):
-    """Return ``get_batch_domhan`` with its ``prior`` argument pre-bound."""
-    batch_fn = partial(get_batch_domhan, prior=prior)
-    return batch_fn
-# function producing batches for PFN training
-def get_batch_domhan(
-    batch_size,
-    seq_len,
-    num_features,
-    prior,
-    device="cpu",
-    noisy_target=True,
-    **_,
-):
-    """Sample ``batch_size`` curves from ``prior`` and pack them as float tensors.
-    :param batch_size: number of curves in the batch.
-    :param seq_len: number of points per curve.
-    :param num_features: must be 1 (asserted).
-    :param prior: callable ``prior(rng, seq_len=...)`` returning a zero-argument
-        function that yields ``(y, y_noisy)``; it is given the global
-        ``np.random`` module as its ``rng``.
-    :param device: device the returned tensors are moved to.
-    :param noisy_target: if true the target equals the noisy curve, otherwise
-        the noiseless curve is the target.
-    :return: ``(x, y_noisy, y_target)`` with ``x`` of shape
-        ``(seq_len, batch_size, num_features)`` holding the positions
-        ``1..seq_len`` and both ``y`` tensors of shape ``(seq_len, batch_size)``.
-    """
-    assert num_features == 1
-    y_target = np.empty((batch_size, seq_len), dtype=float)
-    y_noisy = np.empty((batch_size, seq_len), dtype=float)
-    for i in range(batch_size):
-        curve_func = prior(np.random, seq_len=seq_len)  # uses numpy rng
-        if noisy_target:
-            _, y_noisy[i] = curve_func()
-            y_target[i] = y_noisy[i]
-        else:
-            y_target[i], y_noisy[i] = curve_func()
-    # turn numpy arrays into correctly shaped torch tensors & move them to device
-    x = (
-        torch.arange(1, seq_len + 1)
-        .repeat((num_features, batch_size, 1))
-        .transpose(2, 0)
-        .to(device)
-    )
-    y_target = torch.from_numpy(y_target).transpose(1, 0).to(device)
-    y_noisy = torch.from_numpy(y_noisy).transpose(1, 0).to(device)
-    # cast everything to float32
-    x = x.float()
-    y_target = y_target.float()
-    y_noisy = y_noisy.float()
-    return x, y_noisy, y_target

lcpfn/__init__.py DELETED Viewed

@@ -1,80 +0,0 @@
-import os, sys
-sys.path.insert(0, os.path.dirname(__file__))
-model_path = "trained_models"
-def prepare_models():
-    """Make sure both pretrained checkpoints exist next to this module.
-    For each known checkpoint that is missing, download its ``.gz`` archive
-    (unless one is already on disk) and decompress it in place, keeping the
-    archive. Progress and failures are reported with ``print``; nothing is
-    returned. Connection errors raised by ``requests`` propagate to the caller.
-    """
-    import gzip
-    import shutil
-    pfns4bo_dir = os.path.dirname(__file__)
-    model_names = [
-        "pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt",
-        "pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt",
-    ]
-    for name in model_names:
-        weights_path = os.path.join(pfns4bo_dir, model_path, name)
-        compressed_weights_path = os.path.join(pfns4bo_dir, model_path, name + ".gz")
-        if not os.path.exists(weights_path):
-            if not os.path.exists(compressed_weights_path):
-                print("Downloading", os.path.abspath(compressed_weights_path))
-                import requests
-                url = f'https://ml.informatik.uni-freiburg.de/research-artifacts/lcpfn/{name + ".gz"}'
-                r = requests.get(url, allow_redirects=True)
-                # Only persist a successful response; an HTTP error page saved as
-                # ``.gz`` would otherwise be mistaken for a valid archive later.
-                if r.ok:
-                    os.makedirs(os.path.dirname(compressed_weights_path), exist_ok=True)
-                    with open(compressed_weights_path, "wb") as f:
-                        f.write(r.content)
-                else:
-                    print("Download failed with HTTP status", r.status_code)
-            if os.path.exists(compressed_weights_path):
-                print("Unzipping", name)
-                # Decompress in-process instead of shelling out to ``gzip -dk``:
-                # no shell quoting problems and no external binary required.
-                try:
-                    with gzip.open(compressed_weights_path, "rb") as src:
-                        with open(weights_path, "wb") as dst:
-                            shutil.copyfileobj(src, dst)
-                except (OSError, EOFError) as err:
-                    # Do not leave a truncated checkpoint behind.
-                    if os.path.exists(weights_path):
-                        os.remove(weights_path)
-                    print("Failed to unzip", compressed_weights_path, "-", err)
-            else:
-                print("Failed to find", compressed_weights_path)
-                print(
-                    "Make sure you have an internet connection to download the model automatically.."
-                )
-        if os.path.exists(weights_path):
-            print("Successfully located model at", weights_path)
-# Maps a public model tag to the expected on-disk path of its checkpoint,
-# located in the ``model_path`` directory next to this module.
-model_dict = {
-    "EMSIZE512_NLAYERS12_NBUCKETS1000": os.path.join(
-        os.path.dirname(__file__),
-        model_path,
-        "pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt",
-    ),
-    "EMSIZE512_NLAYERS6_NBUCKETS1000": os.path.join(
-        os.path.dirname(__file__),
-        model_path,
-        "pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt",
-    ),
-}
-def __getattr__(name):
-    """Module-level attribute hook that resolves model tags to checkpoint paths.
-    For a tag listed in ``model_dict`` the checkpoint path is returned; if the
-    file is not on disk yet, ``prepare_models()`` is run first. Any other name
-    raises ``AttributeError``.
-    """
-    if name not in model_dict:
-        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
-    checkpoint = model_dict[name]
-    if not os.path.exists(checkpoint):
-        print(
-            "Can't find",
-            os.path.abspath(checkpoint),
-            "thus unzipping/downloading models now.",
-        )
-        print("This might take a while..")
-        prepare_models()
-    return checkpoint
-from .version import __version__
-from lcpfn.model import LCPFN
-from lcpfn.train_lcpfn import train_lcpfn
-from lcpfn.domhan_prior import sample_from_prior, create_get_batch_func
-# Names exported by ``from lcpfn import *``. The model tags served by the
-# module-level __getattr__ are not listed here.
-__all__ = [
-    "LCPFN",
-    "train_lcpfn",
-    "sample_from_prior",
-    "create_get_batch_func",
-    "__version__",
-]

lcpfn/bar_distribution.py DELETED Viewed

@@ -1,349 +0,0 @@
-import torch
-from torch import nn
-# Piecewise-constant ("bar") density over the interval spanned by ``borders``.
-# Every method takes logits whose last dimension indexes the buckets between
-# consecutive borders; softmax(logits) is the probability mass per bucket.
-class BarDistribution(nn.Module):
-    # Stores borders, smoothing and bucket widths as buffers and checks that the
-    # borders are one-dimensional and sorted.
-    def __init__(
-        self, borders: torch.Tensor, smoothing=0.0
-    ):  # here borders should start with min and end with max, where all values lie in (min,max) and are sorted
-        # sorted list of borders
-        super().__init__()
-        assert len(borders.shape) == 1
-        # self.borders = borders
-        self.register_buffer("borders", borders)
-        self.register_buffer("smoothing", torch.tensor(smoothing))
-        # self.bucket_widths = self.borders[1:] - self.borders[:-1]
-        self.register_buffer("bucket_widths", self.borders[1:] - self.borders[:-1])
-        full_width = self.bucket_widths.sum()
-        border_order = torch.argsort(borders)
-        assert (
-            full_width - (self.borders[-1] - self.borders[0])
-        ).abs() < 1e-4, f"diff: {full_width - (self.borders[-1] - self.borders[0])}"
-        assert (
-            border_order == torch.arange(len(borders)).to(border_order.device)
-        ).all(), "Please provide sorted borders!"
-        self.num_bars = len(borders) - 1
-    # Returns, for each y, the index of the bucket containing it. Values equal to
-    # the first/last border are assigned to the first/last bucket explicitly;
-    # values outside the borders yield -1 or num_bars (callers check or clamp).
-    def map_to_bucket_idx(self, y):
-        target_sample = torch.searchsorted(self.borders, y) - 1
-        target_sample[y == self.borders[0]] = 0
-        target_sample[y == self.borders[-1]] = self.num_bars - 1
-        return target_sample
-    # Negative log density of y. Label smoothing (mixing in the mean over all
-    # buckets) is only applied while the module is in training mode.
-    def forward(
-        self, logits, y
-    ):  # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
-        target_sample = self.map_to_bucket_idx(y)
-        assert (target_sample >= 0).all() and (
-            target_sample < self.num_bars
-        ).all(), f"y {y} not in support set for borders (min_y, max_y) {self.borders}"
-        assert (
-            logits.shape[-1] == self.num_bars
-        ), f"{logits.shape[-1]} vs {self.num_bars}"
-        bucket_log_probs = torch.log_softmax(logits, -1)
-        # dividing the bucket mass by the bucket width turns it into a density
-        scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)
-        # print(bucket_log_probs, logits.shape)
-        nll_loss = -scaled_bucket_log_probs.gather(
-            -1, target_sample.unsqueeze(-1)
-        ).squeeze(-1)
-        smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
-        smoothing = self.smoothing if self.training else 0.0
-        loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
-        return loss
-    # Mean of the distribution: probability-weighted bucket midpoints.
-    def mean(self, logits):
-        bucket_means = self.borders[:-1] + self.bucket_widths / 2
-        p = torch.softmax(logits, -1)
-        return p @ bucket_means
-    def icdf(self, logits, left_prob):
-        """
-        Implementation of the quantile function
-        :param logits: Tensor of any shape, with the last dimension being logits
-        :param left_prob: float: The probability mass to the left of the result.
-        :return: Position with `left_prob` probability weight to the left.
-        """
-        probs = logits.softmax(-1)
-        cumprobs = torch.cumsum(probs, -1)
-        # index of the first bucket whose cumulative mass reaches left_prob
-        idx = (
-            torch.searchsorted(
-                cumprobs,
-                left_prob * torch.ones(*cumprobs.shape[:-1], 1, device=probs.device),
-            )
-            .squeeze(-1)
-            .clamp(0, cumprobs.shape[-1] - 1)
-        )  # this might not do the right for outliers
-        # prepend 0 so that cumprobs[idx] is the mass strictly left of bucket idx
-        cumprobs = torch.cat(
-            [torch.zeros(*cumprobs.shape[:-1], 1, device=logits.device), cumprobs], -1
-        )
-        rest_prob = left_prob - cumprobs.gather(-1, idx[..., None]).squeeze(-1)
-        left_border = self.borders[idx]
-        right_border = self.borders[idx + 1]
-        # linear interpolation inside the selected bucket
-        return left_border + (right_border - left_border) * rest_prob / probs.gather(
-            -1, idx[..., None]
-        ).squeeze(-1)
-    # Lower and upper ends of the central interval holding center_prob of the
-    # mass, stacked along a new last dimension.
-    def quantile(self, logits, center_prob=0.682):
-        side_probs = (1.0 - center_prob) / 2
-        return torch.stack(
-            (self.icdf(logits, side_probs), self.icdf(logits, 1.0 - side_probs)), -1
-        )
-    def ucb(self, logits, best_f, rest_prob=(1 - 0.682) / 2, maximize=True):
-        """
-        UCB utility. Rest Prob is the amount of utility above (below) the confidence interval that is ignored.
-        Higher rest_prob is equivalent to lower beta in the standard GP-UCB formulation.
-        :param logits: Logits, as returned by the Transformer.
-        :param best_f: Only here, since the other utilities have it.
-        :param rest_prob: The amount of utility above (below) the confidence interval that is ignored.
-        The default is equivalent to using GP-UCB with `beta=1`.
-        To get the corresponding `beta`, where `beta` is from
-        the standard GP definition of UCB `ucb_utility = mean + beta * std`,
-        you can use this computation: `beta = math.sqrt(2)*torch.erfinv(torch.tensor(2*rest_prob-1))`.
-        :param maximize:
-        :return: utility
-        """
-        if maximize:
-            rest_prob = 1 - rest_prob
-        return self.icdf(logits, rest_prob)
-    # Midpoint of the bucket with the largest logit.
-    def mode(self, logits):
-        mode_inds = logits.argmax(-1)
-        bucket_means = self.borders[:-1] + self.bucket_widths / 2
-        return bucket_means[mode_inds]
-    # Expected-improvement style utility: probability-weighted per-bucket
-    # contributions relative to best_f. The zipped bucket_mean is not used by
-    # either contribution formula.
-    def ei(
-        self, logits, best_f, maximize=True
-    ):  # logits: evaluation_points x batch x feature_dim
-        bucket_means = self.borders[:-1] + self.bucket_widths / 2
-        if maximize:
-            bucket_contributions = torch.tensor(
-                [
-                    max((bucket_max + max(bucket_min, best_f)) / 2 - best_f, 0)
-                    for bucket_min, bucket_max, bucket_mean in zip(
-                        self.borders[:-1], self.borders[1:], bucket_means
-                    )
-                ],
-                dtype=logits.dtype,
-                device=logits.device,
-            )
-        else:
-            bucket_contributions = torch.tensor(
-                [
-                    -min((min(bucket_max, best_f) + bucket_min) / 2 - best_f, 0)
-                    for bucket_min, bucket_max, bucket_mean in zip(  # min on max instead of max on min, and compare min < instead of max >
-                        self.borders[:-1], self.borders[1:], bucket_means
-                    )
-                ],
-                dtype=logits.dtype,
-                device=logits.device,
-            )
-        p = torch.softmax(logits, -1)
-        return p @ bucket_contributions
-    def pi(
-        self, logits, best_f, maximize=True
-    ):  # logits: evaluation_points x batch x feature_dim
-        """
-        Acquisition Function: Probability of Improvement
-        :param logits: as returned by Transformer
-        :param best_f: best evaluation so far (the incumbent)
-        :param maximize: whether to maximize
-        :return: utility
-        """
-        # only the maximization case is implemented
-        assert maximize is True
-        p = torch.softmax(logits, -1)
-        border_widths = self.borders[1:] - self.borders[:-1]
-        # fraction of each bucket that lies above best_f, clamped to [0, 1]
-        factor = 1.0 - ((best_f - self.borders[:-1]) / border_widths).clamp(0.0, 1.0)
-        return (p * factor).sum(-1)
-    def mean_of_square(self, logits):
-        """
-        Computes E[x^2].
-        :param logits: Output of the model.
-        """
-        left_borders = self.borders[:-1]
-        right_borders = self.borders[1:]
-        # (l^2 + r^2 + l*r) / 3 is E[x^2] of a uniform distribution on [l, r]
-        bucket_mean_of_square = (
-            left_borders.square()
-            + right_borders.square()
-            + left_borders * right_borders
-        ) / 3.0
-        p = torch.softmax(logits, -1)
-        return p @ bucket_mean_of_square
-    # Variance via E[x^2] - E[x]^2.
-    def variance(self, logits):
-        return self.mean_of_square(logits) - self.mean(logits).square()
-# BarDistribution variant in which the first and last buckets are modelled with
-# half-normal tails, so targets outside the borders still get a finite loss.
-class FullSupportBarDistribution(BarDistribution):
-    # Builds a HalfNormal whose scale is chosen so that a fraction p of its mass
-    # lies below range_max.
-    @staticmethod
-    def halfnormal_with_p_weight_before(range_max, p=0.5):
-        s = range_max / torch.distributions.HalfNormal(torch.tensor(1.0)).icdf(
-            torch.tensor(p)
-        )
-        return torch.distributions.HalfNormal(s)
-    # Same loss as the parent class, except that bucket indices are clamped
-    # instead of asserted and the two outer buckets use half-normal densities.
-    def forward(
-        self, logits, y
-    ):  # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
-        assert self.num_bars > 1
-        target_sample = self.map_to_bucket_idx(y)
-        # out-of-range targets fall into the first / last bucket
-        target_sample.clamp_(0, self.num_bars - 1)
-        assert logits.shape[-1] == self.num_bars
-        bucket_log_probs = torch.log_softmax(logits, -1)
-        scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)
-        # print(bucket_log_probs, logits.shape)
-        log_probs = scaled_bucket_log_probs.gather(
-            -1, target_sample.unsqueeze(-1)
-        ).squeeze(-1)
-        side_normals = (
-            self.halfnormal_with_p_weight_before(self.bucket_widths[0]),
-            self.halfnormal_with_p_weight_before(self.bucket_widths[-1]),
-        )
-        # TODO look over it again
-        # For the outer buckets: add the half-normal log density (distance is
-        # measured from the inner border) and add log(width) back, which cancels
-        # the uniform-density scaling applied above.
-        log_probs[target_sample == 0] += side_normals[0].log_prob(
-            (self.borders[1] - y[target_sample == 0]).clamp(min=0.00000001)
-        ) + torch.log(self.bucket_widths[0])
-        log_probs[target_sample == self.num_bars - 1] += side_normals[1].log_prob(
-            y[target_sample == self.num_bars - 1] - self.borders[-2]
-        ) + torch.log(self.bucket_widths[-1])
-        nll_loss = -log_probs
-        smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
-        smoothing = self.smoothing if self.training else 0.0
-        loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
-        return loss
-    # Mean with the outer bucket midpoints replaced by the means of the
-    # half-normal tails, offset from the respective inner border.
-    def mean(self, logits):
-        bucket_means = self.borders[:-1] + self.bucket_widths / 2
-        p = torch.softmax(logits, -1)
-        side_normals = (
-            self.halfnormal_with_p_weight_before(self.bucket_widths[0]),
-            self.halfnormal_with_p_weight_before(self.bucket_widths[-1]),
-        )
-        bucket_means[0] = -side_normals[0].mean + self.borders[1]
-        bucket_means[-1] = side_normals[1].mean + self.borders[-2]
-        return p @ bucket_means
-def get_bucket_limits_(
-    num_outputs: int,
-    full_range: tuple = None,
-    ys: torch.Tensor = None,
-    verbose: bool = False,
-):
-    """Compute ``num_outputs + 1`` bucket borders.
-    With ``ys`` given, the borders are placed so that each bucket holds the same
-    number of samples (trailing samples that do not divide evenly are dropped);
-    otherwise ``full_range`` is split into equally wide buckets.
-    :param num_outputs: number of buckets.
-    :param full_range: ``(min, max)`` of the support; optional when ``ys`` is given.
-    :param ys: sample tensor used to estimate the borders.
-    :param verbose: if true, additionally print the summary and ``full_range``.
-    :return: 1-D tensor of ``num_outputs + 1`` borders.
-    """
-    assert (ys is not None) or (full_range is not None)
-    if ys is not None:
-        ys = ys.flatten()
-        # Remember how many samples are dropped *before* truncating; computing it
-        # afterwards always reported 0.
-        num_cut = len(ys) % num_outputs
-        if num_cut:
-            ys = ys[:-num_cut]
-        usage_msg = f"Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {num_cut} ys."
-        print(usage_msg)
-        ys_per_bucket = len(ys) // num_outputs
-        if full_range is None:
-            full_range = (ys.min(), ys.max())
-        else:
-            assert full_range[0] <= ys.min() and full_range[1] >= ys.max()
-            full_range = torch.tensor(full_range)
-        ys_sorted, ys_order = ys.sort(0)
-        # inner borders: midpoint between the last sample of one bucket and the
-        # first sample of the next
-        bucket_limits = (
-            ys_sorted[ys_per_bucket - 1 :: ys_per_bucket][:-1]
-            + ys_sorted[ys_per_bucket::ys_per_bucket]
-        ) / 2
-        if verbose:
-            print(usage_msg)
-            print(full_range)
-        bucket_limits = torch.cat(
-            [full_range[0].unsqueeze(0), bucket_limits, full_range[1].unsqueeze(0)], 0
-        )
-    else:
-        class_width = (full_range[1] - full_range[0]) / num_outputs
-        bucket_limits = torch.cat(
-            [
-                full_range[0] + torch.arange(num_outputs).float() * class_width,
-                torch.tensor(full_range[1]).unsqueeze(0),
-            ],
-            0,
-        )
-    assert (
-        len(bucket_limits) - 1 == num_outputs
-        and full_range[0] == bucket_limits[0]
-        and full_range[-1] == bucket_limits[-1]
-    )
-    return bucket_limits
-def get_bucket_limits(
-    num_outputs: int,
-    full_range: tuple = None,
-    ys: torch.Tensor = None,
-    verbose: bool = False,
-):
-    """Compute ``num_outputs + 1`` bucket borders from samples or from a range.
-    Exactly one of ``ys`` and ``full_range`` must be passed. With ``ys``, NaNs
-    are removed and the borders are placed so that each bucket holds the same
-    number of samples (trailing samples that do not divide evenly are dropped);
-    with ``full_range``, the range is split into equally wide buckets.
-    :param num_outputs: number of buckets.
-    :param full_range: ``(min, max)`` of the support.
-    :param ys: sample tensor used to estimate the borders.
-    :param verbose: if true, additionally print the summary and ``full_range``.
-    :return: 1-D tensor of ``num_outputs + 1`` borders.
-    """
-    assert (ys is None) != (
-        full_range is None
-    ), "Either full_range or ys must be passed."
-    if ys is not None:
-        ys = ys.flatten()
-        ys = ys[~torch.isnan(ys)]
-        # Remember how many samples are dropped *before* truncating; computing it
-        # afterwards always reported 0.
-        num_cut = len(ys) % num_outputs
-        if num_cut:
-            ys = ys[:-num_cut]
-        usage_msg = f"Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {num_cut} ys."
-        print(usage_msg)
-        ys_per_bucket = len(ys) // num_outputs
-        if full_range is None:
-            full_range = (ys.min(), ys.max())
-        else:
-            # Only reachable when the entry assertion is disabled (python -O).
-            assert (
-                full_range[0] <= ys.min() and full_range[1] >= ys.max()
-            ), f"full_range {full_range} not in range of ys {ys.min(), ys.max()}"
-            full_range = torch.tensor(full_range)
-        ys_sorted, ys_order = ys.sort(0)
-        # inner borders: midpoint between the last sample of one bucket and the
-        # first sample of the next
-        bucket_limits = (
-            ys_sorted[ys_per_bucket - 1 :: ys_per_bucket][:-1]
-            + ys_sorted[ys_per_bucket::ys_per_bucket]
-        ) / 2
-        if verbose:
-            print(usage_msg)
-            print(full_range)
-        bucket_limits = torch.cat(
-            [full_range[0].unsqueeze(0), bucket_limits, full_range[1].unsqueeze(0)], 0
-        )
-    else:
-        class_width = (full_range[1] - full_range[0]) / num_outputs
-        bucket_limits = torch.cat(
-            [
-                full_range[0] + torch.arange(num_outputs).float() * class_width,
-                torch.tensor(full_range[1]).unsqueeze(0),
-            ],
-            0,
-        )
-    assert (
-        len(bucket_limits) - 1 == num_outputs
-    ), f"len(bucket_limits) - 1 == {len(bucket_limits) - 1} != {num_outputs} == num_outputs"
-    assert full_range[0] == bucket_limits[0], f"{full_range[0]} != {bucket_limits[0]}"
-    assert (
-        full_range[-1] == bucket_limits[-1]
-    ), f"{full_range[-1]} != {bucket_limits[-1]}"
-    return bucket_limits

lcpfn/curves.py DELETED Viewed

@@ -1,277 +0,0 @@
-import numpy as np
-from collections import OrderedDict
-prior = {
-    "pow3": {
-        "uniform": OrderedDict(
-            a={"type": "uniform", "param1": -1, "param2": 1},
-            c={"type": "uniform", "param1": 0, "param2": 1},
-            alpha={"type": "uniform", "param1": 0, "param2": 1},
-        ),
-        "peaked": OrderedDict(
-            a={"type": "uniform", "param1": -0.6, "param2": 0.6},
-            c={"type": "uniform", "param1": 0, "param2": 1.25},
-            alpha={"type": "log_normal", "param1": 0, "param2": 2},
-        ),
-    },
-    "ilog2": {
-        "uniform": OrderedDict(
-            c={"type": "uniform", "param1": 0, "param2": 1},
-            a={"type": "uniform", "param1": -1, "param2": 1},
-        ),
-        "peaked": OrderedDict(
-            c={"type": "uniform", "param1": 0, "param2": 1},
-            a={"type": "uniform", "param1": -0.5, "param2": 0.5},
-        ),
-    },
-    "janoschek": {
-        "uniform": OrderedDict(
-            a={"type": "uniform", "param1": 0, "param2": 1},
-            beta={"type": "uniform", "param1": 0, "param2": 2},
-            k={"type": "uniform", "param1": 0, "param2": 1},
-            delta={"type": "uniform", "param1": -5, "param2": 5},
-        ),
-        "peaked": OrderedDict(
-            a={"type": "uniform", "param1": 0, "param2": 1},
-            beta={"type": "uniform", "param1": 0, "param2": 2},
-            k={"type": "log_normal", "param1": -2, "param2": 1},
-            delta={"type": "log_normal", "param1": 0, "param2": 0.5},
-        ),
-    },
-}
-def prior_sampler(rng, type, param1, param2):
-    if type == "uniform":
-        return rng.uniform(param1, param2)
-    elif type == "log_normal":
-        return rng.lognormal(param1, param2)
-    raise Exception("Unknown prior type: {}".format(type))
-def pow3(x, c, a, alpha):
-    return c - a * (x) ** (-alpha)
-def prior_pow3(rng):
-    return {
-        p: prior_sampler(
-            rng,
-            prior["pow3"]["peaked"][p]["type"],
-            param1=prior["pow3"]["peaked"][p]["param1"],
-            param2=prior["pow3"]["peaked"][p]["param2"],
-        )
-        for p in ["a", "c", "alpha"]
-    }
-def uniform_prior_pow3(rng):
-    return {
-        p: prior_sampler(
-            rng,
-            prior["pow3"]["uniform"][p]["type"],
-            param1=prior["pow3"]["uniform"][p]["param1"],
-            param2=prior["pow3"]["uniform"][p]["param2"],
-        )
-        for p in ["a", "c", "alpha"]
-    }
-def ilog2(x, c, a):
-    return c - a / (np.log(x + 1))
-def prior_ilog2(rng):
-    return {
-        p: prior_sampler(
-            rng,
-            prior["ilog2"]["peaked"][p]["type"],
-            param1=prior["ilog2"]["peaked"][p]["param1"],
-            param2=prior["ilog2"]["peaked"][p]["param2"],
-        )
-        for p in ["a", "c"]
-    }
-def uniform_prior_ilog2(rng):
-    return {
-        p: prior_sampler(
-            rng,
-            prior["ilog2"]["uniform"][p]["type"],
-            param1=prior["ilog2"]["uniform"][p]["param1"],
-            param2=prior["ilog2"]["uniform"][p]["param2"],
-        )
-        for p in ["a", "c"]
-    }
-def janoschek(x, a, beta, k, delta):
-    """
-    http://www.pisces-conservation.com/growthhelp/janoschek.htm
-    """
-    return a - (a - beta) * np.exp(-k * x**delta)
-def prior_janoschek(rng):
-    return {
-        p: prior_sampler(
-            rng,
-            prior["janoschek"]["peaked"][p]["type"],
-            param1=prior["janoschek"]["peaked"][p]["param1"],
-            param2=prior["janoschek"]["peaked"][p]["param2"],
-        )
-        for p in ["a", "beta", "k", "delta"]
-    }
-def uniform_prior_janoschek(rng):
-    return {
-        p: prior_sampler(
-            rng,
-            prior["janoschek"]["uniform"][p]["type"],
-            param1=prior["janoschek"]["uniform"][p]["param1"],
-            param2=prior["janoschek"]["uniform"][p]["param2"],
-        )
-        for p in ["a", "beta", "k", "delta"]
-    }
-def log_power(x, a, b, c):
-    # a: upper bound
-    # c: growth rate
-    # initial = a/ (1 + (1/e^b)^c
-    return a / (1.0 + (x / np.exp(b)) ** c)
-def prior_log_power(rng):
-    # a ~ N(0.8,0.1)
-    # b ~ N(1,1)
-    # c ~ U(-3,0)
-    a = rng.normal(0.8, 0.1)
-    b = rng.normal(1.0, 1.0)
-    c = rng.uniform(-3.0, 0.0)
-    return {"a": a, "b": b, "c": c}
-def weibull(x, alpha, beta, kappa, delta):
-    """
-    Weibull modell
-    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
-    alpha: upper asymptote
-    beta: lower asymptote
-    k: growth rate
-    delta: controls the x-ordinate for the point of inflection
-    """
-    return alpha - (alpha - beta) * np.exp(-((kappa * x) ** delta))
-def prior_weibull(rng):
-    alpha = rng.uniform(0.0, 1.5)
-    beta = rng.uniform(0.0, 1)
-    kappa = np.exp(rng.normal(-2.0, 1.0))
-    delta = np.exp(rng.normal(0, 0.5))
-    return {"alpha": alpha, "beta": beta, "kappa": kappa, "delta": delta}
-def mmf(x, alpha, beta, kappa, delta):
-    """
-    Morgan-Mercer-Flodin
-    description:
-    Nonlinear Regression page 342
-    http://bit.ly/1jodG17
-    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
-    alpha: upper asymptote
-    kappa: growth rate
-    beta: initial value
-    delta: controls the point of inflection
-    """
-    return alpha - (alpha - beta) / (1.0 + (kappa * x) ** delta)
-def prior_mmf(rng):
-    # alpha ~ N(0.8,0.1)
-    # beta ~ N(0.2,0.1)
-    # ln(kappa) ~ N(0,2)
-    # ln(delta) ~ N(0,1)
-    alpha = rng.normal(0.8, 0.1)
-    beta = rng.normal(0.2, 0.1)
-    kappa = np.exp(rng.normal(0, 2))
-    delta = np.exp(rng.normal(0, 1))
-    return {"alpha": alpha, "beta": beta, "kappa": kappa, "delta": delta}
-def vap(x, a, b, c):
-    """Vapor pressure model"""
-    # no upper bound if c > 0
-    # a = ln(upper bound) for c=0
-    # a+b = ln(initial)
-    return np.exp(a + b / x + c * np.log(x))
-def prior_vap(rng):
-    a = rng.uniform(-2.0, 0.0)  # @heri: range check
-    b = rng.uniform(-4.0, 0.0)  # @heri: range check
-    c = np.exp(rng.uniform(-8.0, 0.0))  # @heri: same as weights
-    return {"a": a, "b": b, "c": c}
-def loglog_linear(x, a, b):
-    x = np.log(x)
-    return np.log(a * x + b)
-def prior_loglog_linear(rng):
-    # ln(a) ~ N(-2, 1)
-    # ln(b) ~ U(0, 1)
-    a = np.exp(rng.normal(-2.0, 1.0))
-    b = np.exp(rng.uniform(0.0, 1.0))
-    return {"a": a, "b": b}
-def exp4(x, c, a, b, alpha):
-    return c - np.exp(-a * (x**alpha) + b)
-def prior_exp4(rng):
-    # c ~ N(0.8,0.1)
-    c = rng.normal(0.8, 0.1)
-    # ln(a) ~ N(-2,1)
-    a = np.exp(rng.normal(-2, 1))
-    # ln(alpha) ~ N(0,1)
-    alpha = np.exp(rng.normal(0, 1))
-    # ln(b) ~ N(0,0.5)
-    b = np.exp(rng.normal(0, 0.5))
-    return {"a": a, "b": b, "c": c, "alpha": alpha}
-def pow4(x, c, a, b, alpha):
-    return c - (a * x + b) ** -alpha
-def prior_pow4(rng):
-    # ln(1 - c) ~ U(-5, 0)
-    c = 1 - np.exp(rng.uniform(-5.0, 0))
-    # ln(a) ~ N(-3, 2)
-    a = np.exp(rng.normal(-3.0, 2))
-    # ln(alpha) ~ N(0,1)
-    alpha = np.exp(rng.normal(0, 1))
-    # ln(b) ~ U(0, 1)
-    b = np.exp(rng.uniform(0, 1))
-    return {"a": a, "b": b, "c": c, "alpha": alpha}
-def dr_hill_zero_background(x, theta, eta, kappa):
-    # theta: upper bound
-    # eta: growth rate
-    # initial = theta/(kappa^eta + 1)
-    return (theta * x**eta) / (kappa**eta + x**eta)
-def prior_dr_hill_zero_background(rng):
-    # theta ~ U(1,0) N(0.8,0.1)
-    # ln(eta) ~ N(1,1)
-    # ln(kappa) ~ N(1,2)
-    theta = rng.normal(0.8, 0.1)
-    eta = np.exp(rng.normal(1.0, 1.0))
-    kappa = np.exp(rng.normal(1.0, 2.0))
-    return {"theta": theta, "eta": eta, "kappa": kappa}

lcpfn/decoders.py DELETED Viewed

@@ -1,42 +0,0 @@
-import torch
-from torch import nn
-import random
-from torch import Tensor
-import torch.nn.functional as F
-class GELU(nn.Module):
-    def forward(self, input: Tensor) -> Tensor:
-        return F.gelu(input)
-class ScaledDecoder(nn.Module):
-    def __init__(self, ninp, nhid, nout):
-        super().__init__()
-        self.linear = nn.Linear(ninp, nhid)
-        self.linear1 = nn.Linear(nhid, nout)
-        self.linear2 = nn.Linear(nhid, 10)
-    def forward(self, x):
-        # return torch.cat([self.linear1(x), self.linear2(x)], -1)
-        x = self.linear(x)
-        x = GELU()(x)
-        temps = self.linear2(x).softmax(-1) @ torch.tensor(
-            [1.0, 1.4, 1.7, 2.0, 5.0, 10.0, 20.0, 40.0, 80.0, 160.0], device=x.device
-        )
-        if random.random() > 0.99:
-            print(temps.shape, temps[:, :2])
-        return self.linear1(x) / temps.unsqueeze(-1)
-class FixedScaledDecoder(nn.Module):
-    def __init__(self, ninp, nhid, nout):
-        super().__init__()
-        self.mapper = nn.Sequential(
-            nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, nout)
-        )
-        self.T = nn.Parameter(torch.ones(10000) / 10000)
-    def forward(self, x):
-        return self.mapper(x) / self.T.sum()

lcpfn/domhan_prior.py DELETED Viewed

@@ -1,199 +0,0 @@
-from functools import partial
-import torch
-import numpy as np
-from lcpfn.curves import (
-    pow3,
-    ilog2,
-    janoschek,
-    log_power,
-    prior_ilog2,
-    uniform_prior_pow3,
-    weibull,
-    mmf,
-    vap,
-    loglog_linear,
-    exp4,
-    pow4,
-    dr_hill_zero_background,
-)
-from lcpfn.curves import (
-    prior_pow3,
-    prior_janoschek,
-    prior_log_power,
-    prior_weibull,
-    prior_mmf,
-    prior_vap,
-    prior_loglog_linear,
-    prior_exp4,
-    prior_pow4,
-    prior_dr_hill_zero_background,
-)
-from lcpfn.curves import (
-    uniform_prior_pow3,
-    uniform_prior_ilog2,
-    uniform_prior_janoschek,
-)
-def prior_weights(
-    rng,
-    components=[
-        "pow3",
-        "ilog2",
-        "janoschek",
-        "log_power",
-        "weibull",
-        "mmf",
-        "vap",
-        "loglog_linear",
-        "exp4",
-        "pow4",
-        "dr_hill_zero_background",
-    ],
-):
-    K = len(components)
-    weights = rng.uniform(0.0, 1, size=(K,))
-    return {f: weights[i] for i, f in enumerate(components)}
-def sample_from_prior(rng, seq_len=100):
-    return sample_prior_comb(
-        rng=rng,
-        seq_len=seq_len,
-        components=["pow3", "ilog2", "janoschek"],
-        distribution="peaked",
-    )
-def sample_prior_comb(
-    rng,
-    components,
-    distribution,
-    var_lnloc=-4,
-    var_lnscale=1,
-    range_constraint=True,
-    seq_len=100,
-):
-    f_components = {
-        "pow3": pow3,
-        "ilog2": ilog2,
-        "janoschek": janoschek,
-        "log_power": log_power,
-        "weibull": weibull,
-        "mmf": mmf,
-        "vap": vap,
-        "loglog_linear": loglog_linear,
-        "exp4": exp4,
-        "pow4": pow4,
-        "dr_hill_zero_background": dr_hill_zero_background,
-    }
-    if distribution == "peaked":
-        f_priors = {
-            "pow3": prior_pow3,
-            "ilog2": prior_ilog2,
-            "janoschek": prior_janoschek,
-            "log_power": prior_log_power,
-            "weibull": prior_weibull,
-            "mmf": prior_mmf,
-            "vap": prior_vap,
-            "loglog_linear": prior_loglog_linear,
-            "exp4": prior_exp4,
-            "pow4": prior_pow4,
-            "dr_hill_zero_background": prior_dr_hill_zero_background,
-        }
-    elif distribution == "uniform":
-        f_priors = {
-            "pow3": uniform_prior_pow3,
-            "ilog2": uniform_prior_ilog2,
-            "janoschek": uniform_prior_janoschek,
-        }
-    else:
-        raise NotImplemented()
-    x = np.arange(1, seq_len + 1)
-    while True:
-        # sample the noiseless curve
-        weights = prior_weights(rng, components=components)
-        y = np.zeros(x.shape, dtype="float")
-        kwargs = 0
-        for f, w in weights.items():
-            kwargs = f_priors[f](rng)
-            # print(f_components[f](x, **kwargs))
-            y += w * f_components[f](x, **kwargs)
-        # add noise (can exceed [0,1], but afaik no way to implement this prior in Tobis work)
-        var = np.exp(
-            rng.normal(var_lnloc, var_lnscale)
-        )  # @heri: ln_prob =+ log(normal.pdf(log(var), loc=var_lnloc, scale=var_lnscale))
-        # reject any curves that are non-increasing, exceed the [0,1] range
-        if (
-            y[-1] <= y[0]
-            or (range_constraint and (np.any(y < 0) or np.any(y > 1)))
-            or np.isnan(y).any()
-        ):
-            continue
-        else:
-            break
-    def curve():  # generates a sample from the same model, but with independent noise
-        y_noisy = y + rng.normal(np.zeros_like(y), var)
-        return y, y_noisy
-    return curve
-def generate_prior_dataset(n, prior=sample_prior_comb, seed=42):
-    """
-    Returns a fixed sample from the prior (with fixed seq_len) as an n x seq_len np.ndarray
-    """
-    rng = np.random.RandomState(seed)
-    prior_data = np.stack([prior(rng)()[1] for _ in range(n)])
-    return prior_data
-def create_get_batch_func(prior):
-    return partial(get_batch_domhan, prior=prior)
-# function producing batches for PFN training
-def get_batch_domhan(
-    batch_size,
-    seq_len,
-    num_features,
-    prior,
-    device="cpu",
-    noisy_target=True,
-    **_,
-):
-    assert num_features == 1
-    x = np.arange(1, seq_len + 1)
-    y_target = np.empty((batch_size, seq_len), dtype=float)
-    y_noisy = np.empty((batch_size, seq_len), dtype=float)
-    for i in range(batch_size):
-        curve_func = prior(np.random, seq_len=seq_len)  # uses numpy rng
-        if noisy_target:
-            _, y_noisy[i] = curve_func()
-            y_target[i] = y_noisy[i]
-        else:
-            y_target[i], y_noisy[i] = curve_func()
-    # turn numpy arrays into correctly shaped torch tensors & move them to device
-    x = (
-        torch.arange(1, seq_len + 1)
-        .repeat((num_features, batch_size, 1))
-        .transpose(2, 0)
-        .to(device)
-    )
-    y_target = torch.from_numpy(y_target).transpose(1, 0).to(device)
-    y_noisy = torch.from_numpy(y_noisy).transpose(1, 0).to(device)
-    # changes
-    x = x.float()
-    y_target = y_target.float()
-    y_noisy = y_noisy.float()
-    return x, y_noisy, y_target

lcpfn/encoders.py DELETED Viewed

@@ -1,190 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-from lcpfn.utils import normalize_data
-import torch.nn.functional as F
-from torch.nn import TransformerEncoder, TransformerEncoderLayer
-class StyleEncoder(nn.Module):
-    def __init__(self, em_size, hyperparameter_definitions):
-        super().__init__()
-        self.em_size = em_size
-        self.embedding = nn.Linear(hyperparameter_definitions.shape[0], self.em_size)
-    def forward(self, hyperparameters):  # T x B x num_hps
-        return self.embedding(hyperparameters)
-class _PositionalEncoding(nn.Module):
-    def __init__(self, d_model, dropout=0.0):
-        super().__init__()
-        self.dropout = nn.Dropout(p=dropout)
-        self.d_model = d_model
-        self.device_test_tensor = nn.Parameter(torch.tensor(1.0))
-    def forward(self, x):  # T x B x num_features
-        assert self.d_model % x.shape[-1] * 2 == 0
-        d_per_feature = self.d_model // x.shape[-1]
-        pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
-        # position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        interval_size = 10
-        div_term = (
-            (1.0 / interval_size)
-            * 2
-            * math.pi
-            * torch.exp(
-                torch.arange(
-                    0, d_per_feature, 2, device=self.device_test_tensor.device
-                ).float()
-                * math.log(math.sqrt(2))
-            )
-        )
-        # print(div_term/2/math.pi)
-        pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
-        pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
-        return self.dropout(pe).view(x.shape[0], x.shape[1], self.d_model)
-Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)
-class EmbeddingEncoder(nn.Module):
-    def __init__(self, num_features, em_size, num_embs=100):
-        super().__init__()
-        self.num_embs = num_embs
-        self.embeddings = nn.Embedding(num_embs * num_features, em_size, max_norm=True)
-        self.init_weights(0.1)
-        self.min_max = (-2, +2)
-    @property
-    def width(self):
-        return self.min_max[1] - self.min_max[0]
-    def init_weights(self, initrange):
-        self.embeddings.weight.data.uniform_(-initrange, initrange)
-    def discretize(self, x):
-        split_size = self.width / self.num_embs
-        return (x - self.min_max[0] // split_size).int().clamp(0, self.num_embs - 1)
-    def forward(self, x):  # T x B x num_features
-        x_idxs = self.discretize(x)
-        x_idxs += (
-            torch.arange(x.shape[-1], device=x.device).view(1, 1, -1) * self.num_embs
-        )
-        # print(x_idxs,self.embeddings.weight.shape)
-        return self.embeddings(x_idxs).mean(-2)
-class Normalize(nn.Module):
-    def __init__(self, mean, std):
-        super().__init__()
-        self.mean = mean
-        self.std = std
-    def forward(self, x):
-        return (x - self.mean) / self.std
-def get_normalized_uniform_encoder(encoder_creator):
-    """
-    This can be used to wrap an encoder that is fed uniform samples in [0,1] and normalizes these to 0 mean and 1 std.
-    For example, it can be used as `encoder_creator = get_normalized_uniform_encoder(encoders.Linear)`, now this can
-    be initialized with `encoder_creator(feature_dim, in_dim)`.
-    :param encoder:
-    :return:
-    """
-    return lambda in_dim, out_dim: nn.Sequential(
-        Normalize(0.5, math.sqrt(1 / 12)), encoder_creator(in_dim, out_dim)
-    )
-# Alias for the plain torch layer. NOTE(review): this name is rebound by the
-# ``Linear`` class defined further down in this module.
-Linear = nn.Linear
-MLP = lambda num_features, emsize: nn.Sequential(
-    nn.Linear(num_features + 1, emsize * 2), nn.ReLU(), nn.Linear(emsize * 2, emsize)
-)
-class NanHandlingEncoder(nn.Module):
-    def __init__(self, num_features, emsize, keep_nans=True):
-        super().__init__()
-        self.num_features = 2 * num_features if keep_nans else num_features
-        self.emsize = emsize
-        self.keep_nans = keep_nans
-        self.layer = nn.Linear(self.num_features, self.emsize)
-    def forward(self, x):
-        if self.keep_nans:
-            x = torch.cat(
-                [
-                    torch.nan_to_num(x, nan=0.0),
-                    normalize_data(
-                        torch.isnan(x) * -1
-                        + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
-                        + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2
-                    ),
-                ],
-                -1,
-            )
-        else:
-            x = torch.nan_to_num(x, nan=0.0)
-        return self.layer(x)
-class Linear(nn.Linear):
-    def __init__(self, num_features, emsize):
-        super().__init__(num_features, emsize)
-        self.num_features = num_features
-        self.emsize = emsize
-    def forward(self, x):
-        x = torch.nan_to_num(x, nan=0.0)
-        return super().forward(x)
-class Conv(nn.Module):
-    def __init__(self, input_size, emsize):
-        super().__init__()
-        self.convs = torch.nn.ModuleList(
-            [nn.Conv2d(64 if i else 1, 64, 3) for i in range(5)]
-        )
-        self.linear = nn.Linear(64, emsize)
-    def forward(self, x):
-        size = math.isqrt(x.shape[-1])
-        assert size * size == x.shape[-1]
-        x = x.reshape(*x.shape[:-1], 1, size, size)
-        for conv in self.convs:
-            if x.shape[-1] < 4:
-                break
-            x = conv(x)
-            x.relu_()
-        x = nn.AdaptiveAvgPool2d((1, 1))(x).squeeze(-1).squeeze(-1)
-        return self.linear(x)
-class CanEmb(nn.Embedding):
-    def __init__(
-        self, num_features, num_embeddings: int, embedding_dim: int, *args, **kwargs
-    ):
-        assert embedding_dim % num_features == 0
-        embedding_dim = embedding_dim // num_features
-        super().__init__(num_embeddings, embedding_dim, *args, **kwargs)
-    def forward(self, x):
-        lx = x.long()
-        assert (lx == x).all(), "CanEmb only works with tensors of whole numbers"
-        x = super().forward(lx)
-        return x.view(*x.shape[:-2], -1)
-def get_Canonical(num_classes):
-    return lambda num_features, emsize: CanEmb(num_features, num_classes, emsize)
-def get_Embedding(num_embs_per_feature=100):
-    return lambda num_features, emsize: EmbeddingEncoder(
-        num_features, emsize, num_embs=num_embs_per_feature
-    )

lcpfn/initializers.py DELETED Viewed

@@ -1,11 +0,0 @@
-import torch
-from torch import nn
-def get_NormalInitializer(std):
-    def initializer(m):
-        if isinstance(m, nn.Linear):
-            nn.init.normal_(m.weight, 0, std)
-            nn.init.normal_(m.bias, 0, std)
-    return initializer

lcpfn/layer.py DELETED Viewed

@@ -1,179 +0,0 @@
-from functools import partial
-from typing import Optional
-from torch import Tensor
-from torch import nn
-from torch.nn.modules.transformer import *
-from torch.nn.modules.transformer import _get_activation_fn
-from torch.utils.checkpoint import checkpoint
-class TransformerEncoderLayer(nn.Module):
-    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
-    This standard encoder layer is based on the paper "Attention Is All You Need".
-    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
-    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
-    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
-    in a different way during application.
-    Compared to the stock layer, this variant adds optional pre-normalization,
-    optional recomputation of attention via ``torch.utils.checkpoint`` and a
-    three-part "global attention" mode selected by passing a tuple as ``src_mask``.
-    Args:
-        d_model: the number of expected features in the input (required).
-        nhead: the number of heads in the multiheadattention models (required).
-        dim_feedforward: the dimension of the feedforward network model (default=2048).
-        dropout: the dropout value (default=0.1).
-        activation: the activation function of intermediate layer, relu or gelu (default=relu).
-        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
-        batch_first: If ``True``, then the input and output tensors are provided
-            as (batch, seq, feature). Default: ``False``.
-        pre_norm: If ``True``, each LayerNorm is applied to the input of its sub-block
-            instead of to the output of the residual sum. Default: ``False``.
-        device, dtype: forwarded to the attention, linear and norm sub-modules.
-        recompute_attn: If ``True``, self-attention is wrapped in
-            ``torch.utils.checkpoint.checkpoint``. Default: ``False``.
-    Examples (these use the stock ``nn.TransformerEncoderLayer``)::
-        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
-        >>> src = torch.rand(10, 32, 512)
-        >>> out = encoder_layer(src)
-    Alternatively, when ``batch_first`` is ``True``:
-        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
-        >>> src = torch.rand(32, 10, 512)
-        >>> out = encoder_layer(src)
-    """
-    __constants__ = ["batch_first"]
-    def __init__(
-        self,
-        d_model,
-        nhead,
-        dim_feedforward=2048,
-        dropout=0.1,
-        activation="relu",
-        layer_norm_eps=1e-5,
-        batch_first=False,
-        pre_norm=False,
-        device=None,
-        dtype=None,
-        recompute_attn=False,
-    ) -> None:
-        factory_kwargs = {"device": device, "dtype": dtype}
-        super().__init__()
-        self.self_attn = MultiheadAttention(
-            d_model, nhead, dropout=dropout, batch_first=batch_first, **factory_kwargs
-        )
-        # Implementation of Feedforward model
-        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
-        self.dropout = Dropout(dropout)
-        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)
-        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
-        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
-        self.dropout1 = Dropout(dropout)
-        self.dropout2 = Dropout(dropout)
-        self.pre_norm = pre_norm
-        self.recompute_attn = recompute_attn
-        self.activation = _get_activation_fn(activation)
-    def __setstate__(self, state):
-        # State saved without an "activation" entry falls back to ReLU.
-        if "activation" not in state:
-            state["activation"] = F.relu
-        super().__setstate__(state)
-    def forward(
-        self,
-        src: Tensor,
-        src_mask: Optional[Tensor] = None,
-        src_key_padding_mask: Optional[Tensor] = None,
-    ) -> Tensor:
-        r"""Pass the input through the encoder layer.
-        Args:
-            src: the sequence to the encoder layer (required).
-            src_mask: the mask for the src sequence (optional). A tuple
-                ``(global_src_mask, trainset_src_mask, valset_src_mask)`` selects
-                the global-attention path below.
-            src_key_padding_mask: the mask for the src keys per batch (optional);
-                must be ``None`` in the global-attention path.
-        Shape:
-            see the docs in Transformer class.
-        """
-        # Pre-norm: normalize the attention input; post-norm: feed src unchanged.
-        if self.pre_norm:
-            src_ = self.norm1(src)
-        else:
-            src_ = src
-        if isinstance(src_mask, tuple):
-            # global attention setup
-            assert not self.self_attn.batch_first
-            assert src_key_padding_mask is None
-            global_src_mask, trainset_src_mask, valset_src_mask = src_mask
-            # Token counts are read from the first dimension of the masks; the
-            # sequence is laid out as [global tokens | train tokens | eval tokens].
-            num_global_tokens = global_src_mask.shape[0]
-            num_train_tokens = trainset_src_mask.shape[0]
-            global_tokens_src = src_[:num_global_tokens]
-            train_tokens_src = src_[
-                num_global_tokens : num_global_tokens + num_train_tokens
-            ]
-            global_and_train_tokens_src = src_[: num_global_tokens + num_train_tokens]
-            eval_tokens_src = src_[num_global_tokens + num_train_tokens :]
-            attn = (
-                partial(checkpoint, self.self_attn)
-                if self.recompute_attn
-                else self.self_attn
-            )
-            # Positional arguments follow MultiheadAttention.forward:
-            # (query, key, value, key_padding_mask, need_weights, attn_mask).
-            # Global tokens attend to global + train tokens.
-            global_tokens_src2 = attn(
-                global_tokens_src,
-                global_and_train_tokens_src,
-                global_and_train_tokens_src,
-                None,
-                True,
-                global_src_mask,
-            )[0]
-            # Train tokens attend to the global tokens only.
-            train_tokens_src2 = attn(
-                train_tokens_src,
-                global_tokens_src,
-                global_tokens_src,
-                None,
-                True,
-                trainset_src_mask,
-            )[0]
-            # Eval tokens attend to the full sequence.
-            eval_tokens_src2 = attn(
-                eval_tokens_src, src_, src_, None, True, valset_src_mask
-            )[0]
-            src2 = torch.cat(
-                [global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0
-            )
-        else:
-            if self.recompute_attn:
-                # Same positional order as above: key_padding_mask, need_weights, attn_mask.
-                src2 = checkpoint(
-                    self.self_attn,
-                    src_,
-                    src_,
-                    src_,
-                    src_key_padding_mask,
-                    True,
-                    src_mask,
-                )[0]
-            else:
-                src2 = self.self_attn(
-                    src_,
-                    src_,
-                    src_,
-                    attn_mask=src_mask,
-                    key_padding_mask=src_key_padding_mask,
-                )[0]
-        # Residual connection around attention (norm afterwards in post-norm mode).
-        src = src + self.dropout1(src2)
-        if not self.pre_norm:
-            src = self.norm1(src)
-        if self.pre_norm:
-            src_ = self.norm2(src)
-        else:
-            src_ = src
-        # Position-wise feedforward block with its own residual connection.
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src_))))
-        src = src + self.dropout2(src2)
-        if not self.pre_norm:
-            src = self.norm2(src)
-        return src

lcpfn/model.py DELETED Viewed

@@ -1,56 +0,0 @@
-import torch
-import lcpfn
-import warnings
-from lcpfn import utils
-class LCPFN(torch.nn.Module):
-    def __init__(self, model_name="EMSIZE512_NLAYERS12_NBUCKETS1000"):
-        super(LCPFN, self).__init__()
-        self.model = torch.load(
-            getattr(lcpfn, model_name) if model_name in lcpfn.model_dict else model_name
-        )
-        self.model.eval()
-    def check_input(self, x_train, x_test, y_train, y_test=None):
-        if torch.any(x_train < 0) or torch.any(x_test < 0):
-            # raise warning if input has negative values
-            raise Exception("x values should be non-negative")
-        if torch.any((0 > y_train) | (y_train > 1)) or (
-            y_test is not None and torch.any(0 < y_test < 1)
-        ):
-            # raise warning if input has values outside [0,1]
-            raise Exception(
-                "y values should be in the range [0,1]. Please set normalizer_kwargs accordingly."
-            )
-    @torch.no_grad()
-    def predict_mean(
-        self, x_train, y_train, x_test, normalizer=utils.identity_normalizer()
-    ):
-        y_train_norm = normalizer[0](y_train)
-        logits = self(x_train=x_train, y_train=y_train_norm, x_test=x_test)
-        return normalizer[1](self.model.criterion.mean(logits))
-    @torch.no_grad()
-    def predict_quantiles(
-        self, x_train, y_train, x_test, qs, normalizer=utils.identity_normalizer()
-    ):
-        y_train_norm = normalizer[0](y_train)
-        logits = self(x_train=x_train, y_train=y_train_norm, x_test=x_test)
-        return normalizer[1](
-            torch.cat([self.model.criterion.icdf(logits, q) for q in qs], dim=1)
-        )
-    @torch.no_grad()
-    def nll_loss(self, x_train, y_train, x_test, y_test):
-        # TODO add normalizer_kwargs
-        logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
-        return self.model.criterion(logits, y_test)
-    def forward(self, x_train, y_train, x_test):
-        self.check_input(x_train, x_test, y_train)
-        single_eval_pos = x_train.shape[0]
-        x = torch.cat([x_train, x_test], dim=0).unsqueeze(1)
-        y = y_train.unsqueeze(1)
-        return self.model((x, y), single_eval_pos=single_eval_pos)

lcpfn/positional_encodings.py DELETED Viewed

@@ -1,78 +0,0 @@
-import math
-import torch
-from torch import nn
-# Protocol for positional encodings.
-# __init__(d_model, max_len=..[, more optionals])
-# forward(x: (seq_len, bs, d_model)) -> Tensor of shape (*x.shape[:2],d_model) containing pos. embeddings
-class NoPositionalEncoding(nn.Module):
-    def __init__(self, d_model, max_len=None):
-        super(NoPositionalEncoding, self).__init__()
-        pass
-    def forward(self, x):
-        return x  # * math.sqrt(x.shape[-1])
-class PositionalEncoding(nn.Module):
-    def __init__(self, d_model, max_len=5000):
-        super(PositionalEncoding, self).__init__()
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
-        )
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0).transpose(0, 1)
-        self.register_buffer("pe", pe)
-    def forward(self, x):
-        x = self.pe[: x.size(0), :] + x  # * math.sqrt(x.shape[-1])
-        return x
-class LearnedPositionalEncoding(nn.Module):
-    def __init__(self, d_model, max_len=5000):
-        super(LearnedPositionalEncoding, self).__init__()
-        self.max_seq_len = max_len
-        # self.positional_embeddings = nn.Embedding(max_len, d_model)
-        self.positional_embeddings = nn.Parameter(torch.empty(max_len, d_model))
-        nn.init.normal_(self.positional_embeddings, mean=0, std=d_model**-0.5)
-    def forward(self, x):
-        seq_len, bs, d_model = x.shape
-        assert seq_len <= len(
-            self.positional_embeddings
-        ), "seq_len can be at most max_len."
-        pos_emb = self.positional_embeddings[:seq_len]
-        return (
-            pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x
-        )  # * math.sqrt(x.shape[-1])
-class PairedScrambledPositionalEncodings(LearnedPositionalEncoding):
-    # TODO check whether it is a problem to use the same perm. for full batch
-    def forward(self, x):
-        seq_len, bs, d_model = x.shape
-        assert seq_len <= len(
-            self.positional_embeddings
-        ), "seq_len can be at most max_len."
-        assert (
-            len(self.positional_embeddings) % 2 == 0
-        ), "Please specify an even max_len."
-        paired_embs = self.positional_embeddings.view(
-            len(self.positional_embeddings), -1, 2
-        )
-        pos_emb = paired_embs[torch.randperm(len(paired_embs))].view(
-            *self.positional_embeddings.shape
-        )[:seq_len]
-        return (
-            pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x
-        )  # * math.sqrt(x.shape[-1])

lcpfn/priors/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- from . import gp, ridge

lcpfn/priors/binarized_regression.py DELETED Viewed

@@ -1,19 +0,0 @@
-from . import fast_gp, fast_gp_mix
-from .utils import get_batch_to_dataloader
-def regression_prior_to_binary(get_batch_function):
-    """Wrap a regression ``get_batch`` so that its targets become Bernoulli draws.
-    The wrapper forwards all arguments to ``get_batch_function``, squashes the returned
-    ``y`` with a sigmoid, samples 0/1 labels from it and returns ``(x, labels, labels)``.
-    Passing ``assert_on=True`` additionally checks that the wrapped function returned the
-    same object as input ``y`` and as target.
-    """
-    def binarized_get_batch_function(*args, assert_on=False, **kwargs):
-        x, regression_y, target_y = get_batch_function(*args, **kwargs)
-        if assert_on:
-            assert regression_y is target_y, "y == target_y is assumed by this function"
-        probabilities = regression_y.sigmoid()
-        labels = probabilities.bernoulli()
-        return x, labels, labels
-    return binarized_get_batch_function
-# Data loaders built from the binarized variants of the fast_gp and fast_gp_mix get_batch functions.
-Binarized_fast_gp_dataloader = get_batch_to_dataloader(regression_prior_to_binary(fast_gp.get_batch))
-Binarized_fast_gp_mix_dataloader = get_batch_to_dataloader(regression_prior_to_binary(fast_gp_mix.get_batch))

lcpfn/priors/fast_gp.py DELETED Viewed

@@ -1,143 +0,0 @@
-import time
-import torch
-from torch import nn
-import gpytorch
-from .utils import get_batch_to_dataloader
-from utils import default_device
-# We will use the simplest form of GP model, exact inference
-class ExactGPModel(gpytorch.models.ExactGP):
-    """Exact-inference GP with a constant mean and a scaled RBF kernel."""
-    def __init__(self, train_x, train_y, likelihood):
-        super().__init__(train_x, train_y, likelihood)
-        self.mean_module = gpytorch.means.ConstantMean()
-        rbf_kernel = gpytorch.kernels.RBFKernel()
-        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)
-    def forward(self, x):
-        """Return the multivariate normal defined by the mean and covariance modules at ``x``."""
-        return gpytorch.distributions.MultivariateNormal(
-            self.mean_module(x), self.covar_module(x)
-        )
-def get_model(x, y, hyperparameters):
-    """Build an ``ExactGPModel`` with noise, outputscale and lengthscale taken from ``hyperparameters``.
-    Returns the ``(model, likelihood)`` pair; the noise is constrained to stay above 1e-9.
-    """
-    noise_floor = gpytorch.constraints.GreaterThan(1.e-9)
-    likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=noise_floor)
-    model = ExactGPModel(x, y, likelihood)
-    kernel = model.covar_module
-    # Each value is broadcast over the shape of the parameter it replaces.
-    model.likelihood.noise = torch.ones_like(model.likelihood.noise) * hyperparameters["noise"]
-    kernel.outputscale = torch.ones_like(kernel.outputscale) * hyperparameters["outputscale"]
-    kernel.base_kernel.lengthscale = (
-        torch.ones_like(kernel.base_kernel.lengthscale) * hyperparameters["lengthscale"]
-    )
-    return model, likelihood
-@torch.no_grad()
-def get_batch(batch_size, seq_len, num_features, device=default_device, hyperparameters=None,
-              equidistant_x=False, fix_x=None, **kwargs):
-    """Sample a batch of inputs together with targets drawn from a GP prior.
-    :param batch_size: number of sequences to sample
-    :param seq_len: number of points per sequence
-    :param num_features: dimensionality of every input point
-    :param device: device on which inputs and the model are placed
-    :param hyperparameters: dict with at least "noise", "outputscale" and "lengthscale", or the
-        legacy positional tuple/list form (which may be shorter than eight entries); ``None``
-        selects 0.1 for all three
-    :param equidistant_x: use an evenly spaced grid on [0, 1] as input (needs ``num_features == 1``)
-    :param fix_x: optional ``(seq_len, num_features)`` tensor that is repeated for every sequence
-    :param kwargs: ignored
-    :return: ``(x, y, target_y)`` with ``x`` of shape ``(seq_len, batch_size, num_features)``;
-        ``target_y`` is the noisy sample unless ``hyperparameters["observation_noise"]`` is falsy,
-        in which case it is the noise-free sample
-    """
-    if isinstance(hyperparameters, (tuple, list)):
-        # Positional layout of the legacy form; slot 4 ("num_features_used") is ignored.
-        positional_keys = ("noise", "outputscale", "lengthscale", "is_binary_classification",
-                           None, "normalize_by_used_features", "order_y", "sampling")
-        # zip stops at the shorter input, so a plain (noise, outputscale, lengthscale) triple works too.
-        hyperparameters = {key: value for key, value in zip(positional_keys, hyperparameters)
-                           if key is not None}
-    elif hyperparameters is None:
-        hyperparameters = {"noise": .1, "outputscale": .1, "lengthscale": .1}
-    if hyperparameters.get('verbose'):
-        print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
-                  , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size,
-               'sampling': hyperparameters.get('sampling', 'uniform')})
-    assert not (equidistant_x and (fix_x is not None))
-    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations', (True, True, True))):
-        if equidistant_x:
-            assert num_features == 1
-            # Created directly on ``device`` so it matches the model, like the other branches.
-            x = torch.linspace(0, 1., seq_len, device=device).unsqueeze(0).repeat(batch_size, 1).unsqueeze(-1)
-        elif fix_x is not None:
-            assert fix_x.shape == (seq_len, num_features)
-            x = fix_x.unsqueeze(0).repeat(batch_size, 1, 1).to(device)
-        elif hyperparameters.get('sampling', 'uniform') == 'uniform':
-            x = torch.rand(batch_size, seq_len, num_features, device=device)
-        else:
-            x = torch.randn(batch_size, seq_len, num_features, device=device)
-        successful_sample = False
-        while not successful_sample:
-            try:
-                with gpytorch.settings.prior_mode(True):
-                    # A fresh model is built for every attempt, so no model is needed before the loop.
-                    model, likelihood = get_model(x, torch.Tensor(), hyperparameters)
-                    model.to(device)
-                    d = model(x)
-                    sample_wo_noise = d.sample().transpose(0, 1)  # this will be the target for the loss
-                    sample = likelihood(sample_wo_noise).sample()  # this will be the input to the Transformer
-                    successful_sample = True
-            except RuntimeError: # This can happen when torch.linalg.eigh fails. Restart with new init resolves this.
-                print('GP Sampling unsuccessful, retrying.. ')
-                print(x)
-                print(hyperparameters)
-    if torch.isnan(x).any().item():
-        print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
-                  , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size})
-    # TODO: Multi output
-    return x.transpose(0, 1), sample, sample if hyperparameters.get("observation_noise", True) else sample_wo_noise
-# Data loader class produced by passing this module's get_batch to get_batch_to_dataloader.
-DataLoader = get_batch_to_dataloader(get_batch)
-def get_model_on_device(x,y,hyperparameters,device):
-    """Build the GP through ``get_model`` and move the model to ``device``."""
-    gp_model, gp_likelihood = get_model(x, y, hyperparameters)
-    gp_model.to(device)
-    return gp_model, gp_likelihood
-@torch.no_grad()
-def evaluate(x, y, y_non_noisy, use_mse=False, hyperparameters=None, get_model_on_device=get_model_on_device, device=default_device, step_size=1, start_pos=0):
-    """Score GP predictions of each position from all earlier positions.
-    :param x: inputs, indexed as ``x[t]`` along the sequence axis
-    :param y: targets aligned with ``x``
-    :param y_non_noisy: accepted so that the output of ``get_batch`` can be splatted in; not used
-    :param use_mse: use the squared error of the predictive mean instead of the negative log-probability
-    :param hyperparameters: mapping passed to ``get_model_on_device``; ``None`` means an empty dict
-    :param get_model_on_device: factory ``(x, y, hyperparameters, device) -> (model, likelihood)``
-    :param device: device handed to ``get_model_on_device``
-    :param step_size: stride over the evaluated positions
-    :param start_pos: first evaluated position (position 0 is never predicted)
-    :return: ``(all losses stacked per position, mean loss per position, elapsed seconds)``
-    """
-    # None instead of a shared mutable ``{}`` default.
-    hyperparameters = {} if hyperparameters is None else hyperparameters
-    start_time = time.time()
-    losses_after_t = [.0] if start_pos == 0 else []
-    all_losses_after_t = []
-    mse_criterion = nn.MSELoss(reduction='none')
-    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))), gpytorch.settings.fast_pred_var(False):
-        for t in range(max(start_pos, 1), len(x), step_size):
-            # The model is conditioned on the first t positions and queried at position t.
-            model, likelihood = get_model_on_device(x[:t].transpose(0, 1), y[:t].transpose(0, 1), hyperparameters, device)
-            model.eval()
-            f = model(x[t].unsqueeze(1))
-            l = likelihood(f)
-            means = l.mean.squeeze()
-            varis = l.covariance_matrix.squeeze()
-            assert len(means.shape) == len(varis.shape) == 1
-            assert len(means) == len(varis) == x.shape[1]
-            if use_mse:
-                ls = mse_criterion(means, y[t])
-            else:
-                ls = -l.log_prob(y[t].unsqueeze(1))
-            losses_after_t.append(ls.mean())
-            all_losses_after_t.append(ls.flatten())
-        return torch.stack(all_losses_after_t).to('cpu'), torch.tensor(losses_after_t).to('cpu'), time.time() - start_time
-if __name__ == '__main__':
-    # Smoke run: sample one batch from the prior and score it.
-    # A dict is required here: get_model indexes the hyperparameters by name and
-    # evaluate calls ``.get`` on them, neither of which works with a bare tuple.
-    hps = {"noise": .1, "outputscale": .1, "lengthscale": .1}
-    for redo_idx in range(1):
-        print(
-            evaluate(*get_batch(1000, 10, hyperparameters=hps, num_features=10), use_mse=False, hyperparameters=hps))

lcpfn/priors/fast_gp_mix.py DELETED Viewed

@@ -1,394 +0,0 @@
-import time
-import functools
-import random
-import math
-import traceback
-import numpy as np
-import torch
-from torch import nn
-import gpytorch
-from botorch.models import SingleTaskGP
-from botorch.models.gp_regression import MIN_INFERRED_NOISE_LEVEL
-from botorch.fit import fit_gpytorch_model
-from gpytorch.mlls import ExactMarginalLogLikelihood
-from gpytorch.likelihoods import GaussianLikelihood
-from gpytorch.priors.torch_priors import GammaPrior, UniformPrior
-from gpytorch.constraints import GreaterThan
-from bar_distribution import BarDistribution
-from utils import default_device
-from .utils import get_batch_to_dataloader
-from . import fast_gp
-def get_model(x, y, hyperparameters: dict, sample=True):
-    # Build a GP over (x, y) together with its likelihood.
-    # With hyperparameters['handmade'] set, a locally defined ExactGP with a Matern kernel and
-    # uniform priors is used; otherwise a botorch SingleTaskGP with Gamma priors whose
-    # concentration/rate values can be overridden through `hyperparameters`.
-    # With sample=True the returned model is drawn via pyro_sample_from_prior and is returned
-    # with its own likelihood; with sample=False the un-sampled model and likelihood are returned.
-    if hyperparameters.get('handmade', False):
-        # We will use the simplest form of GP model, exact inference
-        class ExactGPModel(gpytorch.models.ExactGP):
-            def __init__(self, train_x, train_y, likelihood):
-                super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
-                self.mean_module = gpytorch.means.ConstantMean()
-                self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel())
-                # Uniform priors on the mean constant, lengthscale, outputscale and noise.
-                self.mean_module.register_prior("mean_prior", UniformPrior(-1, 1), "constant")
-                self.covar_module.base_kernel.register_prior("lengthscale_prior", UniformPrior(0.01, 0.5),
-                                                              "lengthscale")
-                # model.covar_module.base_kernel.register_prior("period_length_prior", UniformPrior(0.05, 2.5), "period_length")
-                self.covar_module.register_prior("outputscale_prior", UniformPrior(1, 2), "outputscale")
-                likelihood.register_prior("noise_prior", UniformPrior(0.001, 0.01), "noise")
-                # `x` is the enclosing function's argument; the module is matched to that tensor.
-                self.to(x)
-            def forward(self, x):
-                mean_x = self.mean_module(x)
-                covar_x = self.covar_module(x)
-                return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
-        likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.Positive())
-        model = ExactGPModel(x, y, likelihood)
-    else:
-        # A throwaway SingleTaskGP is built only to read the batch shape it derives from (x, y).
-        aug_batch_shape = SingleTaskGP(x,y.unsqueeze(-1))._aug_batch_shape
-        noise_prior = GammaPrior(hyperparameters.get('noise_concentration',1.1), hyperparameters.get('noise_rate',0.05))
-        # (concentration - 1) / rate, used as the initial value of the noise constraint below.
-        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
-        likelihood = GaussianLikelihood(
-            noise_prior=noise_prior,
-            batch_shape=aug_batch_shape,
-            noise_constraint=GreaterThan(
-                MIN_INFERRED_NOISE_LEVEL,
-                transform=None,
-                initial_value=noise_prior_mode,
-            ),
-        )
-        model = SingleTaskGP(x, y.unsqueeze(-1),
-                             covar_module=gpytorch.kernels.ScaleKernel(
-                                gpytorch.kernels.MaternKernel(
-                                    nu=hyperparameters.get('nu',2.5),
-                                    ard_num_dims=x.shape[-1],
-                                    batch_shape=aug_batch_shape,
-                                    lengthscale_prior=gpytorch.priors.GammaPrior(hyperparameters.get('lengthscale_concentration',3.0), hyperparameters.get('lengthscale_rate',6.0)),
-                                ),
-                                batch_shape=aug_batch_shape,
-                                outputscale_prior=gpytorch.priors.GammaPrior(hyperparameters.get('outputscale_concentration',.5), hyperparameters.get('outputscale_rate',0.15)),
-                            ), likelihood=likelihood)
-        likelihood = model.likelihood
-        model.to(x.device)
-    if sample:
-        sampled_model = model.pyro_sample_from_prior()
-        return sampled_model, sampled_model.likelihood
-    else:
-        # The 'sigmoid' and 'y_minmax_norm' options are rejected when no model is sampled.
-        assert not(hyperparameters.get('sigmoid', False)) and not(hyperparameters.get('y_minmax_norm', False)), "Sigmoid and y_minmax_norm can only be used to sample models..."
-        return model, likelihood
-@torch.no_grad()
-def get_batch(batch_size, seq_len, num_features, device=default_device, hyperparameters=None,
-              batch_size_per_gp_sample=None,
-              fix_to_range=None, equidistant_x=False, **kwargs):
-    '''
-    This function is very similar to the equivalent in .fast_gp. The only difference is that this function operates over
-    a mixture of GP priors.
-    :param batch_size: number of sequences to return
-    :param seq_len: number of points per sequence
-    :param num_features: dimensionality of every input point
-    :param device: device on which inputs and models are placed
-    :param hyperparameters: dict forwarded to get_model; also read for 'fast_computations',
-        'handmade', 'observation_noise', 'y_minmax_norm' and 'sigmoid'
-    :param batch_size_per_gp_sample: sequences drawn per sampled GP, defaults to max(batch_size // 10, 1)
-    :param fix_to_range: optional (low, high); twice as many candidates are drawn per GP and only
-        sequences whose noisy values all lie in [low, high) are kept
-    :param equidistant_x: use an evenly spaced grid on [0, 1] as input (needs num_features == 1)
-    :param kwargs: ignored
-    :return: (x, sample, target_sample), x.shape = (T,B,H); target_sample is the noise-free sample
-        when hyperparameters['observation_noise'] is falsy, otherwise it is sample itself
-    '''
-    hyperparameters = hyperparameters or {}
-    sampling_with_observation_noise = hyperparameters.get("observation_noise", True)
-    def _postprocess(values):
-        # Optional per-sequence min-max scaling followed by an optional sigmoid.
-        if hyperparameters.get('y_minmax_norm'):
-            # keepdim so the per-sequence extrema broadcast over the time axis.
-            lowest = values.min(1, keepdim=True)[0]
-            highest = values.max(1, keepdim=True)[0]
-            values = (values - lowest) / (highest - lowest)
-        if hyperparameters.get('sigmoid'):
-            values = values.sigmoid()
-        return values
-    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))):
-        batch_size_per_gp_sample = (batch_size_per_gp_sample or max(batch_size // 10,1))
-        assert batch_size % batch_size_per_gp_sample == 0
-        total_num_candidates = batch_size*(2**(fix_to_range is not None))
-        num_candidates = batch_size_per_gp_sample * (2**(fix_to_range is not None))
-        if equidistant_x:
-            assert num_features == 1
-            x = torch.linspace(0,1.,seq_len, device=device).unsqueeze(0).repeat(total_num_candidates,1).unsqueeze(-1)
-        else:
-            x = torch.rand(total_num_candidates, seq_len, num_features, device=device)
-        accepted_xs = []
-        samples = []
-        samples_wo_noise = []
-        # Accumulated over all GP samples; it is averaged over their number when printed below.
-        throwaway_share = 0.
-        for i in range(0,total_num_candidates,num_candidates):
-            candidate_x = x[i:i+num_candidates]
-            model, likelihood = get_model(candidate_x, torch.zeros(num_candidates,x.shape[1]).to(device), hyperparameters)
-            model.to(device)
-            likelihood.to(device)
-            if hyperparameters.get('handmade', False):
-                model.covar_module.base_kernel.lengthscale = model.covar_module.base_kernel.lengthscale.to(device)
-                model.covar_module.outputscale = model.covar_module.outputscale.to(device)
-                likelihood.noise = likelihood.noise.to(device)
-                model.mean_module.constant = model.mean_module.constant.to(device)
-            failed_attempts = 0
-            while True:
-                with gpytorch.settings.prior_mode(True):
-                    if sampling_with_observation_noise:
-                        sample = likelihood(model(candidate_x)).sample() # bs_per_gp_s x T
-                        sample_wo_noise = None
-                    else:
-                        sample_wo_noise = model(candidate_x).sample()
-                        sample = likelihood(sample_wo_noise).sample()
-                        sample_wo_noise = _postprocess(sample_wo_noise)
-                    sample = _postprocess(sample)
-                    if fix_to_range is None:
-                        accepted_x = candidate_x
-                    else:
-                        out_of_range = (sample < fix_to_range[0]) | (sample >= fix_to_range[1])
-                        in_range_mask = ~out_of_range.any(1)
-                        throwaway_share += (~in_range_mask[:batch_size_per_gp_sample]).sum()/batch_size_per_gp_sample
-                        if in_range_mask.sum() < batch_size_per_gp_sample:
-                            failed_attempts += 1
-                            # Only warn once rejection keeps happening, not on every retry.
-                            if failed_attempts > 100:
-                                print("Please change hyper-parameters (e.g. decrease outputscale_mean) it "
-                                    "seems like the range is set too tight for your hyper-parameters.")
-                            continue
-                        # Inputs, noisy and noise-free samples are filtered with the same mask so they stay aligned.
-                        accepted_x = candidate_x[in_range_mask][:batch_size_per_gp_sample]
-                        sample = sample[in_range_mask][:batch_size_per_gp_sample]
-                        if sample_wo_noise is not None:
-                            sample_wo_noise = sample_wo_noise[in_range_mask][:batch_size_per_gp_sample]
-                    accepted_xs.append(accepted_x)
-                    samples.append(sample.transpose(0, 1))
-                    if sample_wo_noise is not None:
-                        samples_wo_noise.append(sample_wo_noise.transpose(0, 1))
-                    break
-        if random.random() < .01:
-            print('throwaway share', throwaway_share/(batch_size//batch_size_per_gp_sample))
-        # Concatenating the accepted inputs keeps row b of x paired with column b of sample.
-        x = torch.cat(accepted_xs, 0)
-        # TODO think about enabling the line below
-        #sample = sample - sample[0, :].unsqueeze(0).expand(*sample.shape)
-        x = x.transpose(0,1)
-        sample = torch.cat(samples, 1)
-        if sampling_with_observation_noise:
-            target_sample = sample
-        else:
-            target_sample = torch.cat(samples_wo_noise, 1)
-        assert x.shape[:2] == sample.shape[:2]
-    return x, sample, target_sample # x.shape = (T,B,H)
-class DataLoader(get_batch_to_dataloader(get_batch)):
-    """Data loader over this module's ``get_batch`` with an MSE-based ``validate`` helper."""
-    @torch.no_grad()
-    def validate(self, model, step_size=1, start_pos=0):
-        """Return the MSE of the criterion's mean prediction at each evaluated position.
-        Models whose criterion is not a ``BarDistribution`` get the placeholder value ``123.``.
-        The model is put in eval mode for the evaluation and back in train mode afterwards.
-        """
-        if not isinstance(model.criterion, BarDistribution):
-            return 123.
-        (_, x, y), target_y, _ = self.gbm(**self.get_batch_kwargs)
-        model.eval()
-        mse = nn.MSELoss()
-        losses = []
-        for eval_pos in range(start_pos, len(x), step_size):
-            logits = model((x, y), single_eval_pos=eval_pos)
-            means = model.criterion.mean(logits) # num_evals x batch_size
-            losses.append(mse(means[0], target_y[eval_pos]))
-        model.train()
-        return torch.stack(losses)
-@torch.enable_grad()
-def get_fitted_model(x, y, hyperparameters, device):
-    """Build the un-sampled GP for ``(x, y)`` and fit it with ``fit_gpytorch_model``.
-    Returns the fitted ``(model, likelihood)`` pair.
-    """
-    gp, gp_likelihood = get_model(x, y, hyperparameters, sample=False)
-    gp.to(device)
-    marginal_ll = ExactMarginalLogLikelihood(gp_likelihood, gp)
-    gp.train()
-    fit_gpytorch_model(marginal_ll)
-    return gp, gp_likelihood
-# fast_gp.evaluate with the model factory replaced by get_fitted_model.
-evaluate = functools.partial(fast_gp.evaluate, get_model_on_device=get_fitted_model)
-def get_mcmc_model(x, y, hyperparameters, device, num_samples, warmup_steps, obs=True):
-    """Run single-chain NUTS over the GP hyperparameters and load the draws into the model.
-    Returns the ``(model, likelihood)`` pair with the model in eval mode. With ``obs=False``
-    the pyro model does not condition on ``y``.
-    """
-    from pyro.infer.mcmc import NUTS, MCMC, HMC
-    import pyro
-    x, y = x.to(device), y.to(device)
-    model, likelihood = get_model(x, y, hyperparameters, sample=False)
-    model.to(device)
-    def pyro_model(x, y):
-        sampled_model = model.pyro_sample_from_prior()
-        output = sampled_model.likelihood(sampled_model(x))
-        if not obs:
-            return None
-        return pyro.sample("obs", output, obs=y)
-    sampler = MCMC(NUTS(pyro_model), num_samples=num_samples, warmup_steps=warmup_steps, num_chains=1)
-    sampler.run(x, y)
-    model.pyro_load_from_samples(sampler.get_samples())
-    model.eval()
-    return model, likelihood
-def get_mean_logdensity(dists, x: torch.Tensor, full_range=None):
-    """Return the log of the average normal density of ``x`` over all components in ``dists``.
-    The means and variances of every entry of ``dists`` are concatenated into one batch of
-    normals. With ``full_range=(low, high)`` each density is renormalised by the probability
-    mass its normal puts inside that range.
-    """
-    component_means = torch.cat([d.mean.squeeze() for d in dists], 0)
-    component_vars = torch.cat([d.variance.squeeze() for d in dists], 0)
-    assert len(component_means.shape) == 1 and len(component_vars.shape) == 1
-    dist = torch.distributions.Normal(component_means, component_vars.sqrt())
-    logprobs = dist.log_prob(x)
-    if full_range is not None:
-        mass_below = dist.cdf(torch.tensor(full_range[0]))
-        mass_above = 1. - dist.cdf(torch.tensor(full_range[1]))
-        used_weight = 1. - (mass_below + mass_above)
-        log_weight = torch.log(used_weight)
-        if torch.isinf(log_weight).any():
-            print('factor is inf', -log_weight)
-        logprobs -= log_weight
-    assert len(logprobs.shape) == 1
-    # logsumexp minus log(count) is the log of the mean density.
-    return torch.logsumexp(logprobs, 0) - math.log(len(logprobs))
-def evaluate_(x, y, y_non_noisy, hyperparameters=None, device=default_device, num_samples=100, warmup_steps=300,
-              full_range=None, min_seq_len=0, use_likelihood=False, obs=True):
-    """Score MCMC-marginalised GP predictions of each position from all earlier positions.
-    For every position t and every batch entry, a GP is conditioned on the first t points, its
-    hyperparameters are sampled with single-chain NUTS, and the negative log of the mean
-    predictive density of y[t] is recorded.
-    :param x: inputs, indexed as x[t, b]
-    :param y: targets aligned with x
-    :param y_non_noisy: accepted for signature compatibility; not used
-    :param hyperparameters: dict forwarded to get_model; None means an empty dict
-    :param device: device used for data and models
-    :param num_samples: number of MCMC samples
-    :param warmup_steps: number of MCMC warm-up steps
-    :param full_range: optional (low, high) forwarded to get_mean_logdensity
-    :param min_seq_len: first evaluated position (position 0 is never predicted)
-    :param use_likelihood: pass the predictive distribution through the likelihood before scoring
-    :param obs: condition the pyro model on the training targets
-    :return: (mean loss per position, elapsed seconds, per-position list of per-entry losses)
-    """
-    # Imported once per call instead of once per batch entry; kept local so that pyro is
-    # only needed when this function is actually used.
-    from pyro.infer.mcmc import NUTS, MCMC
-    import pyro
-    # The default of None used to crash on the .get call below.
-    hyperparameters = hyperparameters or {}
-    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))), gpytorch.settings.fast_pred_var(False):
-        x = x.to(device).double()
-        y = y.to(device).double()
-        start_time = time.time()
-        losses_after_t = [.0] if min_seq_len == 0 else []
-        all_losses = []
-        for t in range(max(min_seq_len,1), len(x)):
-            loss_sum = 0.
-            step_losses = []
-            start_step = time.time()
-            print(x.shape, y.shape)
-            for b_i in range(x.shape[1]):
-                # Slices of x and y are already on `device`.
-                x_train = x[:t,b_i]
-                y_train = y[:t,b_i]
-                print(x_train.shape, y_train.shape)
-                model, likelihood = get_model(x_train, y_train, hyperparameters, sample=False)
-                model.to(device)
-                def pyro_model(x, y):
-                    sampled_model = model.pyro_sample_from_prior()
-                    output = sampled_model.likelihood(sampled_model(x))
-                    if obs:
-                        return pyro.sample("obs", output, obs=y)
-                nuts_kernel = NUTS(pyro_model)
-                mcmc_run = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=warmup_steps, num_chains=1, disable_progbar=True)
-                mcmc_run.run(x_train, y_train)
-                model.pyro_load_from_samples(mcmc_run.get_samples())
-                model.eval()
-                with torch.no_grad():
-                    # The query point is repeated once per MCMC sample.
-                    dists = model(x[t, b_i, :].unsqueeze(
-                        0).repeat(num_samples, 1, 1))
-                    if use_likelihood:
-                        dists = likelihood(dists)
-                    l = -get_mean_logdensity([dists], y[t, b_i].repeat(num_samples), full_range)
-                    print(l)
-                step_losses.append(l.item())
-                print(f'current average loss at step {t} is {sum(step_losses)/len(step_losses)} with {(time.time()-start_step)/len(step_losses)} s per eval.')
-                loss_sum += l
-            loss_sum /= x.shape[1]
-            all_losses.append(step_losses)
-            print(f'loss after step {t} is {loss_sum}')
-            losses_after_t.append(loss_sum)
-            print(f'losses so far {torch.tensor(losses_after_t)}')
-        return torch.tensor(losses_after_t), time.time() - start_time, all_losses
-if __name__ == '__main__':
-    # Command-line driver: sample one batch from the GP mixture prior and score it with evaluate_.
-    import argparse
-    parser = argparse.ArgumentParser()
-    # batch_size, seq_len and num_samples are used in arithmetic further down and cannot be None.
-    parser.add_argument('--batch_size', type=int, required=True)
-    parser.add_argument('--seq_len', type=int, required=True)
-    parser.add_argument('--min_seq_len', type=int, default=0)
-    parser.add_argument('--warmup_steps', type=int)
-    parser.add_argument('--num_samples', type=int, required=True)
-    parser.add_argument('--min_y', type=int)
-    parser.add_argument('--max_y', type=int)
-    parser.add_argument('--dim', type=int, default=1)
-    parser.add_argument('--use_likelihood', action='store_true')
-    parser.add_argument('--device', default='cpu')
-    # The misspelled flag is kept as an alias so existing command lines keep working.
-    parser.add_argument('--outputscale_concentration', '--outputscale_concentraion',
-                        dest='outputscale_concentration', default=2., type=float)
-    parser.add_argument('--noise_concentration', default=1.1, type=float)
-    parser.add_argument('--noise_rate', default=.05, type=float)
-    parser.add_argument('--handmade', action='store_true')
-    parser.add_argument('--no_obs', action='store_true')
-    parser.add_argument('--seed', type=int, default=0)
-    args = parser.parse_args()
-    if args.min_y is not None and args.max_y is None:
-        # A one-sided range would otherwise fail later when samples are compared against None.
-        parser.error('--min_y requires --max_y')
-    import pyro
-    import gpytorch
-    print(pyro.__version__)
-    print(gpytorch.__version__)
-    print('min_y:', args.min_y)
-    full_range = (None if args.min_y is None else (args.min_y,args.max_y))
-    hps = {'handmade': args.handmade, 'outputscale_concentration': args.outputscale_concentration, 'noise_concentration': args.noise_concentration,
-           'noise_rate': args.noise_rate, 'fast_computations': (False,False,False)}
-    # A seed of 0 (the default) leaves the random generators unseeded.
-    if args.seed:
-        torch.manual_seed(args.seed)
-        np.random.seed(args.seed)
-        random.seed(args.seed)
-    x, y, _ = get_batch(args.batch_size, args.seq_len, args.dim, fix_to_range=full_range, hyperparameters=hps)
-    print('RESULT:', evaluate_(x, y, y, device=args.device, warmup_steps=args.warmup_steps,
-                               num_samples=args.num_samples, full_range=full_range, min_seq_len=args.min_seq_len,
-                               hyperparameters=hps, use_likelihood=args.use_likelihood, obs=not args.no_obs))

lcpfn/priors/gp.py DELETED Viewed

@@ -1,69 +0,0 @@
-import time
-import random
-import numpy as np
-import torch
-from torch import nn
-from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel
-from .utils import get_batch_to_dataloader
-# RBF length scale used by get_gp when none is given, and the default length_scale of evaluate.
-length_scale_sampling_gp = .6
-def get_gp(length_scale=None):
-    """Return a GP regressor with a fixed-length-scale RBF kernel and no hyperparameter optimiser.
-    A falsy ``length_scale`` selects the module default ``length_scale_sampling_gp``.
-    """
-    chosen_scale = length_scale or length_scale_sampling_gp
-    kernel = RBF(length_scale=chosen_scale, length_scale_bounds='fixed')
-    return GaussianProcessRegressor(kernel=kernel, random_state=0, optimizer=None)
-def get_batch(batch_size, seq_len, num_features, noisy_std=None):
-    """Sample uniform inputs and one GP-prior function per sequence.
-    :param batch_size: number of sequences
-    :param seq_len: number of points per sequence
-    :param num_features: dimensionality of every input point
-    :param noisy_std: passed to ``get_gp`` as the RBF length scale (despite its name)
-    :return: ``(x, y, y)`` with ``x`` of shape ``(seq_len, batch_size, num_features)`` and
-        ``y`` of shape ``(seq_len, batch_size)``
-    """
-    x_t = torch.rand(batch_size, seq_len, num_features)
-    gpr = get_gp(noisy_std)
-    y_t = torch.zeros(batch_size, seq_len)
-    for i, x_i in enumerate(x_t):
-        # randint includes both ends, while scikit-learn only accepts seeds up to 2**32 - 1.
-        seed = random.randint(0, 2 ** 32 - 1)
-        drawn = gpr.sample_y(x_i, random_state=seed).squeeze()
-        # Convert explicitly so the NumPy draw is added in y_t's dtype.
-        y_t[i] += torch.as_tensor(drawn, dtype=y_t.dtype)
-    x, y = x_t.transpose(0, 1), y_t.transpose(0, 1)
-    return x, y, y
-# Data loader class produced by passing this module's get_batch to get_batch_to_dataloader.
-DataLoader = get_batch_to_dataloader(get_batch)
-def evaluate(x, y, y_non_noisy, use_mse=False, length_scale=length_scale_sampling_gp):
-    """Score GP predictions of each position from all earlier positions.
-    For every position ``t >= 1`` and every batch entry a GP is fitted on the first ``t``
-    points and scored on point ``t`` with either the MSE of the mean or the full Gaussian
-    negative log-likelihood. ``y_non_noisy`` is not used.
-    Returns ``(mean loss per position with a leading 0, elapsed seconds)``.
-    """
-    start_time = time.time()
-    batch = x.shape[1]
-    criterion = nn.MSELoss() if use_mse else nn.GaussianNLLLoss(full=True)
-    losses_after_t = [.0]
-    for t in range(1, len(x)):
-        loss_sum = 0.
-        for b_i in range(batch):
-            gpr = get_gp(length_scale).fit(x[:t, b_i], y[:t, b_i])
-            means, stds = gpr.predict(x[t, b_i].unsqueeze(0), return_std=True)
-            assert len(means) == 1 == len(stds)
-            prediction = torch.tensor(means)
-            target = y[t, b_i].unsqueeze(-1)
-            if use_mse:
-                loss_sum += criterion(prediction, target)
-            else:
-                loss_sum += criterion(prediction, target, var=torch.tensor(stds) ** 2)
-        losses_after_t.append(loss_sum / batch)
-    return torch.tensor(losses_after_t), time.time()-start_time
-if __name__ == '__main__':
-    # Score batches sampled with length scale `ls` under the true, a larger and a smaller length scale.
-    ls = .1
-    for alpha in {ls, ls * 1.1, ls * .9}:
-        print(alpha)
-        batch = get_batch(1000, 10, noisy_std=ls, num_features=10)
-        print(evaluate(*batch, use_mse=False, length_scale=alpha))

lcpfn/priors/prior.py DELETED Viewed

@@ -1,25 +0,0 @@
-from abc import ABCMeta, abstractmethod
-from torch.utils.data import DataLoader
-class PriorDataLoader(DataLoader, metaclass=ABCMeta):
-    """Abstract description of the data loaders that serve batches sampled from a prior."""
-    @abstractmethod
-    def __init__(self, num_steps, batch_size, eval_pos_seq_len_sampler, seq_len_maximum, device, **kwargs):
-        """
-        :param num_steps: int, first argument, the number of steps to take per epoch, i.e. iteration of the DataLoader
-        :param batch_size: int, number of datasets per batch
-        :param eval_pos_seq_len_sampler: callable, it takes no arguments and returns a tuple (single eval pos, bptt)
-        :param seq_len_maximum: not described in the original - presumably the largest sequence length; confirm against implementations
-        :param device: not described in the original - presumably where batches are placed; confirm against implementations
-        :param kwargs: for future compatibility it is good to have a final all catch, as new kwargs might be introduced
-        """
-    # Expected of implementations:
-    # - A class or object variable `num_features`: int
-    # - Optional: a `validate` function that accepts a transformer model
-    # - Iterating yields batches of the form ([style], x, y), target_y, single_eval_pos
-    # - Sequence length (s) comes first, batch size (b) second: x is (s,b,num_features), y and target_y are (s,b)
-    # - style is Optional[(b,num_style_params)]; it can be omitted or set to None if it is not intended to be used
-    # For more references, see `priors/utils.py` for a pretty general implementation of a DataLoader
-    # and `train.py` for the only call of it.

lcpfn/priors/pyro.py DELETED Viewed

@@ -1,41 +0,0 @@
-import random
-import torch
-from torch import nn
-from utils import default_device
-from .utils import get_batch_to_dataloader
-def get_batch(batch_size, seq_len, batch_size_per_gp_sample=None, **config):
-    """Sample a batch of sequences from freshly constructed models.
-    :param batch_size: number of sequences to return
-    :param seq_len: forwarded to every model call as ``seq_len``
-    :param batch_size_per_gp_sample: sequences drawn per model, defaults to ``max(batch_size // 16, 1)``
-    :param config: must contain ``model``, a zero-argument factory whose result is called as
-        ``model(seq_len=...)`` and returns an ``(x, y)`` pair; ``normalize_y`` standardises both
-        ``x`` and ``y`` along the sequence axis, ``normalize`` standardises only ``x``
-    :return: ``(x, y, y)`` with sequences stacked along dimension 1
-    """
-    # max(..., 1) keeps batches smaller than 16 from producing a zero divisor below.
-    batch_size_per_gp_sample = batch_size_per_gp_sample or max(batch_size // 16, 1)
-    assert batch_size % batch_size_per_gp_sample == 0, 'Please choose a batch_size divisible by batch_size_per_gp_sample.'
-    num_models = batch_size // batch_size_per_gp_sample
-    # standard kaiming uniform init currently...
-    models = [config['model']() for _ in range(num_models)]
-    sample = [model(seq_len=seq_len) for model in models for _ in range(batch_size_per_gp_sample)]
-    def normalize_data(data):
-        # Standardise along the sequence axis; the small constant avoids division by zero.
-        mean = data.mean(0)
-        std = data.std(0) + .000001
-        return (data - mean) / std
-    x, y = zip(*sample)
-    y = torch.stack(y, 1).squeeze(-1).detach()
-    x = torch.stack(x, 1).detach()
-    if config.get('normalize_y'):
-        x, y = normalize_data(x), normalize_data(y)
-    elif config.get('normalize'):
-        # Previously guarded by the 'normalize_y' key, so 'normalize' on its own was ignored.
-        x = normalize_data(x)
-    return x, y, y
-# Data loader class produced by passing this module's get_batch to get_batch_to_dataloader.
-DataLoader = get_batch_to_dataloader(get_batch)

lcpfn/priors/ridge.py DELETED Viewed

@@ -1,37 +0,0 @@
-import random
-import time
-import numpy as np
-import torch
-from torch import nn
-from sklearn.linear_model import Ridge
-from .utils import get_batch_to_dataloader
-def get_batch(batch_size, seq_len, num_features, noisy_std = .1):
-    """Draw a batch of noisy linear-regression sequences without an intercept.
-    :param batch_size: number of independent regression problems.
-    :param seq_len: number of points per problem.
-    :param num_features: dimensionality of every input point.
-    :param noisy_std: standard deviation of the Gaussian noise added to the targets.
-    :return: ``(x, y, y_non_noisy)`` with x of shape (seq_len, batch_size, num_features) and
-        both target tensors of shape (seq_len, batch_size).
-    """
-    # One weight vector per problem, drawn from a normal with mean 0 and std 0.1.
-    weights = torch.normal(0., .1, size=(batch_size, num_features))
-    inputs = torch.rand(seq_len, batch_size, num_features)
-    # Dot product of every input point with the weight vector of its problem.
-    clean_targets = torch.einsum('bf,tbf->tb', weights, inputs)
-    noise = torch.normal(torch.zeros_like(clean_targets), noisy_std)  # noisy_std is alpha
-    return inputs, clean_targets + noise, clean_targets
-# Data loader class for the ridge prior, built by wrapping get_batch with get_batch_to_dataloader.
-DataLoader = get_batch_to_dataloader(get_batch)
-def evaluate(x,y,y_non_noisy, alpha=0.):
-    """Score sklearn ridge regression as a baseline on a batch of sequences.
-    For every position t >= 1 and every sequence, a ``Ridge(alpha=alpha)`` model is fitted on
-    the first t points and asked to predict point t; the squared error against the noise-free
-    target is averaged over the batch.
-    :param x: inputs indexed as (position, sequence, feature).
-    :param y: noisy targets indexed as (position, sequence), used for fitting.
-    :param y_non_noisy: noise-free targets indexed as (position, sequence), used for scoring.
-    :param alpha: regularization strength handed to ``Ridge``.
-    :return: ``(losses, seconds)`` where ``losses[t]`` is the mean error at position t
-        (entry 0 is fixed to 0) and ``seconds`` is the elapsed wall-clock time.
-    """
-    started = time.time()
-    num_sequences = x.shape[1]
-    squared_error = nn.MSELoss()
-    mean_losses = [.0]
-    for t in range(1, len(x)):
-        total = 0.
-        for seq in range(num_sequences):
-            ridge = Ridge(alpha=alpha)
-            ridge.fit(x[:t, seq], y[:t, seq])
-            prediction = ridge.predict(x[t, seq].unsqueeze(0))
-            total += squared_error(y_non_noisy[t, seq].unsqueeze(0), torch.tensor(prediction))
-        mean_losses.append(total / num_sequences)
-    return torch.tensor(mean_losses), time.time() - started
-if __name__ == '__main__':
-    # Smoke test: compare a few ridge regularization strengths on one sampled batch each.
-    # get_batch requires num_features; the former call omitted it and raised a TypeError.
-    # NOTE(review): 10 features is an arbitrary choice for this demo - adjust if needed.
-    for alpha in [.001,.01,.5,1.]:
-        print(alpha, evaluate(*get_batch(1000,10,num_features=10,noisy_std=.01),alpha=alpha))

lcpfn/priors/stroke.py DELETED Viewed

@@ -1,143 +0,0 @@
-from PIL import Image, ImageDraw, ImageFilter
-import random
-import math
-import torch
-import numpy as np
-from .utils import get_batch_to_dataloader
-def mnist_prior(num_classes=2, size=28, min_max_strokes=(1,3), min_max_len=(5/28,20/28), min_max_start=(2/28,25/28),
-                min_max_width=(1/28,4/28), max_offset=4/28, max_target_offset=2/28):
-    """Create one random image generator per class; every class is a fixed set of strokes.
-    All ``min_max_*`` and ``max_*`` arguments are fractions of ``size`` and are converted to
-    pixel values with ``int(size * fraction)``.
-    :param num_classes: number of generator functions to create.
-    :param size: side length in pixels of the square images.
-    :param min_max_strokes: inclusive range for the number of strokes per class.
-    :param min_max_len: range of the stroke length.
-    :param min_max_start: range of both coordinates of a stroke's start point.
-    :param min_max_width: range of the line width, drawn once per generated image.
-    :param max_offset: maximum shift applied to all strokes of one generated image.
-    :param max_target_offset: maximum jitter added to the end point of each drawn stroke.
-    :return: list of ``num_classes`` zero-argument callables, each returning a PIL image.
-    """
-    def random_length():
-        return random.randint(int(size * min_max_len[0]), int(size * min_max_len[1]))
-    def random_start_point():
-        return (random.randint(int(size * min_max_start[0]), int(size * min_max_start[1])),
-                random.randint(int(size * min_max_start[0]), int(size * min_max_start[1])))
-    classes = []
-    for _ in range(num_classes):
-        num_strokes = random.randint(*min_max_strokes)
-        # These initial draws are always replaced on the first pass of the rejection loop
-        # below (counter == 0); they are kept so the random stream stays unchanged.
-        len_strokes = [random_length() for _ in range(num_strokes)]
-        stroke_start_points = [random_start_point() for _ in range(num_strokes)]
-        stroke_directions = []
-        for stroke in range(num_strokes):
-            sp, length = stroke_start_points[stroke], len_strokes[stroke]
-            counter = 0
-            # Rejection sampling: draw directions until the stroke ends inside the canvas;
-            # on every third attempt also draw a new length and start point.
-            while True:
-                if counter % 3 == 0:
-                    length = random_length()
-                    sp = random_start_point()
-                    stroke_start_points[stroke], len_strokes[stroke] = sp, length
-                radians = random.random() * 2 * math.pi
-                x_vel = math.cos(radians) * length
-                y_vel = math.sin(radians) * length
-                new_p = (sp[0] + x_vel, sp[1] + y_vel)
-                if not any(n > size - 1 or n < 0 for n in new_p):
-                    break
-                counter += 1
-            stroke_directions.append(radians)
-        classes.append((len_strokes, stroke_start_points, stroke_directions))
-    generator_functions = []
-    for c in classes:
-        # Bind the class definition as a default argument to avoid late binding of `c`.
-        def g(c=c):
-            len_strokes, stroke_start_points, stroke_directions = c
-            canvas = Image.fromarray(np.zeros((size, size), dtype=np.uint8))
-            draw = ImageDraw.Draw(canvas)
-            width = random.randint(int(size * min_max_width[0]), int(size * min_max_width[1]))
-            offset = random.randint(int(-size * max_offset), int(size * max_offset)), random.randint(int(- size * max_offset), int(size * max_offset))
-            # The class definition is only read here. The former code appended to
-            # stroke_directions on every call, which grew the shared list without bound
-            # while it was being iterated.
-            for sp, length, radians in zip(stroke_start_points, len_strokes, stroke_directions):
-                sp = (sp[0] + offset[0], sp[1] + offset[1])
-                x_vel = math.cos(radians) * length + random.randint(int(-size * max_target_offset), int(size * max_target_offset))
-                y_vel = math.sin(radians) * length + random.randint(int(-size * max_target_offset), int(size * max_target_offset))
-                new_p = (sp[0] + x_vel, sp[1] + y_vel)
-                draw.line([round(x) for x in sp + new_p], fill=128, width=width)
-            pixels = np.array(canvas)
-            # Replace the constant stroke color by random intensities in [200, 255).
-            pixels[pixels == 128] = np.random.randint(200, 255, size=pixels.shape)[pixels == 128]
-            return Image.fromarray(pixels).filter(ImageFilter.GaussianBlur(.2))
-        generator_functions.append(g)
-    return generator_functions
-# g1,g2 = mnist_prior(2)
-# for i in [g1() for _ in range(10)]:
-#    display(i.resize((200,200)))
-from torchvision.transforms import ToTensor, ToPILImage
-def normalize(x):
-    """Shift ``x`` by its overall mean and divide by its overall std (plus a small epsilon)."""
-    centered = x - x.mean()
-    scale = x.std() + .000001
-    return centered / scale
-from os import path, listdir
-import random
-def get_batch(batch_size, seq_len, num_features=None, noisy_std=None, only_train_for_last_idx=False, normalize_x=False, num_outputs=2, use_saved_from=None, **kwargs):  # num_features = 28*28=784
-    """Build a classification batch of stroke images, one fresh class set per batch entry.
-    :param batch_size: number of sequences; each gets its own ``mnist_prior`` classes.
-    :param seq_len: number of images per sequence.
-    :param num_features: pixels per image; must be a perfect square.
-    :param noisy_std: not used by this function.
-    :param only_train_for_last_idx: if set, every class appears equally often among the first
-        ``seq_len - 1`` images (shuffled) and all targets except the last are -100.
-    :param normalize_x: if set, every image is standardized with ``normalize``.
-    :param num_outputs: number of classes per sequence.
-    :param use_saved_from: if given, a random file from the matching sub-directory is loaded
-        with ``torch.load`` and returned instead of sampling.
-    :param kwargs: forwarded to ``mnist_prior``.
-    :return: ``(x, y, target_y)`` with x of shape (seq_len, batch_size, num_features) and the
-        label tensors of shape (seq_len, batch_size).
-    """
-    if use_saved_from is not None:
-        directory = path.join(use_saved_from, f'len_{seq_len}_out_{num_outputs}_features_{num_features}_bs_{batch_size}')
-        return torch.load(path.join(directory, random.choice(listdir(directory))))
-    size = math.isqrt(num_features)
-    assert size * size == num_features, 'num_features needs to be the square of an integer.'
-    if only_train_for_last_idx:
-        assert (seq_len-1) % num_outputs == 0
-    images, labels, targets = [], [], []
-    for _ in range(batch_size):
-        gs = mnist_prior(num_outputs, size, **kwargs)
-        if not only_train_for_last_idx:
-            class_ids = [random.randint(0, len(gs) - 1) for _ in range(seq_len)]
-            target = class_ids
-        else:
-            repeats = (seq_len-1) // num_outputs
-            class_ids = [class_id for class_id in range(len(gs)) for _ in range(repeats)]
-            random.shuffle(class_ids)
-            class_ids.append(random.randint(0, len(gs) - 1))
-            # -100 marks every position except the last one.
-            target = [-100] * len(class_ids)
-            target[-1] = class_ids[-1]
-        frames = []
-        for class_id in class_ids:
-            frame = ToTensor()(gs[class_id]())
-            frames.append(normalize(frame) if normalize_x else frame)
-        images.append(torch.cat(frames, 0))
-        labels.append(torch.tensor(class_ids))
-        targets.append(torch.tensor(target))
-    x = torch.stack(images, 1).view(seq_len, batch_size, -1)
-    return x, torch.stack(labels, 1), torch.stack(targets, 1)
-# Data loader class for the stroke prior, built by wrapping get_batch with get_batch_to_dataloader.
-DataLoader = get_batch_to_dataloader(get_batch)
-# Class-level attribute; matches the default num_outputs=2 of get_batch.
-DataLoader.num_outputs = 2
-if __name__ == '__main__':
-    # Smoke test: build generators for a tiny canvas, then sample and decode one batch.
-    g1, g2 = mnist_prior(2, size=3)
-    size = 10
-    # get_batch returns (x, y, target_y); unpacking it into two names raised a ValueError.
-    x, y, target_y = get_batch(1, 10, num_features=size * size)
-    y = y.squeeze(1)
-    target_y = target_y.squeeze(1)
-    # Every row of x is one complete flattened size*size image. The former code dropped the
-    # last feature before reshaping, which cannot be viewed as (size, size).
-    for flat_image in x.squeeze(1):
-        img = ToPILImage()(flat_image.view(size, size))
-        # display(img.resize((200,200)))
-    print(y, target_y)

lcpfn/priors/utils.py DELETED Viewed

@@ -1,151 +0,0 @@
-import random
-import pandas as pd
-import torch
-from lcpfn.utils import set_locals_in_self
-from itertools import repeat
-from .prior import PriorDataLoader
-from torch import nn
-import numpy as np
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-import scipy.stats as stats
-import math
-def get_batch_to_dataloader(get_batch_method_):
-    """Create a ``PriorDataLoader`` subclass whose batches are produced by ``get_batch_method_``.
-    :param get_batch_method_: callable returning ``(x, y, target_y)`` or
-        ``(x, y, target_y, style)``; it is called with the keyword arguments given to the
-        loader's constructor plus ``single_eval_pos`` and ``seq_len``.
-    :return: the data loader class (not an instance).
-    """
-    class DL(PriorDataLoader):
-        get_batch_method = get_batch_method_
-        # Caution, you might need to set self.num_features manually if it is not part of the args.
-        def __init__(self, num_steps, **get_batch_kwargs):
-            # presumably stores num_steps and get_batch_kwargs as attributes of self (both are
-            # read back in __len__ and __iter__) - confirm against lcpfn.utils.set_locals_in_self
-            set_locals_in_self(locals())
-            # The stuff outside the or is set as class attribute before instantiation.
-            self.num_features = get_batch_kwargs.get('num_features') or self.num_features
-            print('DataLoader.__dict__', self.__dict__)
-        @staticmethod
-        def gbm(*args, eval_pos_seq_len_sampler, **kwargs):
-            # Produce one batch in the form ((style, x, y), target_y, single_eval_pos).
-            kwargs['single_eval_pos'], kwargs['seq_len'] = eval_pos_seq_len_sampler()
-            # Scales the batch size dynamically with the power of 'dynamic_batch_size'.
-            # A transformer with quadratic memory usage in the seq len would need a power of 2 to keep memory constant.
-            if 'dynamic_batch_size' in kwargs and kwargs['dynamic_batch_size'] > 0:
-                kwargs['batch_size'] = kwargs['batch_size'] * math.floor(math.pow(kwargs['seq_len_maximum'], kwargs['dynamic_batch_size']) / math.pow(kwargs['seq_len'], kwargs['dynamic_batch_size']))
-            batch = get_batch_method_(*args, **kwargs)
-            # A three-element batch carries no style; it is reported as None.
-            x, y, target_y, style = batch if len(batch) == 4 else (batch[0], batch[1], batch[2], None)
-            return (style, x, y), target_y, kwargs['single_eval_pos']
-        def __len__(self):
-            return self.num_steps
-        def __iter__(self):
-            # Lazily generate num_steps batches; every pass over the loader samples new ones.
-            return iter(self.gbm(**self.get_batch_kwargs) for _ in range(self.num_steps))
-    return DL
-"""
-import seaborn as sns
-def plot_features(data, targets, fig=None):
-    if torch.is_tensor(data):
-        data = data.detach().cpu().numpy()
-        targets = targets.detach().cpu().numpy()
-    fig2 = plt.figure(figsize=(8, 8))
-    spec2 = gridspec.GridSpec(ncols=data.shape[1], nrows=data.shape[1], figure=fig2)
-    for d in range(0, data.shape[1]):
-        for d2 in range(0, data.shape[1]):
-            sub_ax = fig2.add_subplot(spec2[d, d2])
-            if d == d2:
-                sns.kdeplot(data[:, d],hue=targets[:],ax=sub_ax,legend=False, palette="deep")
-                sub_ax.set(ylabel=None)
-            else:
-                sns.scatterplot(data[:, d], data[:, d2],
-                           hue=targets[:],legend=False, palette="deep")
-                #plt.scatter(data[:, d], data[:, d2],
-                #               c=targets[:])
-            sub_ax.get_xaxis().set_ticks([])
-            sub_ax.get_yaxis().set_ticks([])
-    plt.subplots_adjust(wspace=0.05, hspace=0.05)
-    fig2.show()
-def plot_prior(prior):
-    s = np.array([prior() for _ in range(0, 1000)])
-    count, bins, ignored = plt.hist(s, 50, density=True)
-    print(s.min())
-    plt.show()
-"""
-def trunc_norm_sampler_f(mu, sigma):
-    """Return a sampler for a normal(mu, sigma) truncated to the interval [0, 1000000]."""
-    def sample():
-        lower, upper = (0 - mu) / sigma, (1000000 - mu) / sigma
-        return stats.truncnorm(lower, upper, loc=mu, scale=sigma).rvs(1)[0]
-    return sample
-def beta_sampler_f(a, b):
-    """Return a zero-argument sampler drawing from ``np.random.beta(a, b)``."""
-    def sample():
-        return np.random.beta(a, b)
-    return sample
-def gamma_sampler_f(a, b):
-    """Return a zero-argument sampler drawing from ``np.random.gamma(a, b)``."""
-    def sample():
-        return np.random.gamma(a, b)
-    return sample
-def uniform_sampler_f(a, b):
-    """Return a zero-argument sampler drawing from ``np.random.uniform(a, b)``."""
-    def sample():
-        return np.random.uniform(a, b)
-    return sample
-def uniform_int_sampler_f(a, b):
-    """Return a zero-argument sampler that rounds a ``np.random.uniform(a, b)`` draw."""
-    def sample():
-        return round(np.random.uniform(a, b))
-    return sample
-def zipf_sampler_f(a, b, c):
-    """Return a sampler for a bounded Zipf-like distribution on the integers ``b, ..., c - 1``.
-    The probability of an integer k is proportional to ``k ** (-a)``.
-    :param a: exponent of the power law; integers and floats are both accepted.
-    :param b: smallest value that can be drawn.
-    :param c: exclusive upper bound of the values.
-    :return: zero-argument callable; every call returns a length-1 array with one draw.
-    """
-    x = np.arange(b, c)
-    # Compute the weights in floating point: NumPy refuses to raise integer arrays to
-    # negative integer powers, so an integer `a` used to fail with a ValueError.
-    weights = x.astype(float) ** (-a)
-    weights /= weights.sum()
-    # Build the distribution once instead of on every single draw.
-    distribution = stats.rv_discrete(name='bounded_zipf', values=(x, weights))
-    return lambda : distribution.rvs(1)
-def scaled_beta_sampler_f(a, b, scale, minimum):
-    """Return a sampler for ``minimum + round(beta(a, b) * (scale - minimum))``."""
-    def sample():
-        fraction = np.random.beta(a, b)
-        return minimum + round(fraction * (scale - minimum))
-    return sample
-def normalize_by_used_features_f(x, num_features_used, num_features, normalize_with_sqrt=False):
-    """Divide ``x`` by the fraction of used features, or by its square root if requested."""
-    used_fraction = num_features_used / num_features
-    divisor = used_fraction**(1 / 2) if normalize_with_sqrt else used_fraction
-    return x / divisor
-def order_by_y(x, y):
-    """Reorder x and y along the first dimension according to the sorted values of y.
-    The sort direction is chosen at random. Only ``y[:, 0, 0]`` determines the order; the
-    sorted positions are then split into two halves that are interleaved, which requires an
-    even number of positions.
-    :return: the reordered ``(x, y)``.
-    """
-    sort_key = y if random.randint(0, 1) else -y
-    ranking = torch.argsort(sort_key, dim=0)[:, 0, 0]
-    # Rows of the (2, n/2) view are the two halves; transposing and flattening interleaves them.
-    interleaved = ranking.reshape(2, -1).transpose(0, 1).reshape(-1)
-    return x[interleaved], y[interleaved]
-def randomize_classes(x, num_classes):
-    """Relabel the class ids in ``x`` with a random permutation of ``0 .. num_classes - 1``.
-    Entries of x that match none of these ids become 0.
-    """
-    class_ids = torch.arange(0, num_classes, device=x.device)
-    new_ids = torch.randperm(num_classes, device=x.device).type(x.type())
-    # One-hot match against every class id, then pick the permuted id of the matching class.
-    matches = x.unsqueeze(-1) == class_ids
-    return (matches * new_ids).sum(-1)
-class CategoricalActivation(nn.Module):
-    """Activation that turns a random subset of features into discrete class values.
-    After a Softsign, each (batch, hidden) feature is selected with probability
-    ``categorical_p``; selected features are replaced by the number of sampled boundaries they
-    exceed, shifted by ``num_classes / 2``. A further random subset of the selected features
-    (probability ``ordered_p``) has its values passed through ``randomize_classes``.
-    """
-    def __init__(self, categorical_p=0.1, ordered_p=0.7
-                 , keep_activation_size=False
-                 , num_classes_sampler=zipf_sampler_f(0.8, 1, 10)):
-        # NOTE: the default sampler is created once, when the class body is evaluated.
-        self.categorical_p = categorical_p
-        self.ordered_p = ordered_p
-        self.keep_activation_size = keep_activation_size
-        self.num_classes_sampler = num_classes_sampler
-        super().__init__()
-    def forward(self, x):
-        # x shape: T, B, H
-        x = nn.Softsign()(x)
-        num_classes = self.num_classes_sampler()
-        # Mean absolute activation per (batch, hidden) entry, used to rescale the output.
-        hid_strength = torch.abs(x).mean(0).unsqueeze(0) if self.keep_activation_size else None
-        # Boolean (B, H) mask of the features that are made categorical.
-        categorical_classes = torch.rand((x.shape[1], x.shape[2])) < self.categorical_p
-        class_boundaries = torch.zeros((num_classes - 1, x.shape[1], x.shape[2]), device=x.device, dtype=x.dtype)
-        # Sample separate boundary positions for every (batch, hidden) pair and use the
-        # activations found there as class boundaries.
-        for b in range(x.shape[1]):
-            for h in range(x.shape[2]):
-                ind = torch.randint(0, x.shape[0], (num_classes - 1,))
-                class_boundaries[:, b, h] = x[ind, b, h]
-        for b in range(x.shape[1]):
-            x_rel = x[:, b, categorical_classes[b]]
-            boundaries_rel = class_boundaries[:, b, categorical_classes[b]].unsqueeze(1)
-            # Class value = number of boundaries below the activation, shifted by num_classes / 2.
-            x[:, b, categorical_classes[b]] = (x_rel > boundaries_rel).sum(dim=0).float() - num_classes / 2
-        ordered_classes = torch.rand((x.shape[1],x.shape[2])) < self.ordered_p
-        ordered_classes = torch.logical_and(ordered_classes, categorical_classes)
-        # NOTE(review): the features selected here get their values relabeled by
-        # randomize_classes, although the mask is called "ordered" - confirm the intent.
-        x[:, ordered_classes] = randomize_classes(x[:, ordered_classes], num_classes)
-        x = x * hid_strength if self.keep_activation_size else x
-        return x

lcpfn/train.py DELETED Viewed

@@ -1,336 +0,0 @@
-import itertools
-import time
-from contextlib import nullcontext
-import torch
-from torch import nn
-from lcpfn import utils
-from lcpfn.transformer import TransformerModel
-from lcpfn.bar_distribution import (
-    BarDistribution,
-)
-from lcpfn.utils import (
-    get_cosine_schedule_with_warmup,
-    get_openai_lr,
-)
-from lcpfn import positional_encodings
-from lcpfn.utils import init_dist
-from torch.cuda.amp import autocast, GradScaler
-class Losses:
-    """Namespace of criteria that can be passed to ``train``; all use ``reduction="none"``."""
-    gaussian = nn.GaussianNLLLoss(full=True, reduction="none")
-    mse = nn.MSELoss(reduction="none")
-    # Factory rather than an instance: call it on the class with the number of classes.
-    # The all-ones weight tensor lets train read the class count from criterion.weight.
-    ce = lambda num_classes: nn.CrossEntropyLoss(
-        reduction="none", weight=torch.ones(num_classes)
-    )
-    bce = nn.BCEWithLogitsLoss(reduction="none")
-    # Alias of the BarDistribution class itself; it still has to be instantiated by the caller.
-    get_BarDistribution = BarDistribution
-def train(
-    priordataloader_class,
-    criterion,
-    encoder_generator,
-    emsize=200,
-    nhid=200,
-    nlayers=6,
-    nhead=2,
-    dropout=0.2,
-    epochs=10,
-    steps_per_epoch=100,
-    batch_size=200,
-    bptt=10,
-    lr=None,
-    weight_decay=0.0,
-    warmup_epochs=10,
-    input_normalization=False,
-    y_encoder_generator=None,
-    pos_encoder_generator=None,
-    decoder=None,
-    extra_prior_kwargs_dict=None,
-    scheduler=get_cosine_schedule_with_warmup,
-    load_weights_from_this_state_dict=None,
-    validation_period=10,
-    single_eval_pos_gen=None,
-    bptt_extra_samples=None,
-    gpu_device="cuda:0",
-    aggregate_k_gradients=1,
-    verbose=True,
-    style_encoder_generator=None,
-    epoch_callback=None,
-    initializer=None,
-    initialize_with_model=None,
-    train_mixed_precision=False,
-    saving_period=10,
-    checkpoint_file=None,
-    load_optimizer_from_this_state_dict=None,
-    output_path=None,
-    **model_extra_args,
-):
-    """Train a ``TransformerModel`` on batches sampled from a prior data loader.
-    :param priordataloader_class: loader class; instantiated with ``num_steps``, ``batch_size``,
-        ``eval_pos_seq_len_sampler``, ``seq_len_maximum``, ``device`` and the extra prior kwargs.
-    :param criterion: loss object; its type also determines the number of model outputs.
-    :param encoder_generator: called as ``encoder_generator(num_features, emsize)``.
-    :param emsize, nhid, nlayers, nhead, dropout: transformer size settings.
-    :param epochs: number of epochs; ``None`` trains until interrupted with Ctrl-C.
-    :param steps_per_epoch: batches per epoch; must be divisible by ``aggregate_k_gradients``.
-    :param bptt: sequence length (unless ``bptt_extra_samples`` is set).
-    :param lr: learning rate; ``None`` selects ``get_openai_lr(model)``.
-    :param extra_prior_kwargs_dict: additional keyword arguments for the loader (default: none).
-    :param single_eval_pos_gen: callable returning the evaluation position, or a constant.
-    :param bptt_extra_samples: if set, the sequence length is the eval position plus this value.
-    :param aggregate_k_gradients: number of batches whose gradients are accumulated per step.
-    :param epoch_callback: called on rank 0 as ``epoch_callback(model, epoch / epochs)``.
-    :param train_mixed_precision: enables autocast together with a ``GradScaler``.
-    :param saving_period, checkpoint_file: every ``saving_period`` epochs a state-dict
-        checkpoint and the pickled full model are written next to ``checkpoint_file``.
-    :param output_path: if given, the final model is saved there (rank 0 only).
-    :param model_extra_args: forwarded to ``TransformerModel``.
-    :return: on rank 0 ``(total_loss, total_positional_losses, model on cpu, dl)``, where dl
-        is None for distributed training; other ranks return None.
-    """
-    import os  # local import: only needed to derive the full-model checkpoint path
-    device = gpu_device if torch.cuda.is_available() else "cpu:0"
-    print(f"Using {device} device")
-    using_dist, rank, device = init_dist(device)
-    single_eval_pos_gen = (
-        single_eval_pos_gen
-        if callable(single_eval_pos_gen)
-        else lambda: single_eval_pos_gen
-    )
-    def eval_pos_seq_len_sampler():
-        single_eval_pos = single_eval_pos_gen()
-        if bptt_extra_samples:
-            return single_eval_pos, single_eval_pos + bptt_extra_samples
-        else:
-            return single_eval_pos, bptt
-    dl = priordataloader_class(
-        num_steps=steps_per_epoch,
-        batch_size=batch_size,
-        eval_pos_seq_len_sampler=eval_pos_seq_len_sampler,
-        seq_len_maximum=bptt + (bptt_extra_samples if bptt_extra_samples else 0),
-        device=device,
-        # None instead of a shared mutable {} default
-        **(extra_prior_kwargs_dict or {}),
-    )
-    encoder = encoder_generator(dl.num_features, emsize)
-    style_def = next(iter(dl))[0][
-        0
-    ]  # This is (style, x, y), target with x and y with batch size
-    print(f"Style definition: {style_def}")
-    style_encoder = (
-        style_encoder_generator(hyperparameter_definitions=style_def[0], em_size=emsize)
-        if (style_def is not None)
-        else None
-    )
-    # The number of model outputs follows from the kind of criterion.
-    if isinstance(criterion, nn.GaussianNLLLoss):
-        n_out = 2
-    elif (
-        isinstance(criterion, BarDistribution)
-        or "BarDistribution" in criterion.__class__.__name__
-    ):  # TODO remove this fix (only for dev)
-        n_out = criterion.num_bars
-    elif isinstance(criterion, nn.CrossEntropyLoss):
-        n_out = criterion.weight.shape[0]
-    else:
-        n_out = 1
-    model = TransformerModel(
-        encoder,
-        n_out,
-        emsize,
-        nhead,
-        nhid,
-        nlayers,
-        dropout,
-        style_encoder=style_encoder,
-        y_encoder=y_encoder_generator(1, emsize),
-        input_normalization=input_normalization,
-        pos_encoder=(
-            pos_encoder_generator or positional_encodings.NoPositionalEncoding
-        )(emsize, bptt * 2),
-        decoder=decoder,
-        init_method=initializer,
-        **model_extra_args,
-    )
-    model.criterion = criterion
-    if load_weights_from_this_state_dict is not None:
-        model.load_state_dict(load_weights_from_this_state_dict)
-    if initialize_with_model is not None:
-        model.init_from_small_model(initialize_with_model)
-    print(
-        f"Using a Transformer with {sum(p.numel() for p in model.parameters())/1000/1000:.{2}f} M parameters"
-    )
-    # Best-effort debug output comparing the new weights with the initialization model.
-    # It is skipped when no such model is given and must never abort training.
-    if initialize_with_model is not None:
-        try:
-            for (k, v), (k2, v2) in zip(
-                model.state_dict().items(), initialize_with_model.state_dict().items()
-            ):
-                print(k, ((v - v2) / v).abs().mean(), v.shape)
-        except Exception as err:
-            print(f"Could not compare weights with the initialization model: {err}")
-    model.to(device)
-    if using_dist:
-        print("Distributed training")
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[rank], output_device=rank, broadcast_buffers=False
-        )
-    # learning rate
-    if lr is None:
-        lr = get_openai_lr(model)
-        print(f"Using OpenAI max lr of {lr}.")
-    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
-    scheduler = scheduler(
-        optimizer, warmup_epochs, epochs if epochs is not None else 100
-    )  # when training for fixed time lr schedule takes 100 steps
-    if load_optimizer_from_this_state_dict is not None:
-        optimizer.load_state_dict(load_optimizer_from_this_state_dict)
-    scaler = GradScaler() if train_mixed_precision else None
-    # check that everything uses up-to-date APIs
-    utils.check_compatibility(dl)
-    def train_epoch():
-        model.train()  # Turn on the train mode
-        total_loss = 0.0
-        total_positional_losses = 0.0
-        total_positional_losses_recorded = 0
-        before_get_batch = time.time()
-        assert (
-            len(dl) % aggregate_k_gradients == 0
-        ), "Please set the number of steps per epoch s.t. `aggregate_k_gradients` divides it."
-        for batch, (data, targets, single_eval_pos) in enumerate(dl):
-            # With gradient accumulation, only synchronize on the batch that triggers the step.
-            if using_dist and not (
-                batch % aggregate_k_gradients == aggregate_k_gradients - 1
-            ):
-                cm = model.no_sync()
-            else:
-                cm = nullcontext()
-            with cm:
-                time_to_get_batch = time.time() - before_get_batch
-                before_forward = time.time()
-                with autocast(enabled=scaler is not None):
-                    # If style is set to None, it should not be transferred to device
-                    output = model(
-                        tuple(e.to(device) if torch.is_tensor(e) else e for e in data)
-                        if isinstance(data, tuple)
-                        else data.to(device),
-                        single_eval_pos=single_eval_pos,
-                    )
-                    forward_time = time.time() - before_forward
-                    if single_eval_pos is not None:
-                        targets = targets[single_eval_pos:]
-                    if isinstance(criterion, nn.GaussianNLLLoss):
-                        assert (
-                            output.shape[-1] == 2
-                        ), "need to write a little bit of code to handle multiple regression targets at once"
-                        mean_pred = output[..., 0]
-                        var_pred = output[..., 1].abs()
-                        losses = criterion(
-                            mean_pred.flatten(),
-                            targets.to(device).flatten(),
-                            var=var_pred.flatten(),
-                        )
-                    elif isinstance(criterion, (nn.MSELoss, nn.BCEWithLogitsLoss)):
-                        losses = criterion(
-                            output.flatten(), targets.to(device).flatten()
-                        )
-                    elif isinstance(criterion, nn.CrossEntropyLoss):
-                        losses = criterion(
-                            output.reshape(-1, n_out),
-                            targets.to(device).long().flatten(),
-                        )
-                    else:
-                        # Move the targets like every other branch does; this is a no-op
-                        # when the loader already produced them on the right device.
-                        losses = criterion(output, targets.to(device))
-                    losses = losses.view(*output.shape[0:2])
-                    loss = losses.mean() / aggregate_k_gradients
-                if scaler:
-                    loss = scaler.scale(loss)
-                loss.backward()
-                if batch % aggregate_k_gradients == aggregate_k_gradients - 1:
-                    if scaler:
-                        scaler.unscale_(optimizer)
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-                    try:
-                        if scaler:
-                            scaler.step(optimizer)
-                            scaler.update()
-                        else:
-                            optimizer.step()
-                    except Exception:
-                        # Not a bare except: Ctrl-C must reach the KeyboardInterrupt
-                        # handler around the epoch loop instead of being swallowed here.
-                        print("Invalid optimization step encountered")
-                    optimizer.zero_grad()
-                step_time = time.time() - before_forward
-                if not torch.isnan(loss):
-                    total_loss += losses.mean().cpu().detach()
-                    total_positional_losses += (
-                        losses.mean(1).cpu().detach()
-                        if single_eval_pos is None
-                        else nn.functional.one_hot(torch.tensor(single_eval_pos), bptt)
-                        * losses[: bptt - single_eval_pos].mean().cpu().detach()
-                    )
-                    total_positional_losses_recorded += (
-                        torch.ones(bptt)
-                        if single_eval_pos is None
-                        else nn.functional.one_hot(torch.tensor(single_eval_pos), bptt)
-                    )
-            before_get_batch = time.time()
-        return (
-            total_loss / steps_per_epoch,
-            (total_positional_losses / total_positional_losses_recorded).tolist(),
-            time_to_get_batch,
-            forward_time,
-            step_time,
-        )
-    total_loss = float("inf")
-    total_positional_losses = float("inf")
-    try:
-        for epoch in range(1, epochs + 1) if epochs is not None else itertools.count(1):
-            epoch_start_time = time.time()
-            (
-                total_loss,
-                total_positional_losses,
-                time_to_get_batch,
-                forward_time,
-                step_time,
-            ) = train_epoch()
-            if hasattr(dl, "validate") and epoch % validation_period == 0:
-                with torch.no_grad():
-                    val_score = dl.validate(model)
-            else:
-                val_score = None
-            if epoch % saving_period == 0 and checkpoint_file is not None:
-                checkpoint = {
-                    "model_state_dict": model.state_dict(),
-                    "optimizer_state_dict": optimizer.state_dict(),
-                    "epoch": epoch,
-                }
-                torch.save(checkpoint, checkpoint_file)
-                # Strip only the file extension; splitting on the first "." broke paths
-                # such as "./run/model.pt" (everything after the leading dot was lost).
-                full_model_path = os.path.splitext(checkpoint_file)[0] + "_full_model.pt"
-                torch.save(model, full_model_path)
-            if verbose:
-                print("-" * 89)
-                print(
-                    f"| end of epoch {epoch:3d} | time: {(time.time() - epoch_start_time):5.2f}s | mean loss {total_loss:5.2f} | "
-                    f"pos losses {','.join([f'{l:5.2f}' for l in total_positional_losses])}, lr {scheduler.get_last_lr()[0]}"
-                    f" data time {time_to_get_batch:5.2f} step time {step_time:5.2f}"
-                    f" forward time {forward_time:5.2f}"
-                    + (f"val score {val_score}" if val_score is not None else "")
-                )
-                print("-" * 89)
-            # stepping with wallclock time based scheduler
-            # NOTE(review): epoch / epochs raises a TypeError when epochs is None and a
-            # callback is set - decide what progress value the callback should receive then.
-            if epoch_callback is not None and rank == 0:
-                epoch_callback(model, epoch / epochs)
-            scheduler.step()
-    except KeyboardInterrupt:
-        # Ctrl-C ends training early; the current model is still saved and returned below.
-        pass
-    if rank == 0:  # trivially true for non-parallel training
-        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
-            model = model.module
-            dl = None
-        if output_path is not None:
-            torch.save(model.to("cpu"), output_path)
-            print("Checkpoint stored at ", output_path)
-        return total_loss, total_positional_losses, model.to("cpu"), dl

lcpfn/train_lcpfn.py DELETED Viewed

@@ -1,96 +0,0 @@
-import math
-from torch import nn
-from lcpfn import bar_distribution, encoders, train
-from lcpfn import utils
-from lcpfn.priors import utils as putils
-def train_lcpfn(
-    get_batch_func,
-    seq_len: int = 100,
-    emsize: int = 512,
-    nlayers: int = 12,
-    num_borders: int = 1000,
-    lr: float = 0.0001,
-    batch_size: int = 100,
-    epochs: int = 1000,
-):
-    """
-    Train a LCPFN model using the specified hyperparameters.
-    Args:
-        get_batch_func (callable): A function that returns a batch of learning curves. It is
-            called once with 10,000 curves to derive the bucket limits and afterwards by the
-            data loader during training.
-        seq_len (int, optional): The length of the input sequence. Defaults to 100.
-        emsize (int, optional): The size of the embedding layer. Defaults to 512.
-        nlayers (int, optional): The number of layers in the model. Defaults to 12.
-        num_borders (int, optional): The number of borders used to discretize the predictive
-            distribution. Defaults to 1000.
-        lr (float, optional): The learning rate for the optimizer. Defaults to 0.0001.
-        batch_size (int, optional): The batch size for training. Defaults to 100.
-        epochs (int, optional): The number of epochs to train for. Defaults to 1000.
-    Returns:
-        The return value of ``train.train``, passed through unchanged. As defined in
-        lcpfn/train.py this is the tuple ``(total_loss, total_positional_losses, model, dl)``
-        on rank 0, i.e. the trained model is the third element.
-    """
-    hps = {}
-    # PFN training hyperparameters
-    dataloader = putils.get_batch_to_dataloader(get_batch_func)  # type: ignore
-    num_features = 1
-    # Sample a large batch once; its third element is used to place the bucket borders.
-    ys = get_batch_func(
-        10_000,
-        seq_len,
-        num_features,
-        hyperparameters=hps,
-        single_eval_pos=seq_len,
-    )
-    bucket_limits = bar_distribution.get_bucket_limits(num_borders, ys=ys[2])
-    # Discretization of the predictive distributions
-    criterion = bar_distribution.FullSupportBarDistribution(bucket_limits)
-    config = dict(
-        nlayers=nlayers,
-        priordataloader_class=dataloader,
-        criterion=criterion,
-        encoder_generator=lambda in_dim, out_dim: nn.Sequential(
-            encoders.Normalize(0.0, 101.0),
-            encoders.Normalize(0.5, math.sqrt(1 / 12)),
-            encoders.Linear(in_dim, out_dim),
-        ),
-        emsize=emsize,
-        nhead=(emsize // 128),
-        warmup_epochs=(epochs // 4),
-        y_encoder_generator=encoders.get_normalized_uniform_encoder(encoders.Linear),
-        batch_size=batch_size,
-        scheduler=utils.get_cosine_schedule_with_warmup,
-        extra_prior_kwargs_dict={
-            # "num_workers": 10,
-            "num_features": num_features,
-            "hyperparameters": {
-                **hps,
-            },
-        },
-        epochs=epochs,
-        lr=lr,
-        bptt=seq_len,
-        single_eval_pos_gen=utils.get_uniform_single_eval_pos_sampler(
-            seq_len, min_len=1
-        ),
-        aggregate_k_gradients=1,
-        nhid=(emsize * 2),
-        steps_per_epoch=100,
-        train_mixed_precision=False,
-    )
-    return train.train(**config)

lcpfn/transformer.py DELETED Viewed

@@ -1,348 +0,0 @@
-import math
-from typing import Optional
-import torch
-import torch.nn as nn
-from torch import Tensor
-import torch.nn.functional as F
-from torch.nn import Module, TransformerEncoder
-from lcpfn.layer import TransformerEncoderLayer, _get_activation_fn
-from lcpfn.utils import SeqBN, bool_mask_to_att_mask
-class GELU(nn.Module):
-    def forward(self, input: Tensor) -> Tensor:
-        return F.gelu(input)
-class TransformerModel(nn.Module):
-    def __init__(
-        self,
-        encoder,
-        n_out,
-        ninp,
-        nhead,
-        nhid,
-        nlayers,
-        dropout=0.0,
-        style_encoder=None,
-        y_encoder=None,
-        pos_encoder=None,
-        decoder=None,
-        input_normalization=False,
-        init_method=None,
-        pre_norm=False,
-        activation="gelu",
-        recompute_attn=False,
-        num_global_att_tokens=0,
-        full_attention=False,
-        all_layers_same_init=True,
-    ):
-        super().__init__()
-        self.model_type = "Transformer"
-        encoder_layer_creator = lambda: TransformerEncoderLayer(
-            ninp,
-            nhead,
-            nhid,
-            dropout,
-            activation=activation,
-            pre_norm=pre_norm,
-            recompute_attn=recompute_attn,
-        )
-        self.transformer_encoder = (
-            TransformerEncoder(encoder_layer_creator(), nlayers)
-            if all_layers_same_init
-            else TransformerEncoderDiffInit(encoder_layer_creator, nlayers)
-        )
-        self.ninp = ninp
-        self.encoder = encoder
-        self.y_encoder = y_encoder
-        self.pos_encoder = pos_encoder
-        self.decoder = (
-            decoder(ninp, nhid, n_out)
-            if decoder is not None
-            else nn.Sequential(nn.Linear(ninp, nhid), GELU(), nn.Linear(nhid, n_out))
-        )
-        self.input_ln = SeqBN(ninp) if input_normalization else None
-        self.style_encoder = style_encoder
-        self.init_method = init_method
-        if num_global_att_tokens is not None:
-            assert not full_attention
-        self.global_att_embeddings = (
-            nn.Embedding(num_global_att_tokens, ninp) if num_global_att_tokens else None
-        )
-        self.full_attention = full_attention
-        self.n_out = n_out
-        self.nhid = nhid
-        self.init_weights()
-    @staticmethod
-    def generate_square_subsequent_mask(sz):
-        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-        return bool_mask_to_att_mask(mask)
-    @staticmethod
-    def generate_D_q_matrix(sz, query_size):
-        train_size = sz - query_size
-        mask = torch.zeros(sz, sz) == 0
-        mask[:, train_size:].zero_()
-        mask |= torch.eye(sz) == 1
-        return bool_mask_to_att_mask(mask)
-    @staticmethod
-    def generate_global_att_query_matrix(
-        num_global_att_tokens, seq_len, num_query_tokens
-    ):
-        train_size = seq_len + num_global_att_tokens - num_query_tokens
-        sz = seq_len + num_global_att_tokens
-        mask = torch.zeros(num_query_tokens, sz) == 0
-        mask[:, train_size:].zero_()
-        mask[:, train_size:] |= torch.eye(num_query_tokens) == 1
-        return bool_mask_to_att_mask(mask)
-    @staticmethod
-    def generate_global_att_trainset_matrix(
-        num_global_att_tokens, seq_len, num_query_tokens
-    ):
-        train_size = seq_len + num_global_att_tokens - num_query_tokens
-        trainset_size = seq_len - num_query_tokens
-        mask = torch.zeros(trainset_size, num_global_att_tokens) == 0
-        # mask[:,num_global_att_tokens:].zero_()
-        # mask[:,num_global_att_tokens:] |= torch.eye(trainset_size) == 1
-        return bool_mask_to_att_mask(mask)
-    @staticmethod
-    def generate_global_att_globaltokens_matrix(
-        num_global_att_tokens, seq_len, num_query_tokens
-    ):
-        mask = (
-            torch.zeros(
-                num_global_att_tokens,
-                num_global_att_tokens + seq_len - num_query_tokens,
-            )
-            == 0
-        )
-        return bool_mask_to_att_mask(mask)
-    def init_weights(self):
-        initrange = 1.0
-        # if isinstance(self.encoder,EmbeddingEncoder):
-        #    self.encoder.weight.data.uniform_(-initrange, initrange)
-        # self.decoder.bias.data.zero_()
-        # self.decoder.weight.data.uniform_(-initrange, initrange)
-        if self.init_method is not None:
-            self.apply(self.init_method)
-        for layer in self.transformer_encoder.layers:
-            nn.init.zeros_(layer.linear2.weight)
-            nn.init.zeros_(layer.linear2.bias)
-            attns = (
-                layer.self_attn
-                if isinstance(layer.self_attn, nn.ModuleList)
-                else [layer.self_attn]
-            )
-            for attn in attns:
-                nn.init.zeros_(attn.out_proj.weight)
-                nn.init.zeros_(attn.out_proj.bias)
-    def forward(self, src, src_mask=None, single_eval_pos=None):
-        assert isinstance(
-            src, tuple
-        ), "inputs (src) have to be given as (x,y) or (style,x,y) tuple"
-        if len(src) == 2:  # (x,y) and no style
-            src = (None,) + src
-        style_src, style_src_size = (src[0], (0 if (src[0] is None) else 1))
-        if src_mask is not None:
-            assert self.global_att_embeddings is None or isinstance(src_mask, tuple)
-        if src_mask is None:
-            x_src = src[1]
-            if self.global_att_embeddings is None:
-                full_len = len(x_src) + style_src_size
-                if self.full_attention:
-                    src_mask = bool_mask_to_att_mask(
-                        torch.ones((full_len, full_len), dtype=torch.bool)
-                    ).to(x_src.device)
-                else:
-                    src_mask = self.generate_D_q_matrix(
-                        len(x_src) + style_src_size,
-                        len(x_src) + style_src_size - single_eval_pos,
-                    ).to(x_src.device)
-            else:
-                src_mask_args = (
-                    self.global_att_embeddings.num_embeddings,
-                    len(x_src) + style_src_size,
-                    len(x_src) + style_src_size - single_eval_pos,
-                )
-                src_mask = (
-                    self.generate_global_att_globaltokens_matrix(*src_mask_args).to(
-                        x_src.device
-                    ),
-                    self.generate_global_att_trainset_matrix(*src_mask_args).to(
-                        x_src.device
-                    ),
-                    self.generate_global_att_query_matrix(*src_mask_args).to(
-                        x_src.device
-                    ),
-                )
-        style_src, x_src, y_src = src
-        x_src = self.encoder(x_src)
-        y_src = self.y_encoder(
-            y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src
-        )
-        style_src = (
-            self.style_encoder(style_src).unsqueeze(0)
-            if self.style_encoder
-            else torch.tensor([], device=x_src.device)
-        )
-        global_src = (
-            torch.tensor([], device=x_src.device)
-            if self.global_att_embeddings is None
-            else self.global_att_embeddings.weight.unsqueeze(1).repeat(
-                1, x_src.shape[1], 1
-            )
-        )
-        train_x = x_src[:single_eval_pos] + y_src[:single_eval_pos]
-        src = torch.cat([global_src, style_src, train_x, x_src[single_eval_pos:]], 0)
-        if self.input_ln is not None:
-            src = self.input_ln(src)
-        if self.pos_encoder is not None:
-            src = self.pos_encoder(src)
-        # If we have style input, drop its output
-        output = self.transformer_encoder(src, src_mask)[style_src_size:]
-        output = self.decoder(output)
-        return output[
-            single_eval_pos
-            + (
-                self.global_att_embeddings.num_embeddings
-                if self.global_att_embeddings
-                else 0
-            ) :
-        ]
-    @torch.no_grad()
-    def init_from_small_model(self, small_model):
-        assert (
-            isinstance(self.decoder, nn.Linear)
-            and isinstance(self.encoder, (nn.Linear, nn.Sequential))
-            and isinstance(self.y_encoder, (nn.Linear, nn.Sequential))
-        )
-        def set_encoder_weights(my_encoder, small_model_encoder):
-            my_encoder_linear, small_encoder_linear = (
-                (my_encoder, small_model_encoder)
-                if isinstance(my_encoder, nn.Linear)
-                else (my_encoder[-1], small_model_encoder[-1])
-            )
-            small_in_dim = small_encoder_linear.out_features
-            my_encoder_linear.weight.zero_()
-            my_encoder_linear.bias.zero_()
-            my_encoder_linear.weight[:small_in_dim] = small_encoder_linear.weight
-            my_encoder_linear.bias[:small_in_dim] = small_encoder_linear.bias
-        set_encoder_weights(self.encoder, small_model.encoder)
-        set_encoder_weights(self.y_encoder, small_model.y_encoder)
-        small_in_dim = small_model.decoder.in_features
-        self.decoder.weight[:, :small_in_dim] = small_model.decoder.weight
-        self.decoder.bias = small_model.decoder.bias
-        for my_layer, small_layer in zip(
-            self.transformer_encoder.layers, small_model.transformer_encoder.layers
-        ):
-            small_hid_dim = small_layer.linear1.out_features
-            my_in_dim = my_layer.linear1.in_features
-            # packed along q,k,v order in first dim
-            my_in_proj_w = my_layer.self_attn.in_proj_weight
-            small_in_proj_w = small_layer.self_attn.in_proj_weight
-            my_in_proj_w.view(3, my_in_dim, my_in_dim)[
-                :, :small_in_dim, :small_in_dim
-            ] = small_in_proj_w.view(3, small_in_dim, small_in_dim)
-            my_layer.self_attn.in_proj_bias.view(3, my_in_dim)[:, :small_in_dim] = (
-                small_layer.self_attn.in_proj_bias.view(3, small_in_dim)
-            )
-            my_layer.self_attn.out_proj.weight[:small_in_dim, :small_in_dim] = (
-                small_layer.self_attn.out_proj.weight
-            )
-            my_layer.self_attn.out_proj.bias[:small_in_dim] = (
-                small_layer.self_attn.out_proj.bias
-            )
-            my_layer.linear1.weight[:small_hid_dim, :small_in_dim] = (
-                small_layer.linear1.weight
-            )
-            my_layer.linear1.bias[:small_hid_dim] = small_layer.linear1.bias
-            my_layer.linear2.weight[:small_in_dim, :small_hid_dim] = (
-                small_layer.linear2.weight
-            )
-            my_layer.linear2.bias[:small_in_dim] = small_layer.linear2.bias
-            my_layer.norm1.weight[:small_in_dim] = (
-                math.sqrt(small_in_dim / my_in_dim) * small_layer.norm1.weight
-            )
-            my_layer.norm2.weight[:small_in_dim] = (
-                math.sqrt(small_in_dim / my_in_dim) * small_layer.norm2.weight
-            )
-            my_layer.norm1.bias[:small_in_dim] = small_layer.norm1.bias
-            my_layer.norm2.bias[:small_in_dim] = small_layer.norm2.bias
-class TransformerEncoderDiffInit(Module):
-    r"""TransformerEncoder is a stack of N encoder layers
-    Args:
-        encoder_layer_creator: a function generating objects of TransformerEncoderLayer class without args (required).
-        num_layers: the number of sub-encoder-layers in the encoder (required).
-        norm: the layer normalization component (optional).
-    """
-    __constants__ = ["norm"]
-    def __init__(self, encoder_layer_creator, num_layers, norm=None):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [encoder_layer_creator() for _ in range(num_layers)]
-        )
-        self.num_layers = num_layers
-        self.norm = norm
-    def forward(
-        self,
-        src: Tensor,
-        mask: Optional[Tensor] = None,
-        src_key_padding_mask: Optional[Tensor] = None,
-    ) -> Tensor:
-        r"""Pass the input through the encoder layers in turn.
-        Args:
-            src: the sequence to the encoder (required).
-            mask: the mask for the src sequence (optional).
-            src_key_padding_mask: the mask for the src keys per batch (optional).
-        Shape:
-            see the docs in Transformer class.
-        """
-        output = src
-        for mod in self.layers:
-            output = mod(
-                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
-            )
-        if self.norm is not None:
-            output = self.norm(output)
-        return output

lcpfn/utils.py DELETED Viewed

@@ -1,409 +0,0 @@
-import os
-import math
-import argparse
-import random
-import datetime
-import torch
-from torch import nn
-from torch.optim.lr_scheduler import LambdaLR
-import numpy as np
-# copied from huggingface
-def get_cosine_schedule_with_warmup(
-    optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1
-):
-    """Create a schedule with a learning rate that decreases following the
-    values of the cosine function between 0 and `pi * cycles` after a warmup
-    period during which it increases linearly between 0 and 1.
-    """
-    def lr_lambda(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        progress = float(current_step - num_warmup_steps) / float(
-            max(1, num_training_steps - num_warmup_steps)
-        )
-        return max(
-            0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
-        )
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
-# copied from huggingface
-def get_linear_schedule_with_warmup(
-    optimizer, num_warmup_steps, num_training_steps, last_epoch=-1
-):
-    """
-    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
-    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
-    Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
-            The optimizer for which to schedule the learning rate.
-        num_warmup_steps (:obj:`int`):
-            The number of steps for the warmup phase.
-        num_training_steps (:obj:`int`):
-            The total number of training steps.
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
-            The index of the last epoch when resuming training.
-    Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
-    """
-    def lr_lambda(current_step: int):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        return max(
-            0.0,
-            float(num_training_steps - current_step)
-            / float(max(1, num_training_steps - num_warmup_steps)),
-        )
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
-def get_openai_lr(transformer_model):
-    num_params = sum(p.numel() for p in transformer_model.parameters())
-    return 0.003239 - 0.0001395 * math.log(num_params)
-def get_weighted_single_eval_pos_sampler(max_len):
-    """
-    This gives a sampler that can be used for `single_eval_pos` which yields good performance for all positions p,
-    where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
-    :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
-    """
-    return lambda: random.choices(
-        range(max_len), [1 / (max_len - i) for i in range(max_len)]
-    )[0]
-def get_uniform_single_eval_pos_sampler(max_len, min_len=0):
-    """
-    Just sample any evaluation position with the same weight
-    :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
-    """
-    return lambda: random.choices(range(min_len, max_len))[0]
-class SeqBN(nn.Module):
-    def __init__(self, d_model):
-        super().__init__()
-        self.bn = nn.BatchNorm1d(d_model)
-        self.d_model = d_model
-    def forward(self, x):
-        assert self.d_model == x.shape[-1]
-        flat_x = x.view(-1, self.d_model)
-        flat_x = self.bn(flat_x)
-        return flat_x.view(*x.shape)
-def set_locals_in_self(locals):
-    """
-    Call this function like `set_locals_in_self(locals())` to set all local variables as object variables.
-    Especially useful right at the beginning of `__init__`.
-    :param locals: `locals()`
-    """
-    self = locals["self"]
-    for var_name, val in locals.items():
-        if var_name != "self":
-            setattr(self, var_name, val)
-# Device string used by default: the first CUDA device when CUDA is available, otherwise the CPU.
-default_device = "cuda:0" if torch.cuda.is_available() else "cpu:0"
-# Copied from StackOverflow, but we do an eval on the values additionally
-class StoreDictKeyPair(argparse.Action):
-    def __init__(self, option_strings, dest, nargs=None, **kwargs):
-        self._nargs = nargs
-        super(StoreDictKeyPair, self).__init__(
-            option_strings, dest, nargs=nargs, **kwargs
-        )
-    def __call__(self, parser, namespace, values, option_string=None):
-        my_dict = {}
-        for kv in values:
-            k, v = kv.split("=")
-            try:
-                my_dict[k] = eval(v)
-            except NameError:
-                my_dict[k] = v
-        setattr(namespace, self.dest, my_dict)
-        print("dict values: {}".format(my_dict))
-def get_nan_value(v, set_value_to_nan=0.0):
-    if random.random() < set_value_to_nan:
-        return v
-    else:
-        return random.choice([-999, 0, 1, 999])
-def to_ranking(data):
-    x = data >= data.unsqueeze(-3)
-    x = x.sum(0)
-    return x
-# TODO: Is there a better way to do this?
-#   1. Comparing to unique elements: When all values are different we still get quadratic blowup
-#   2. Argsort(Argsort()) returns ranking, but with duplicate values there is an ordering which is problematic
-#   3. Argsort(Argsort(Unique))->Scatter seems a bit complicated, doesn't have quadratic blowup, but how fast?
-def to_ranking_low_mem(data):
-    x = torch.zeros_like(data)
-    for col in range(data.shape[-1]):
-        x_ = data[:, :, col] >= data[:, :, col].unsqueeze(-2)
-        x_ = x_.sum(0)
-        x[:, :, col] = x_
-    return x
-def nan_handling_missing_for_unknown_reason_value(set_value_to_nan=0.0):
-    return get_nan_value(float("nan"), set_value_to_nan)
-def nan_handling_missing_for_no_reason_value(set_value_to_nan=0.0):
-    return get_nan_value(float("-inf"), set_value_to_nan)
-def nan_handling_missing_for_a_reason_value(set_value_to_nan=0.0):
-    return get_nan_value(float("inf"), set_value_to_nan)
-def torch_nanmean(x, axis=0):
-    num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(
-        axis=axis
-    )
-    value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
-    return value / num
-def torch_nanstd(x, axis=0):
-    num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(
-        axis=axis
-    )
-    value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
-    mean = value / num
-    mean_broadcast = torch.repeat_interleave(
-        mean.unsqueeze(axis), x.shape[axis], dim=axis
-    )
-    return torch.sqrt(
-        torch.nansum(torch.square(mean_broadcast - x), axis=axis) / (num - 1)
-    )
-def normalize_data(data, normalize_positions=-1):
-    if normalize_positions > 0:
-        mean = torch_nanmean(data[:normalize_positions], axis=0)
-        std = torch_nanstd(data[:normalize_positions], axis=0) + 0.000001
-    else:
-        mean = torch_nanmean(data, axis=0)
-        std = torch_nanstd(data, axis=0) + 0.000001
-    data = (data - mean) / std
-    data = torch.clip(data, min=-100, max=100)
-    return data
-def remove_outliers(X, n_sigma=4):
-    # Expects T, B, H
-    assert len(X.shape) == 3, "X must be T,B,H"
-    # for b in range(X.shape[1]):
-    # for col in range(X.shape[2]):
-    data = X
-    data_mean, data_std = torch_nanmean(data, axis=0), torch_nanstd(data, axis=0)
-    cut_off = data_std * n_sigma
-    lower, upper = data_mean - cut_off, data_mean + cut_off
-    data_clean = X[:].clone()
-    data_clean[torch.logical_or(data > upper, data < lower)] = np.nan
-    data_mean, data_std = (
-        torch_nanmean(data_clean, axis=0),
-        torch_nanstd(data_clean, axis=0),
-    )
-    cut_off = data_std * n_sigma
-    lower, upper = data_mean - cut_off, data_mean + cut_off
-    X = torch.maximum(-torch.log(1 + torch.abs(X)) + lower, X)
-    X = torch.minimum(torch.log(1 + torch.abs(X)) + upper, X)
-    # print(ds[1][data < lower, col], ds[1][data > upper, col], ds[1][~np.isnan(data), col].shape, data_mean, data_std)
-    return X
-def bool_mask_to_att_mask(mask):
-    return (
-        mask.float()
-        .masked_fill(mask == 0, float("-inf"))
-        .masked_fill(mask == 1, float(0.0))
-    )
-def print_on_master_only(is_master):
-    import builtins as __builtin__
-    builtin_print = __builtin__.print
-    def print(*args, **kwargs):
-        force = kwargs.pop("force", False)
-        if is_master or force:
-            builtin_print(*args, **kwargs)
-    __builtin__.print = print
-def init_dist(device):
-    """Set up ``torch.distributed`` when the environment indicates a multi-process launch.
-    Three cases are distinguished:
-      * ``LOCAL_RANK`` is set: the rank is read from it.
-      * ``SLURM_PROCID`` is set and more than one CUDA device is visible: the rank is read
-        from ``SLURM_PROCID`` and the master address/port are set to localhost:12355.
-      * otherwise: no distributed setup is done.
-    In all cases the builtin ``print`` is replaced via ``print_on_master_only``.
-    Args:
-        device: device string returned unchanged in the non-distributed case.
-    Returns:
-        Tuple ``(using_dist, rank, device)``; in the distributed cases the device is ``f"cuda:{rank}"``.
-    """
-    print("init dist")
-    if "LOCAL_RANK" in os.environ:
-        # launched with torch.distributed.launch
-        rank = int(os.environ["LOCAL_RANK"])
-        print("torch.distributed.launch and my rank is", rank)
-        torch.cuda.set_device(rank)
-        # NOTE(review): CUDA_VISIBLE_DEVICES is changed after the device has been selected;
-        # whether this still has an effect at this point should be confirmed.
-        os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
-        # The world size is taken from the number of visible CUDA devices.
-        torch.distributed.init_process_group(
-            backend="nccl",
-            init_method="env://",
-            timeout=datetime.timedelta(seconds=20),
-            world_size=torch.cuda.device_count(),
-            rank=rank,
-        )
-        torch.distributed.barrier()
-        # From here on only rank 0 prints unless force=True is passed.
-        print_on_master_only(rank == 0)
-        print(
-            f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
-            "only I can print, but when using print(..., force=True) it will print on all ranks."
-        )
-        return True, rank, f"cuda:{rank}"
-    elif "SLURM_PROCID" in os.environ and torch.cuda.device_count() > 1:
-        # this is for multi gpu when starting with submitit
-        assert device != "cpu:0"
-        rank = int(os.environ["SLURM_PROCID"])
-        # init_method="env://" below reads the rendezvous address from these variables.
-        os.environ["MASTER_ADDR"] = "localhost"
-        os.environ["MASTER_PORT"] = "12355"
-        torch.cuda.set_device(rank)
-        os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
-        print("distributed submitit launch and my rank is", rank)
-        torch.distributed.init_process_group(
-            backend="nccl",
-            init_method="env://",
-            timeout=datetime.timedelta(seconds=20),
-            world_size=torch.cuda.device_count(),
-            rank=rank,
-        )
-        torch.distributed.barrier()
-        print_on_master_only(rank == 0)
-        print(
-            f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
-            "only I can print, but when using print(..., force=True) it will print on all ranks."
-        )
-        return True, rank, f"cuda:{rank}"
-    else:
-        print("Not using distributed")
-        # will not change any of the behavior of print, but allows putting the force=True in the print calls
-        print_on_master_only(True)
-        return False, 0, device
-def check_compatibility(dl):
-    if hasattr(dl, "num_outputs"):
-        print(
-            "`num_outputs` for the DataLoader is deprecated. It is assumed to be 1 from now on."
-        )
-        assert dl.num_outputs != 1, (
-            "We assume num_outputs to be 1. Instead of the num_ouputs change your loss."
-            "We specify the number of classes in the CE loss."
-        )
-def pfn_normalize(
-    lb=torch.tensor(float("-inf")),
-    ub=torch.tensor(float("inf")),
-    soft_lb=0.0,
-    soft_ub=1.0,
-    minimize=False,
-):
-    """
-    LC-PFN curve prior assumes curves to be normalized within the range [0,1] and to be maximized.
-    This function builds a pair of functions that normalize and denormalize data to fit this assumption.
-    Parameters:
-        lb (torch.Tensor): Lower bound of the data.
-        ub (torch.Tensor): Upper bound of the data.
-        soft_lb (float): Soft lower bound for normalization. Default is 0.0.
-        soft_ub (float): Soft upper bound for normalization. Default is 1.0.
-        minimize (bool): If True, the original curve is a minimization. Default is False.
-    Returns: Two functions, the first for normalizing and the second for denormalizing the data.
-    Raises:
-        AssertionError: unless ``lb <= soft_lb < soft_ub <= ub``.
-    """
-    assert lb <= soft_lb and soft_lb < soft_ub and soft_ub <= ub
-    # step 1: linearly transform [soft_lb,soft_ub] to [-1,1] (where the sigmoid behaves approx linearly)
-    #    2.0/(soft_ub - soft_lb)*(x - soft_lb) - 1.0
-    # step 2: apply a vertically scaled/shifted sigmoid such that [lb,ub] --> [0,1]
-    def cinv(x):
-        # Flip the normalized value for minimization curves; this function is its own inverse.
-        return 1 - x if minimize else x
-    def lin_soft(x):
-        # Step 1: map soft_lb to -1 and soft_ub to 1.
-        return 2 / (soft_ub - soft_lb) * (x - soft_lb) - 1
-    def lin_soft_inv(y):
-        # Inverse of lin_soft.
-        return (y + 1) / 2 * (soft_ub - soft_lb) + soft_lb
-    try:
-        if torch.exp(-lin_soft(lb)) > 1e300:
-            raise RuntimeError
-        # otherwise overflow causes issues, treat these cases as if the lower bound was -infinite
-        # print(f"WARNING: {lb} --> NINF to avoid overflows ({np.exp(-lin_soft(lb))})")
-    except RuntimeError:
-        lb = torch.tensor(float("-inf"))
-    # Each branch returns (normalize, denormalize) built from a sigmoid of lin_soft(x) that is
-    # scaled by ``a`` and shifted by ``b``; the constants depend on which bounds are finite.
-    if torch.isinf(lb) and torch.isinf(ub):
-        # No finite bound: plain sigmoid and its inverse (logit).
-        return lambda x: cinv(
-            1 / (1 + torch.exp(-lin_soft(x)))
-        ), lambda y: lin_soft_inv(torch.log(cinv(y) / (1 - cinv(y))))
-    elif torch.isinf(lb):
-        # Only the upper bound is finite: scale so that ub maps to 1.
-        a = 1 + torch.exp(-lin_soft(ub))
-        return lambda x: cinv(
-            a / (1 + torch.exp(-lin_soft(x)))
-        ), lambda y: lin_soft_inv(torch.log((cinv(y) / a) / (1 - (cinv(y) / a))))
-    elif torch.isinf(ub):
-        # Only the lower bound is finite: scale and shift so that lb maps to 0.
-        a = 1 / (1 - 1 / (1 + torch.exp(-lin_soft(lb))))
-        b = 1 - a
-        return lambda x: cinv(
-            a / (1 + torch.exp(-lin_soft(x))) + b
-        ), lambda y: lin_soft_inv(
-            torch.log(((cinv(y) - b) / a) / (1 - ((cinv(y) - b) / a)))
-        )
-    else:
-        # Both bounds are finite: scale and shift so that lb maps to 0 and ub maps to 1.
-        a = (
-            1
-            + torch.exp(-lin_soft(ub))
-            + torch.exp(-lin_soft(lb))
-            + torch.exp(-lin_soft(ub) - lin_soft(lb))
-        ) / (torch.exp(-lin_soft(lb)) - torch.exp(-lin_soft(ub)))
-        b = -a / (1 + torch.exp(-lin_soft(lb)))
-        return lambda x: cinv(
-            a / (1 + torch.exp(-lin_soft(x))) + b
-        ), lambda y: lin_soft_inv(
-            torch.log(((cinv(y) - b) / a) / (1 - ((cinv(y) - b) / a)))
-        )
-def get_default_normalizer():
-    default_normalizer_kwargs = {
-        "lb": torch.tensor(0.0),
-        "ub": torch.tensor(1.0),
-        "soft_lb": 0.0,
-        "soft_ub": 1.0,
-        "minimize": False,
-    }
-    return pfn_normalize(**default_normalizer_kwargs)
-def identity_normalizer():
-    return lambda x: x, lambda x: x

lcpfn/version.py DELETED Viewed

@@ -1 +0,0 @@
-__version__ = "0.1.3"

pyproject.toml DELETED Viewed

@@ -1,42 +0,0 @@
-[project]
-name = "lcpfn"
-description = "In-context Bayesian Learning Curve Extrapolation"
-readme = {file = "readme.md", content-type = 'text/markdown'}
-license = {file = "LICENSE"}
-authors = [
-    {name = "Steven Adriaensen", email= "adriaens@cs.uni-freiburg.de"},
-    {name = "Herilalaina Rakotoarison", email = "rakotoah@cs.uni-freiburg.de"},
-    {name = "Samuel Müller", email = "muellesa@cs.uni-freiburg.de"},
-    {name = "Frank Hutter", email = "fh@cs.uni-freiburg.de"},
-]
-requires-python = ">=3.9,<3.12"
-dependencies = [
-    "torch<=1.11.0",
-    "numpy>=1.21.2,<2",
-    "requests>=2.23.0"
-]
-dynamic = ["version"]
-classifiers = [
-  'Intended Audience :: Science/Research',
-  'License :: OSI Approved :: MIT License',
-  'Programming Language :: Python',
-  'Topic :: Software Development',
-  'Topic :: Scientific/Engineering',
-  'Operating System :: Unix',
-  'Operating System :: MacOS',
-  'Programming Language :: Python :: 3',
-  'Programming Language :: Python :: 3.9',
-  'Programming Language :: Python :: 3.10',
-  'Programming Language :: Python :: 3.11',
-]
-[project.urls]
-homepage = "https://github.com/automl/lcpfn"
-repository = "https://github.com/automl/lcpfn"
-bugtracker = "https://github.com/automl/lcpfn/issues"
-[tool.setuptools.packages.find]
-include = ["lcpfn*"]
-[tool.setuptools.dynamic]
-version = {attr = "lcpfn.version.__version__"}

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch<=1.11.0
+numpy>=1.21.2,<2
+lcpfn==0.1.3