Spaces:

herilalaina
/

lcpfn

Sleeping

App Files Files Community

herilalaina committed on Dec 8, 2023

Commit

b1beb2e

•

1 Parent(s): 8d12475

add demos

Browse files

Files changed (46) hide show

app.py +101 -0
lcpfn/.ipynb_checkpoints/__init__-checkpoint.py +53 -0
lcpfn/.ipynb_checkpoints/curves-checkpoint.py +277 -0
lcpfn/.ipynb_checkpoints/domhan_prior-checkpoint.py +195 -0
lcpfn/__init__.py +53 -0
lcpfn/__pycache__/__init__.cpython-310.pyc +0 -0
lcpfn/__pycache__/bar_distribution.cpython-310.pyc +0 -0
lcpfn/__pycache__/curves.cpython-310.pyc +0 -0
lcpfn/__pycache__/domhan_prior.cpython-310.pyc +0 -0
lcpfn/__pycache__/encoders.cpython-310.pyc +0 -0
lcpfn/__pycache__/layer.cpython-310.pyc +0 -0
lcpfn/__pycache__/model.cpython-310.pyc +0 -0
lcpfn/__pycache__/positional_encodings.cpython-310.pyc +0 -0
lcpfn/__pycache__/train.cpython-310.pyc +0 -0
lcpfn/__pycache__/train_lcpfn.cpython-310.pyc +0 -0
lcpfn/__pycache__/transformer.cpython-310.pyc +0 -0
lcpfn/__pycache__/utils.cpython-310.pyc +0 -0
lcpfn/bar_distribution.py +269 -0
lcpfn/curves.py +277 -0
lcpfn/decoders.py +30 -0
lcpfn/domhan_prior.py +195 -0
lcpfn/encoders.py +161 -0
lcpfn/initializers.py +9 -0
lcpfn/layer.py +126 -0
lcpfn/model.py +29 -0
lcpfn/positional_encodings.py +70 -0
lcpfn/priors/__init__.py +1 -0
lcpfn/priors/__pycache__/__init__.cpython-310.pyc +0 -0
lcpfn/priors/__pycache__/gp.cpython-310.pyc +0 -0
lcpfn/priors/__pycache__/prior.cpython-310.pyc +0 -0
lcpfn/priors/__pycache__/ridge.cpython-310.pyc +0 -0
lcpfn/priors/__pycache__/utils.cpython-310.pyc +0 -0
lcpfn/priors/binarized_regression.py +19 -0
lcpfn/priors/fast_gp.py +143 -0
lcpfn/priors/fast_gp_mix.py +394 -0
lcpfn/priors/gp.py +69 -0
lcpfn/priors/prior.py +25 -0
lcpfn/priors/pyro.py +41 -0
lcpfn/priors/ridge.py +37 -0
lcpfn/priors/stroke.py +143 -0
lcpfn/priors/utils.py +151 -0
lcpfn/train.py +602 -0
lcpfn/train_lcpfn.py +92 -0
lcpfn/transformer.py +226 -0
lcpfn/utils.py +258 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import lcpfn
+import torch
+model = lcpfn.LCPFN()
def line_plot_fn(data, cutoff, ci_form):
    """Extrapolate a learning curve with the global LC-PFN ``model`` and plot it.

    Args:
        data: DataFrame with a ``y`` column, one row per step. Unobserved
            steps may be left as empty strings, but only as a trailing block.
        cutoff: Number of leading observations handed to the model as context.
        ci_form: Confidence level in percent (e.g. 90, 95, 99).

    Returns:
        The matplotlib figure showing the target curve, the median
        extrapolation, the confidence band and the cutoff position.

    Raises:
        gr.Error: If the curve contains non-trailing gaps or non-numeric
            values, or if ``cutoff`` is out of range.
    """
    cutoff = int(cutoff)
    ci = int(ci_form)
    n_rows = len(data)
    empty_values = list(data[data.y == ""].index)
    if empty_values:
        # Only a trailing block of empty cells is acceptable: a gap in the
        # middle would silently shift every later observation to a wrong step.
        trailing = list(range(n_rows - len(empty_values), n_rows))
        if empty_values != trailing:
            raise gr.Error("Please enter a valid learning curve.")
        data = data[data.y != ""]
    if cutoff < 1:
        raise gr.Error(f"Cutoff ({cutoff}) must be at least 1.")
    if len(data) < cutoff:
        raise gr.Error(f"Cutoff ({cutoff}) cannot be greater than the number of data points ({len(data)}).")
    try:
        # assign() returns a new frame, so the caller's (possibly filtered)
        # frame is never written to in place.
        data = data.assign(y=data["y"].astype(float))
    except (ValueError, TypeError) as err:
        raise gr.Error("Please enter a valid learning curve.") from err
    x = torch.arange(1, 51).unsqueeze(1)
    y = torch.from_numpy(data.y.values).float().unsqueeze(1)
    rest_prob = (1 - (ci / 100)) / 2
    predictions = model.predict_quantiles(
        x_train=x[:cutoff],
        y_train=y[:cutoff],
        x_test=x[(cutoff - 1):],
        qs=[rest_prob, 0.5, 1 - rest_prob],
    )
    fig, ax = plt.subplots()
    # Only as many x positions as there are observed points (trailing empty
    # rows were dropped above).
    ax.plot(x[:len(data)], data.y, "black", label="target")
    # plot extrapolation
    ax.plot(x[(cutoff - 1):], predictions[:, 1], "blue", label="Extrapolation by PFN")
    ax.fill_between(
        x[(cutoff - 1):].flatten(),
        predictions[:, 0],
        predictions[:, 2],
        color="blue",
        alpha=0.2,
        label=f"CI of {ci}%",  # reflect the level actually requested
    )
    # plot cutoff
    ax.vlines(cutoff, 0, 1, linewidth=0.5, color="k", label="cutoff", linestyles="dashed")
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 50)
    ax.legend(loc="lower right")
    ax.set_xlabel("t")
    ax.set_ylabel("y")
    return fig
# Build the example gallery: ten synthetic curves drawn from the LC-PFN prior,
# each shown either noise-free or with observation noise (50/50).
examples = []
for _ in range(10):
    prior = lcpfn.sample_from_prior(np.random)
    clean_curve, noisy_curve = prior()
    curve = noisy_curve if np.random.rand() < 0.5 else clean_curve
    df = pd.DataFrame.from_records(curve[:50][..., np.newaxis], columns=["y"])
    df["t"] = list(range(1, 50 + 1))
    examples.append([df[["t", "y"]], 10])

# NOTE(review): this column is created outside the Blocks context below and
# `components` is not referenced again in this file — confirm whether it is
# still needed.
with gr.Column() as components:
    gr.Number(value=10)
    gr.Number(value=10)

# UI layout: editable curve table + controls on the left, plot on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            dataform = gr.Dataframe(
                value=examples[0][0],
                headers=["t", "y"],
                datatype=["number", "number"],
                row_count=(50, "fixed"),
                col_count=(2, "fixed"),
                type="pandas",
            )
            with gr.Row():
                cutoffform = gr.Number(label="cutoff", value=10)
                ci_form = gr.Dropdown(label="Confidence Interval", choices=[
                    ("90%", 90),
                    ("95%", 95),
                    ("99%", 99)
                ], value=90)
            btn = gr.Button("Run")
        outputform = gr.Plot()
    btn.click(fn=line_plot_fn, inputs=[dataform, cutoffform, ci_form], outputs=outputform)
    gr.Examples(examples, inputs=[dataform], label="Examples of synthetic learning curves")

if __name__ == "__main__":
    demo.launch()

lcpfn/.ipynb_checkpoints/__init__-checkpoint.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import os, sys
+sys.path.insert(0, os.path.dirname(__file__))
+model_path = 'trained_models'
def prepare_models():
    """Make sure the pretrained checkpoints exist inside the package directory.

    For each known checkpoint: if the ``.pt`` file is missing, download its
    ``.gz`` archive (unless one is already on disk) and decompress it next to
    it, keeping the archive. Progress and failures are reported with
    ``print``; a failed download or a corrupt archive does not raise.
    """
    import gzip
    import shutil

    pfns4bo_dir = os.path.dirname(__file__)
    model_names = ['pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt',
                   'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt']
    for name in model_names:
        weights_path = os.path.join(pfns4bo_dir, model_path, name)
        compressed_weights_path = os.path.join(pfns4bo_dir, model_path, name + '.gz')
        if not os.path.exists(weights_path):
            if not os.path.exists(compressed_weights_path):
                print("Downloading", os.path.abspath(compressed_weights_path))
                import requests
                url = f'https://github.com/automl/lcpfn/raw/main/lcpfn/trained_models/{name + ".gz"}'
                try:
                    r = requests.get(url, allow_redirects=True, timeout=60)
                    # Without this an HTTP error page would be saved as the archive.
                    r.raise_for_status()
                except requests.RequestException as err:
                    print("Download failed:", err)
                else:
                    os.makedirs(os.path.dirname(compressed_weights_path), exist_ok=True)
                    with open(compressed_weights_path, 'wb') as f:
                        f.write(r.content)
            if os.path.exists(compressed_weights_path):
                print("Unzipping", name)
                # Pure-Python equivalent of `gzip -dk`: works without an
                # external binary and with paths containing spaces.
                try:
                    with gzip.open(compressed_weights_path, 'rb') as src, \
                            open(weights_path, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                except (OSError, EOFError) as err:
                    print("Failed to unzip", compressed_weights_path, err)
                    # Do not leave a truncated checkpoint behind.
                    if os.path.exists(weights_path):
                        os.remove(weights_path)
            else:
                print("Failed to find", compressed_weights_path)
                print("Make sure you have an internet connection to download the model automatically..")
        if os.path.exists(weights_path):
            print("Successfully located model at", weights_path)
# Maps the public model identifiers to the checkpoint files expected inside
# the package's model directory.
model_dict = dict(
    EMSIZE512_NLAYERS12_NBUCKETS1000=os.path.join(
        os.path.dirname(__file__),
        model_path,
        'pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt',
    ),
    EMSIZE512_NLAYERS6_NBUCKETS1000=os.path.join(
        os.path.dirname(__file__),
        model_path,
        'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt',
    ),
)
def __getattr__(name):
    """Module-level attribute hook resolving model identifiers to file paths.

    Accessing an attribute named like a key of ``model_dict`` returns the
    checkpoint path, first triggering ``prepare_models()`` when the file is
    not on disk yet. Any other name raises ``AttributeError``.
    """
    if name not in model_dict:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
    checkpoint = model_dict[name]
    if not os.path.exists(checkpoint):
        print("Can't find", os.path.abspath(checkpoint), "thus unzipping/downloading models now.")
        print("This might take a while..")
        prepare_models()
    return checkpoint
+from lcpfn.model import LCPFN
+from lcpfn.train_lcpfn import train_lcpfn
+from lcpfn.domhan_prior import sample_from_prior, create_get_batch_func

lcpfn/.ipynb_checkpoints/curves-checkpoint.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import numpy as np
+from collections import OrderedDict
# Hyper-prior table: curve family -> distribution flavour -> ordered mapping
# of parameter name to the sampling spec consumed by `prior_sampler`
# ("type" plus its two parameters).
prior = {
    "pow3": {
        "uniform": OrderedDict([
            ("a", dict(type="uniform", param1=-1, param2=1)),
            ("c", dict(type="uniform", param1=0, param2=1)),
            ("alpha", dict(type="uniform", param1=0, param2=1)),
        ]),
        "peaked": OrderedDict([
            ("a", dict(type="uniform", param1=-0.6, param2=0.6)),
            ("c", dict(type="uniform", param1=0, param2=1.25)),
            ("alpha", dict(type="log_normal", param1=0, param2=2)),
        ]),
    },
    "ilog2": {
        "uniform": OrderedDict([
            ("c", dict(type="uniform", param1=0, param2=1)),
            ("a", dict(type="uniform", param1=-1, param2=1)),
        ]),
        "peaked": OrderedDict([
            ("c", dict(type="uniform", param1=0, param2=1)),
            ("a", dict(type="uniform", param1=-0.5, param2=0.5)),
        ]),
    },
    "janoschek": {
        "uniform": OrderedDict([
            ("a", dict(type="uniform", param1=0, param2=1)),
            ("beta", dict(type="uniform", param1=0, param2=2)),
            ("k", dict(type="uniform", param1=0, param2=1)),
            ("delta", dict(type="uniform", param1=-5, param2=5)),
        ]),
        "peaked": OrderedDict([
            ("a", dict(type="uniform", param1=0, param2=1)),
            ("beta", dict(type="uniform", param1=0, param2=2)),
            ("k", dict(type="log_normal", param1=-2, param2=1)),
            ("delta", dict(type="log_normal", param1=0, param2=0.5)),
        ]),
    },
}
def prior_sampler(rng, type, param1, param2):
    """Draw one value from the distribution named by ``type``.

    ``"uniform"`` calls ``rng.uniform(param1, param2)`` and ``"log_normal"``
    calls ``rng.lognormal(param1, param2)``; any other name raises
    ``Exception``.
    """
    if type == "uniform":
        return rng.uniform(param1, param2)
    if type == "log_normal":
        return rng.lognormal(param1, param2)
    raise Exception("Unknown prior type: {}".format(type))
def pow3(x, c, a, alpha):
    """Three-parameter power law: ``c - a * x ** (-alpha)``."""
    decay = (x) ** (-alpha)
    return c - a * decay
def prior_pow3(rng):
    """Sample ``pow3`` parameters (a, c, alpha) from the "peaked" prior table."""
    spec = prior["pow3"]["peaked"]
    sampled = {}
    for name in ["a", "c", "alpha"]:
        entry = spec[name]
        sampled[name] = prior_sampler(
            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
        )
    return sampled
def uniform_prior_pow3(rng):
    """Sample ``pow3`` parameters (a, c, alpha) from the "uniform" prior table."""
    spec = prior["pow3"]["uniform"]
    sampled = {}
    for name in ["a", "c", "alpha"]:
        entry = spec[name]
        sampled[name] = prior_sampler(
            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
        )
    return sampled
def ilog2(x, c, a):
    """Inverse-log curve: ``c - a / log(x + 1)``."""
    log_term = np.log(x + 1)
    return c - a / (log_term)
def prior_ilog2(rng):
    """Sample ``ilog2`` parameters (a, c) from the "peaked" prior table."""
    spec = prior["ilog2"]["peaked"]
    sampled = {}
    for name in ["a", "c"]:
        entry = spec[name]
        sampled[name] = prior_sampler(
            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
        )
    return sampled
def uniform_prior_ilog2(rng):
    """Sample ``ilog2`` parameters (a, c) from the "uniform" prior table."""
    spec = prior["ilog2"]["uniform"]
    sampled = {}
    for name in ["a", "c"]:
        entry = spec[name]
        sampled[name] = prior_sampler(
            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
        )
    return sampled
def janoschek(x, a, beta, k, delta):
    """
    Janoschek growth curve: ``a - (a - beta) * exp(-k * x**delta)``.
    Reference: http://www.pisces-conservation.com/growthhelp/janoschek.htm
    """
    damping = np.exp(-k * x**delta)
    return a - (a - beta) * damping
def prior_janoschek(rng):
    """Sample ``janoschek`` parameters (a, beta, k, delta) from the "peaked" prior table."""
    spec = prior["janoschek"]["peaked"]
    sampled = {}
    for name in ["a", "beta", "k", "delta"]:
        entry = spec[name]
        sampled[name] = prior_sampler(
            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
        )
    return sampled
def uniform_prior_janoschek(rng):
    """Sample ``janoschek`` parameters (a, beta, k, delta) from the "uniform" prior table."""
    spec = prior["janoschek"]["uniform"]
    sampled = {}
    for name in ["a", "beta", "k", "delta"]:
        entry = spec[name]
        sampled[name] = prior_sampler(
            rng, entry["type"], param1=entry["param1"], param2=entry["param2"]
        )
    return sampled
def log_power(x, a, b, c):
    """Log-power curve: ``a / (1 + (x / e**b) ** c)``.

    At x = 1 this evaluates to ``a / (1 + (1 / e**b) ** c)``.
    """
    scaled = x / np.exp(b)
    return a / (1.0 + (scaled) ** c)
def prior_log_power(rng):
    """Sample ``log_power`` parameters: a ~ N(0.8, 0.1), b ~ N(1, 1), c ~ U(-3, 0)."""
    # Dict values are evaluated in order, so the draws happen as a, b, c.
    return {
        "a": rng.normal(0.8, 0.1),
        "b": rng.normal(1.0, 1.0),
        "c": rng.uniform(-3.0, 0.0),
    }
def weibull(x, alpha, beta, kappa, delta):
    """
    Weibull model: ``alpha - (alpha - beta) * exp(-(kappa * x) ** delta)``.
    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
    alpha: upper asymptote
    beta: lower asymptote
    kappa: growth rate
    delta: controls the x-ordinate for the point of inflection
    """
    exponent = -((kappa * x) ** delta)
    return alpha - (alpha - beta) * np.exp(exponent)
def prior_weibull(rng):
    """Sample ``weibull`` parameters.

    alpha ~ U(0, 1.5), beta ~ U(0, 1), ln(kappa) ~ N(-2, 1), ln(delta) ~ N(0, 0.5).
    """
    # Dict values are evaluated in order: alpha, beta, kappa, delta.
    return {
        "alpha": rng.uniform(0.0, 1.5),
        "beta": rng.uniform(0.0, 1),
        "kappa": np.exp(rng.normal(-2.0, 1.0)),
        "delta": np.exp(rng.normal(0, 0.5)),
    }
def mmf(x, alpha, beta, kappa, delta):
    """
    Morgan-Mercer-Flodin curve:
    ``alpha - (alpha - beta) / (1 + (kappa * x) ** delta)``.
    References:
    Nonlinear Regression page 342
    http://bit.ly/1jodG17
    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
    alpha: upper asymptote
    kappa: growth rate
    beta: initial value
    delta: controls the point of inflection
    """
    denominator = 1.0 + (kappa * x) ** delta
    return alpha - (alpha - beta) / (denominator)
def prior_mmf(rng):
    """Sample ``mmf`` parameters.

    alpha ~ N(0.8, 0.1), beta ~ N(0.2, 0.1), ln(kappa) ~ N(0, 2), ln(delta) ~ N(0, 1).
    """
    # Dict values are evaluated in order: alpha, beta, kappa, delta.
    return {
        "alpha": rng.normal(0.8, 0.1),
        "beta": rng.normal(0.2, 0.1),
        "kappa": np.exp(rng.normal(0, 2)),
        "delta": np.exp(rng.normal(0, 1)),
    }
def vap(x, a, b, c):
    """Vapor pressure model: ``exp(a + b / x + c * log(x))``.

    Notes carried over from the original comments: no upper bound if c > 0;
    a = ln(upper bound) for c = 0; a + b = ln(initial).
    """
    exponent = a + b / x + c * np.log(x)
    return np.exp(exponent)
def prior_vap(rng):
    """Sample ``vap`` parameters: a ~ U(-2, 0), b ~ U(-4, 0), ln(c) ~ U(-8, 0)."""
    # Dict values are evaluated in order: a, b, c.
    return {
        "a": rng.uniform(-2.0, 0.0),  # @heri: range check
        "b": rng.uniform(-4.0, 0.0),  # @heri: range check
        "c": np.exp(rng.uniform(-8.0, 0.0)),  # @heri: same as weights
    }
def loglog_linear(x, a, b):
    """Log of a linear function of log(x): ``log(a * log(x) + b)``."""
    log_x = np.log(x)
    return np.log(a * log_x + b)
def prior_loglog_linear(rng):
    """Sample ``loglog_linear`` parameters: ln(a) ~ N(-2, 1), ln(b) ~ U(0, 1)."""
    # Dict values are evaluated in order: a, then b.
    return {
        "a": np.exp(rng.normal(-2.0, 1.0)),
        "b": np.exp(rng.uniform(0.0, 1.0)),
    }
def exp4(x, c, a, b, alpha):
    """Four-parameter exponential: ``c - exp(-a * x**alpha + b)``."""
    exponent = -a * (x**alpha) + b
    return c - np.exp(exponent)
def prior_exp4(rng):
    """Sample ``exp4`` parameters.

    c ~ N(0.8, 0.1), ln(a) ~ N(-2, 1), ln(alpha) ~ N(0, 1), ln(b) ~ N(0, 0.5),
    drawn in that order.
    """
    # Tuple elements are evaluated left to right, fixing the draw order.
    c, a, alpha, b = (
        rng.normal(0.8, 0.1),
        np.exp(rng.normal(-2, 1)),
        np.exp(rng.normal(0, 1)),
        np.exp(rng.normal(0, 0.5)),
    )
    return {"a": a, "b": b, "c": c, "alpha": alpha}
def pow4(x, c, a, b, alpha):
    """Four-parameter power law: ``c - (a * x + b) ** (-alpha)``."""
    base = a * x + b
    return c - (base) ** -alpha
def prior_pow4(rng):
    """Sample ``pow4`` parameters.

    ln(1 - c) ~ U(-5, 0), ln(a) ~ N(-3, 2), ln(alpha) ~ N(0, 1), ln(b) ~ U(0, 1),
    drawn in that order.
    """
    # Tuple elements are evaluated left to right, fixing the draw order.
    c, a, alpha, b = (
        1 - np.exp(rng.uniform(-5.0, 0)),
        np.exp(rng.normal(-3.0, 2)),
        np.exp(rng.normal(0, 1)),
        np.exp(rng.uniform(0, 1)),
    )
    return {"a": a, "b": b, "c": c, "alpha": alpha}
def dr_hill_zero_background(x, theta, eta, kappa):
    """Hill curve with zero background: ``theta * x**eta / (kappa**eta + x**eta)``.

    theta: upper bound
    eta: growth rate
    At x = 1 this evaluates to ``theta / (kappa**eta + 1)``.
    """
    numerator = theta * x**eta
    return (numerator) / (kappa**eta + x**eta)
def prior_dr_hill_zero_background(rng):
    """Sample ``dr_hill_zero_background`` parameters.

    theta ~ N(0.8, 0.1), ln(eta) ~ N(1, 1), ln(kappa) ~ N(1, 2).
    """
    # Dict values are evaluated in order: theta, eta, kappa.
    return {
        "theta": rng.normal(0.8, 0.1),
        "eta": np.exp(rng.normal(1.0, 1.0)),
        "kappa": np.exp(rng.normal(1.0, 2.0)),
    }

lcpfn/.ipynb_checkpoints/domhan_prior-checkpoint.py ADDED Viewed

	@@ -0,0 +1,195 @@

+from functools import partial
+import torch
+import numpy as np
+from lcpfn.curves import (
+    pow3,
+    ilog2,
+    janoschek,
+    log_power,
+    prior_ilog2,
+    uniform_prior_pow3,
+    weibull,
+    mmf,
+    vap,
+    loglog_linear,
+    exp4,
+    pow4,
+    dr_hill_zero_background,
+)
+from lcpfn.curves import (
+    prior_pow3,
+    prior_janoschek,
+    prior_log_power,
+    prior_weibull,
+    prior_mmf,
+    prior_vap,
+    prior_loglog_linear,
+    prior_exp4,
+    prior_pow4,
+    prior_dr_hill_zero_background,
+)
+from lcpfn.curves import (
+    uniform_prior_pow3,
+    uniform_prior_ilog2,
+    uniform_prior_janoschek,
+)
def prior_weights(
    rng,
    components=(
        "pow3",
        "ilog2",
        "janoschek",
        "log_power",
        "weibull",
        "mmf",
        "vap",
        "loglog_linear",
        "exp4",
        "pow4",
        "dr_hill_zero_background",
    ),
):
    """Draw one independent U(0, 1) mixing weight per curve component.

    Args:
        rng: Object exposing ``uniform(low, high, size=...)`` (e.g. a NumPy
            ``RandomState`` or the ``numpy.random`` module).
        components: Names of the curve families to weight. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        Dict mapping each component name to its weight, in the given order.
    """
    K = len(components)
    weights = rng.uniform(0.0, 1, size=(K,))
    return dict(zip(components, weights))
def sample_from_prior(rng, seq_len=100):
    """Sample a curve generator from the default prior.

    Delegates to ``sample_prior_comb`` with the pow3, ilog2 and janoschek
    components and the "peaked" parameter distribution.
    """
    default_components = ["pow3", "ilog2", "janoschek"]
    return sample_prior_comb(
        rng=rng,
        seq_len=seq_len,
        components=default_components,
        distribution="peaked",
    )
def sample_prior_comb(
    rng,
    components,
    distribution,
    var_lnloc=-4,
    var_lnscale=1,
    range_constraint=True,
    seq_len=100,
):
    """Sample a weighted combination of parametric curves by rejection sampling.

    Args:
        rng: NumPy-style random generator (``uniform``, ``normal``, ...).
        components: Names of the curve families to combine.
        distribution: ``"peaked"`` or ``"uniform"`` parameter priors. The
            uniform table only covers pow3, ilog2 and janoschek.
        var_lnloc: Mean of the normal from which ln(noise scale) is drawn.
        var_lnscale: Standard deviation of that normal.
        range_constraint: If true, reject curves leaving [0, 1].
        seq_len: Number of points; the curve is evaluated at x = 1..seq_len.

    Returns:
        A zero-argument function returning ``(y, y_noisy)``: the noiseless
        curve and a fresh noisy realisation of it on every call.

    Raises:
        NotImplementedError: If ``distribution`` is not a known name.
    """
    # Validate first; the original `raise NotImplemented()` raised a TypeError
    # because NotImplemented is a constant, not an exception class.
    if distribution not in ("peaked", "uniform"):
        raise NotImplementedError(f"Unknown distribution: {distribution!r}")

    f_components = {
        "pow3": pow3,
        "ilog2": ilog2,
        "janoschek": janoschek,
        "log_power": log_power,
        "weibull": weibull,
        "mmf": mmf,
        "vap": vap,
        "loglog_linear": loglog_linear,
        "exp4": exp4,
        "pow4": pow4,
        "dr_hill_zero_background": dr_hill_zero_background,
    }

    if distribution == "peaked":
        f_priors = {
            "pow3": prior_pow3,
            "ilog2": prior_ilog2,
            "janoschek": prior_janoschek,
            "log_power": prior_log_power,
            "weibull": prior_weibull,
            "mmf": prior_mmf,
            "vap": prior_vap,
            "loglog_linear": prior_loglog_linear,
            "exp4": prior_exp4,
            "pow4": prior_pow4,
            "dr_hill_zero_background": prior_dr_hill_zero_background,
        }
    else:
        f_priors = {
            "pow3": uniform_prior_pow3,
            "ilog2": uniform_prior_ilog2,
            "janoschek": uniform_prior_janoschek,
        }

    x = np.arange(1, seq_len + 1)

    # NOTE(review): this loop has no iteration cap; it only terminates once a
    # sampled curve passes the acceptance test below.
    while True:
        # sample the noiseless curve
        weights = prior_weights(rng, components=components)
        y = np.zeros(x.shape, dtype="float")
        for f, w in weights.items():
            kwargs = f_priors[f](rng)
            y += w * f_components[f](x, **kwargs)
        # add noise (can exceed [0,1], but afaik no way to implement this prior in Tobis work)
        # Drawn inside the loop (also for rejected curves) so the sequence of
        # random draws matches the original implementation.
        var = np.exp(
            rng.normal(var_lnloc, var_lnscale)
        )  # @heri: ln_prob =+ log(normal.pdf(log(var), loc=var_lnloc, scale=var_lnscale))

        # reject any curves that are non-increasing, exceed the [0,1] range
        rejected = (
            y[-1] <= y[0]
            or (range_constraint and (np.any(y < 0) or np.any(y > 1)))
            or np.isnan(y).any()
        )
        if not rejected:
            break

    def curve():  # generates a sample from the same model, but with independent noise
        # `var` is passed as the scale argument of rng.normal.
        y_noisy = y + rng.normal(np.zeros_like(y), var)
        return y, y_noisy

    return curve
def generate_prior_dataset(n, prior=None, seed=42):
    """
    Returns a fixed sample from the prior (with fixed seq_len) as an n x seq_len np.ndarray

    Args:
        n: Number of curves to draw.
        prior: Callable ``prior(rng)`` returning a curve function whose call
            yields ``(y, y_noisy)``. Defaults to ``sample_from_prior``; the
            previous default, ``sample_prior_comb``, could not be called with
            ``rng`` alone because it requires ``components`` and
            ``distribution``.
        seed: Seed of the ``RandomState`` used for all draws.

    Returns:
        Array of the noisy curves stacked along the first axis.
    """
    if prior is None:
        prior = sample_from_prior
    rng = np.random.RandomState(seed)
    prior_data = np.stack([prior(rng)()[1] for _ in range(n)])
    return prior_data
def create_get_batch_func(prior):
    """Return ``get_batch_domhan`` with its ``prior`` argument pre-bound."""
    bound_get_batch = partial(get_batch_domhan, prior=prior)
    return bound_get_batch
+# function producing batches for PFN training
def get_batch_domhan(
    batch_size,
    seq_len,
    num_features,
    prior,
    device="cpu",
    noisy_target=True,
    **_,
):
    """Produce one training batch of curves sampled from ``prior``.

    Args:
        batch_size: Number of curves in the batch.
        seq_len: Number of points per curve (x runs from 1 to seq_len).
        num_features: Must be 1.
        prior: Callable ``prior(rng, seq_len=...)`` returning a curve function
            whose call yields ``(y, y_noisy)``.
        device: Torch device the tensors are moved to.
        noisy_target: If true the target equals the noisy curve, otherwise
            the noiseless one.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        ``(x, y_noisy, y_target)`` as float32 tensors of shape
        (seq_len, batch_size, num_features), (seq_len, batch_size) and
        (seq_len, batch_size).
    """
    assert num_features == 1

    y_target = np.empty((batch_size, seq_len), dtype=float)
    y_noisy = np.empty((batch_size, seq_len), dtype=float)

    for i in range(batch_size):
        curve_func = prior(np.random, seq_len=seq_len)  # uses numpy rng
        clean, noisy = curve_func()
        y_noisy[i] = noisy
        y_target[i] = noisy if noisy_target else clean

    # turn numpy arrays into correctly shaped torch tensors & move them to device
    x = (
        torch.arange(1, seq_len + 1)
        .repeat((num_features, batch_size, 1))
        .transpose(2, 0)
        .to(device)
    )
    y_target = torch.from_numpy(y_target).transpose(1, 0).to(device)
    y_noisy = torch.from_numpy(y_noisy).transpose(1, 0).to(device)

    # cast everything to float32
    x = x.float()
    y_target = y_target.float()
    y_noisy = y_noisy.float()

    return x, y_noisy, y_target

lcpfn/__init__.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import os, sys
+sys.path.insert(0, os.path.dirname(__file__))
+model_path = 'trained_models'
def prepare_models():
    """Make sure the pretrained checkpoints exist inside the package directory.

    For each known checkpoint: if the ``.pt`` file is missing, download its
    ``.gz`` archive (unless one is already on disk) and decompress it next to
    it, keeping the archive. Progress and failures are reported with
    ``print``; a failed download or a corrupt archive does not raise.
    """
    import gzip
    import shutil

    pfns4bo_dir = os.path.dirname(__file__)
    model_names = ['pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt',
                   'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt']
    for name in model_names:
        weights_path = os.path.join(pfns4bo_dir, model_path, name)
        compressed_weights_path = os.path.join(pfns4bo_dir, model_path, name + '.gz')
        if not os.path.exists(weights_path):
            if not os.path.exists(compressed_weights_path):
                print("Downloading", os.path.abspath(compressed_weights_path))
                import requests
                url = f'https://github.com/automl/lcpfn/raw/main/lcpfn/trained_models/{name + ".gz"}'
                try:
                    r = requests.get(url, allow_redirects=True, timeout=60)
                    # Without this an HTTP error page would be saved as the archive.
                    r.raise_for_status()
                except requests.RequestException as err:
                    print("Download failed:", err)
                else:
                    os.makedirs(os.path.dirname(compressed_weights_path), exist_ok=True)
                    with open(compressed_weights_path, 'wb') as f:
                        f.write(r.content)
            if os.path.exists(compressed_weights_path):
                print("Unzipping", name)
                # Pure-Python equivalent of `gzip -dk`: works without an
                # external binary and with paths containing spaces.
                try:
                    with gzip.open(compressed_weights_path, 'rb') as src, \
                            open(weights_path, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                except (OSError, EOFError) as err:
                    print("Failed to unzip", compressed_weights_path, err)
                    # Do not leave a truncated checkpoint behind.
                    if os.path.exists(weights_path):
                        os.remove(weights_path)
            else:
                print("Failed to find", compressed_weights_path)
                print("Make sure you have an internet connection to download the model automatically..")
        if os.path.exists(weights_path):
            print("Successfully located model at", weights_path)
# Maps the public model identifiers to the checkpoint files expected inside
# the package's model directory.
model_dict = dict(
    EMSIZE512_NLAYERS12_NBUCKETS1000=os.path.join(
        os.path.dirname(__file__),
        model_path,
        'pfn_EPOCH1000_EMSIZE512_NLAYERS12_NBUCKETS1000.pt',
    ),
    EMSIZE512_NLAYERS6_NBUCKETS1000=os.path.join(
        os.path.dirname(__file__),
        model_path,
        'pfn_EPOCH1000_EMSIZE512_NLAYERS6_NBUCKETS1000.pt',
    ),
)
def __getattr__(name):
    """Module-level attribute hook resolving model identifiers to file paths.

    Accessing an attribute named like a key of ``model_dict`` returns the
    checkpoint path, first triggering ``prepare_models()`` when the file is
    not on disk yet. Any other name raises ``AttributeError``.
    """
    if name not in model_dict:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
    checkpoint = model_dict[name]
    if not os.path.exists(checkpoint):
        print("Can't find", os.path.abspath(checkpoint), "thus unzipping/downloading models now.")
        print("This might take a while..")
        prepare_models()
    return checkpoint
+from lcpfn.model import LCPFN
+from lcpfn.train_lcpfn import train_lcpfn
+from lcpfn.domhan_prior import sample_from_prior, create_get_batch_func

lcpfn/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (2.03 kB). View file

lcpfn/__pycache__/bar_distribution.cpython-310.pyc ADDED Viewed

Binary file (9.96 kB). View file

lcpfn/__pycache__/curves.cpython-310.pyc ADDED Viewed

Binary file (6.81 kB). View file

lcpfn/__pycache__/domhan_prior.cpython-310.pyc ADDED Viewed

Binary file (3.92 kB). View file

lcpfn/__pycache__/encoders.cpython-310.pyc ADDED Viewed

Binary file (8.02 kB). View file

lcpfn/__pycache__/layer.cpython-310.pyc ADDED Viewed

Binary file (4.64 kB). View file

lcpfn/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (1.8 kB). View file

lcpfn/__pycache__/positional_encodings.cpython-310.pyc ADDED Viewed

Binary file (2.86 kB). View file

lcpfn/__pycache__/train.cpython-310.pyc ADDED Viewed

Binary file (13.5 kB). View file

lcpfn/__pycache__/train_lcpfn.cpython-310.pyc ADDED Viewed

Binary file (2.82 kB). View file

lcpfn/__pycache__/transformer.cpython-310.pyc ADDED Viewed

Binary file (8.04 kB). View file

lcpfn/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (10.7 kB). View file

lcpfn/bar_distribution.py ADDED Viewed

	@@ -0,0 +1,269 @@

+import torch
+from torch import nn
class BarDistribution(nn.Module):
    """Piecewise-constant ("bar") distribution over a bounded interval.

    The interval [borders[0], borders[-1]] is split into ``num_bars`` buckets.
    Logits over the buckets give the probability mass of each bucket (via
    softmax); within a bucket the density is constant.
    """

    def __init__(self, borders: torch.Tensor, smoothing=.0):
        """
        :param borders: 1-D, sorted tensor of bucket borders. It should start
            with min and end with max, where all values lie in (min, max).
        :param smoothing: Label-smoothing weight applied by ``forward`` in
            training mode.
        """
        super().__init__()
        assert len(borders.shape) == 1
        # Buffers follow the module across devices and into the state dict.
        self.register_buffer('borders', borders)
        self.register_buffer('smoothing', torch.tensor(smoothing))
        self.register_buffer('bucket_widths', self.borders[1:] - self.borders[:-1])
        full_width = self.bucket_widths.sum()
        border_order = torch.argsort(borders)
        assert (full_width - (self.borders[-1] - self.borders[0])).abs() < 1e-4, f'diff: {full_width - (self.borders[-1] - self.borders[0])}'
        assert (border_order == torch.arange(len(borders)).to(border_order.device)).all(), "Please provide sorted borders!"
        self.num_bars = len(borders) - 1

    def map_to_bucket_idx(self, y):
        """Return, for every value in ``y``, the index of the bucket containing it.

        Values equal to the first/last border are assigned to the first/last
        bucket. Values outside the borders yield indices outside
        ``[0, num_bars - 1]``.
        """
        target_sample = torch.searchsorted(self.borders, y) - 1
        target_sample[y == self.borders[0]] = 0
        target_sample[y == self.borders[-1]] = self.num_bars - 1
        return target_sample

    def forward(self, logits, y): # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
        """Negative log density of ``y`` under the distribution given by ``logits``.

        In training mode the result is blended with a smoothing term (mean
        negative log density over all buckets) using ``self.smoothing``.
        """
        target_sample = self.map_to_bucket_idx(y)
        assert (target_sample >= 0).all() and (target_sample < self.num_bars).all(), f'y {y} not in support set for borders (min_y, max_y) {self.borders}'
        assert logits.shape[-1] == self.num_bars, f'{logits.shape[-1]} vs {self.num_bars}'

        bucket_log_probs = torch.log_softmax(logits, -1)
        # Mass -> density: divide by the bucket width (subtract in log space).
        scaled_bucket_log_probs = bucket_log_probs - torch.log(self.bucket_widths)

        nll_loss = -scaled_bucket_log_probs.gather(-1,target_sample.unsqueeze(-1)).squeeze(-1)

        smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
        smoothing = self.smoothing if self.training else 0.
        loss = (1. - smoothing) * nll_loss + smoothing * smooth_loss
        return loss

    def mean(self, logits):
        """Expected value: probability-weighted average of the bucket midpoints."""
        bucket_means = self.borders[:-1] + self.bucket_widths/2
        p = torch.softmax(logits, -1)
        return p @ bucket_means

    def icdf(self, logits, left_prob):
        """
        Implementation of the quantile function
        :param logits: Tensor of any shape, with the last dimension being logits
        :param left_prob: float: The probability mass to the left of the result.
        :return: Position with `left_prob` probability weight to the left.
        """
        probs = logits.softmax(-1)
        cumprobs = torch.cumsum(probs, -1)
        # The query must share dtype (and device) with `cumprobs`:
        # torch.searchsorted rejects mismatching dtypes, e.g. float64 logits
        # against a default float32 query.
        query = left_prob * torch.ones(
            *cumprobs.shape[:-1], 1, dtype=cumprobs.dtype, device=probs.device
        )
        idx = torch.searchsorted(cumprobs, query)\
            .squeeze(-1).clamp(0, cumprobs.shape[-1] - 1)  # this might not do the right for outliers
        # Prepend 0 so that cumprobs[..., i] is the mass left of bucket i.
        cumprobs = torch.cat(
            [torch.zeros(*cumprobs.shape[:-1], 1, dtype=cumprobs.dtype, device=logits.device), cumprobs], -1
        )

        rest_prob = left_prob - cumprobs.gather(-1, idx[..., None]).squeeze(-1)
        left_border = self.borders[idx]
        right_border = self.borders[idx+1]
        # Linear interpolation inside the selected bucket.
        return left_border + (right_border - left_border) * rest_prob / probs.gather(-1, idx[..., None]).squeeze(-1)

    def quantile(self, logits, center_prob=.682):
        """Return the (lower, upper) bounds of the central ``center_prob`` interval, stacked on the last dim."""
        side_probs = (1.-center_prob)/2
        return torch.stack((self.icdf(logits, side_probs), self.icdf(logits, 1.-side_probs)),-1)

    def ucb(self, logits, best_f, rest_prob=(1-.682)/2, maximize=True):
        """
        UCB utility. Rest Prob is the amount of utility above (below) the confidence interval that is ignored.
        Higher rest_prob is equivalent to lower beta in the standard GP-UCB formulation.
        :param logits: Logits, as returned by the Transformer.
        :param best_f: Only here, since the other utilities have it.
        :param rest_prob: The amount of utility above (below) the confidence interval that is ignored.
        The default is equivalent to using GP-UCB with `beta=1`.
        To get the corresponding `beta`, where `beta` is from
        the standard GP definition of UCB `ucb_utility = mean + beta * std`,
        you can use this computation: `beta = math.sqrt(2)*torch.erfinv(torch.tensor(2*rest_prob-1))`.
        :param maximize:
        :return: utility
        """
        if maximize:
            rest_prob = 1 - rest_prob
        return self.icdf(logits, rest_prob)

    def mode(self, logits):
        """Return the midpoint of the bucket with the largest logit."""
        mode_inds = logits.argmax(-1)
        bucket_means = self.borders[:-1] + self.bucket_widths/2
        return bucket_means[mode_inds]

    def ei(self, logits, best_f, maximize=True): # logits: evaluation_points x batch x feature_dim
        """
        Acquisition Function: Expected Improvement
        :param logits: as returned by Transformer
        :param best_f: best evaluation so far (the incumbent)
        :param maximize: whether improvement means larger (True) or smaller values
        :return: utility
        """
        left, right = self.borders[:-1], self.borders[1:]
        # Computed without tracking gradients through best_f, as the original
        # list-based construction did.
        incumbent = torch.as_tensor(
            best_f, dtype=self.borders.dtype, device=self.borders.device
        ).detach()
        # Vectorised per-bucket contribution (same formula as the former
        # Python-level loop over buckets).
        if maximize:
            bucket_contributions = (
                (right + torch.maximum(left, incumbent)) / 2 - incumbent
            ).clamp(min=0)
        else:
            # min on max instead of max on min, and compare min < instead of max >
            bucket_contributions = -(
                (torch.minimum(right, incumbent) + left) / 2 - incumbent
            ).clamp(max=0)
        bucket_contributions = bucket_contributions.to(dtype=logits.dtype, device=logits.device)
        p = torch.softmax(logits, -1)
        return p @ bucket_contributions

    def pi(self, logits, best_f, maximize=True):# logits: evaluation_points x batch x feature_dim
        """
        Acquisition Function: Probability of Improvement
        :param logits: as returned by Transformer
        :param best_f: best evaluation so far (the incumbent)
        :param maximize: whether to maximize
        :return: utility
        """
        assert maximize is True
        p = torch.softmax(logits, -1)
        # Fraction of each bucket lying above the incumbent.
        factor = 1. - ((best_f - self.borders[:-1]) / self.bucket_widths).clamp(0., 1.)
        return (p * factor).sum(-1)

    def mean_of_square(self, logits):
        """
        Computes E[x^2].
        :param logits: Output of the model.
        """
        left_borders = self.borders[:-1]
        right_borders = self.borders[1:]
        bucket_mean_of_square = (left_borders.square() + right_borders.square() + left_borders*right_borders)/3.
        p = torch.softmax(logits, -1)
        return p @ bucket_mean_of_square

    def variance(self, logits):
        """Variance computed as E[x^2] - E[x]^2."""
        return self.mean_of_square(logits) - self.mean(logits).square()
class FullSupportBarDistribution(BarDistribution):
    """Bar distribution whose first and last buckets are modelled with half-normal densities."""

    @staticmethod
    def halfnormal_with_p_weight_before(range_max,p=.5):
        """Return a HalfNormal scaled so that its ``p``-quantile equals ``range_max``."""
        unit_quantile = torch.distributions.HalfNormal(torch.tensor(1.)).icdf(torch.tensor(p))
        scale = range_max / unit_quantile
        return torch.distributions.HalfNormal(scale)

    def forward(self, logits, y): # gives the negative log density (the _loss_), y: T x B, logits: T x B x self.num_bars
        """Negative log density of ``y``; values beyond the borders fall into the outer buckets."""
        assert self.num_bars > 1
        bucket_idx = self.map_to_bucket_idx(y)
        # Out-of-range targets are clamped into the first/last bucket.
        bucket_idx.clamp_(0,self.num_bars-1)
        assert logits.shape[-1] == self.num_bars

        scaled_bucket_log_probs = torch.log_softmax(logits, -1) - torch.log(self.bucket_widths)
        log_probs = scaled_bucket_log_probs.gather(-1,bucket_idx.unsqueeze(-1)).squeeze(-1)

        left_tail = self.halfnormal_with_p_weight_before(self.bucket_widths[0])
        right_tail = self.halfnormal_with_p_weight_before(self.bucket_widths[-1])

        in_first = bucket_idx == 0
        in_last = bucket_idx == self.num_bars-1

        # TODO look over it again
        # For the outer buckets: add the half-normal log density and add back
        # log(width), which cancels the width normalisation applied above.
        log_probs[in_first] += left_tail.log_prob((self.borders[1]-y[in_first]).clamp(min=.00000001)) + torch.log(self.bucket_widths[0])
        log_probs[in_last] += right_tail.log_prob(y[in_last]-self.borders[-2]) + torch.log(self.bucket_widths[-1])

        nll_loss = -log_probs

        smooth_loss = -scaled_bucket_log_probs.mean(dim=-1)
        smoothing = self.smoothing if self.training else 0.
        return (1. - smoothing) * nll_loss + smoothing * smooth_loss

    def mean(self, logits):
        """Expected value, using the half-normal means for the two outer buckets."""
        bucket_means = self.borders[:-1] + self.bucket_widths / 2
        left_tail = self.halfnormal_with_p_weight_before(self.bucket_widths[0])
        right_tail = self.halfnormal_with_p_weight_before(self.bucket_widths[-1])
        bucket_means[0] = -left_tail.mean + self.borders[1]
        bucket_means[-1] = right_tail.mean + self.borders[-2]
        p = torch.softmax(logits, -1)
        return p @ bucket_means
+def get_bucket_limits_(num_outputs:int, full_range:tuple=None, ys:torch.Tensor=None, verbose:bool=False):
+    assert (ys is not None) or (full_range is not None)
+    if ys is not None:
+        ys = ys.flatten()
+        if len(ys) % num_outputs: ys = ys[:-(len(ys) % num_outputs)]
+        print(f'Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys.')
+        ys_per_bucket = len(ys) // num_outputs
+        if full_range is None:
+            full_range = (ys.min(), ys.max())
+        else:
+            assert full_range[0] <= ys.min() and full_range[1] >= ys.max()
+            full_range = torch.tensor(full_range)
+        ys_sorted, ys_order = ys.sort(0)
+        bucket_limits = (ys_sorted[ys_per_bucket-1::ys_per_bucket][:-1]+ys_sorted[ys_per_bucket::ys_per_bucket])/2
+        if verbose:
+            print(f'Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys.')
+            print(full_range)
+        bucket_limits = torch.cat([full_range[0].unsqueeze(0), bucket_limits, full_range[1].unsqueeze(0)],0)
+    else:
+        class_width = (full_range[1] - full_range[0]) / num_outputs
+        bucket_limits = torch.cat([full_range[0] + torch.arange(num_outputs).float()*class_width, torch.tensor(full_range[1]).unsqueeze(0)], 0)
+    assert len(bucket_limits) - 1 == num_outputs and full_range[0] == bucket_limits[0] and full_range[-1] == bucket_limits[-1]
+    return bucket_limits
+def get_bucket_limits(
+    num_outputs: int,
+    full_range: tuple = None,
+    ys: torch.Tensor = None,
+    verbose: bool = False,
+):
+    assert (ys is None) != (
+        full_range is None
+    ), "Either full_range or ys must be passed."
+    if ys is not None:
+        ys = ys.flatten()
+        ys = ys[~torch.isnan(ys)]
+        if len(ys) % num_outputs:
+            ys = ys[: -(len(ys) % num_outputs)]
+        print(
+            f"Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys."
+        )
+        ys_per_bucket = len(ys) // num_outputs
+        if full_range is None:
+            full_range = (ys.min(), ys.max())
+        else:
+            assert (
+                full_range[0] <= ys.min() and full_range[1] >= ys.max()
+            ), f"full_range {full_range} not in range of ys {ys.min(), ys.max()}"
+            full_range = torch.tensor(full_range)
+        ys_sorted, ys_order = ys.sort(0)
+        bucket_limits = (
+            ys_sorted[ys_per_bucket - 1 :: ys_per_bucket][:-1]
+            + ys_sorted[ys_per_bucket::ys_per_bucket]
+        ) / 2
+        if verbose:
+            print(
+                f"Using {len(ys)} y evals to estimate {num_outputs} buckets. Cut off the last {len(ys) % num_outputs} ys."
+            )
+            print(full_range)
+        bucket_limits = torch.cat(
+            [full_range[0].unsqueeze(0), bucket_limits, full_range[1].unsqueeze(0)], 0
+        )
+    else:
+        class_width = (full_range[1] - full_range[0]) / num_outputs
+        bucket_limits = torch.cat(
+            [
+                full_range[0] + torch.arange(num_outputs).float() * class_width,
+                torch.tensor(full_range[1]).unsqueeze(0),
+            ],
+            0,
+        )
+    assert (
+        len(bucket_limits) - 1 == num_outputs
+    ), f"len(bucket_limits) - 1 == {len(bucket_limits) - 1} != {num_outputs} == num_outputs"
+    assert full_range[0] == bucket_limits[0], f"{full_range[0]} != {bucket_limits[0]}"
+    assert (
+        full_range[-1] == bucket_limits[-1]
+    ), f"{full_range[-1]} != {bucket_limits[-1]}"
+    return bucket_limits

lcpfn/curves.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import numpy as np
+from collections import OrderedDict
+prior = {
+    "pow3": {
+        "uniform": OrderedDict(
+            a={"type": "uniform", "param1": -1, "param2": 1},
+            c={"type": "uniform", "param1": 0, "param2": 1},
+            alpha={"type": "uniform", "param1": 0, "param2": 1},
+        ),
+        "peaked": OrderedDict(
+            a={"type": "uniform", "param1": -0.6, "param2": 0.6},
+            c={"type": "uniform", "param1": 0, "param2": 1.25},
+            alpha={"type": "log_normal", "param1": 0, "param2": 2},
+        ),
+    },
+    "ilog2": {
+        "uniform": OrderedDict(
+            c={"type": "uniform", "param1": 0, "param2": 1},
+            a={"type": "uniform", "param1": -1, "param2": 1},
+        ),
+        "peaked": OrderedDict(
+            c={"type": "uniform", "param1": 0, "param2": 1},
+            a={"type": "uniform", "param1": -0.5, "param2": 0.5},
+        ),
+    },
+    "janoschek": {
+        "uniform": OrderedDict(
+            a={"type": "uniform", "param1": 0, "param2": 1},
+            beta={"type": "uniform", "param1": 0, "param2": 2},
+            k={"type": "uniform", "param1": 0, "param2": 1},
+            delta={"type": "uniform", "param1": -5, "param2": 5},
+        ),
+        "peaked": OrderedDict(
+            a={"type": "uniform", "param1": 0, "param2": 1},
+            beta={"type": "uniform", "param1": 0, "param2": 2},
+            k={"type": "log_normal", "param1": -2, "param2": 1},
+            delta={"type": "log_normal", "param1": 0, "param2": 0.5},
+        ),
+    },
+}
+def prior_sampler(rng, type, param1, param2):
+    if type == "uniform":
+        return rng.uniform(param1, param2)
+    elif type == "log_normal":
+        return rng.lognormal(param1, param2)
+    raise Exception("Unknown prior type: {}".format(type))
+def pow3(x, c, a, alpha):
+    return c - a * (x) ** (-alpha)
+def prior_pow3(rng):
+    return {
+        p: prior_sampler(
+            rng,
+            prior["pow3"]["peaked"][p]["type"],
+            param1=prior["pow3"]["peaked"][p]["param1"],
+            param2=prior["pow3"]["peaked"][p]["param2"],
+        )
+        for p in ["a", "c", "alpha"]
+    }
+def uniform_prior_pow3(rng):
+    return {
+        p: prior_sampler(
+            rng,
+            prior["pow3"]["uniform"][p]["type"],
+            param1=prior["pow3"]["uniform"][p]["param1"],
+            param2=prior["pow3"]["uniform"][p]["param2"],
+        )
+        for p in ["a", "c", "alpha"]
+    }
+def ilog2(x, c, a):
+    return c - a / (np.log(x + 1))
+def prior_ilog2(rng):
+    return {
+        p: prior_sampler(
+            rng,
+            prior["ilog2"]["peaked"][p]["type"],
+            param1=prior["ilog2"]["peaked"][p]["param1"],
+            param2=prior["ilog2"]["peaked"][p]["param2"],
+        )
+        for p in ["a", "c"]
+    }
+def uniform_prior_ilog2(rng):
+    return {
+        p: prior_sampler(
+            rng,
+            prior["ilog2"]["uniform"][p]["type"],
+            param1=prior["ilog2"]["uniform"][p]["param1"],
+            param2=prior["ilog2"]["uniform"][p]["param2"],
+        )
+        for p in ["a", "c"]
+    }
+def janoschek(x, a, beta, k, delta):
+    """
+    http://www.pisces-conservation.com/growthhelp/janoschek.htm
+    """
+    return a - (a - beta) * np.exp(-k * x**delta)
+def prior_janoschek(rng):
+    return {
+        p: prior_sampler(
+            rng,
+            prior["janoschek"]["peaked"][p]["type"],
+            param1=prior["janoschek"]["peaked"][p]["param1"],
+            param2=prior["janoschek"]["peaked"][p]["param2"],
+        )
+        for p in ["a", "beta", "k", "delta"]
+    }
+def uniform_prior_janoschek(rng):
+    return {
+        p: prior_sampler(
+            rng,
+            prior["janoschek"]["uniform"][p]["type"],
+            param1=prior["janoschek"]["uniform"][p]["param1"],
+            param2=prior["janoschek"]["uniform"][p]["param2"],
+        )
+        for p in ["a", "beta", "k", "delta"]
+    }
+def log_power(x, a, b, c):
+    # a: upper bound
+    # c: growth rate
+    # initial = a/ (1 + (1/e^b)^c
+    return a / (1.0 + (x / np.exp(b)) ** c)
+def prior_log_power(rng):
+    # a ~ N(0.8,0.1)
+    # b ~ N(1,1)
+    # c ~ U(-3,0)
+    a = rng.normal(0.8, 0.1)
+    b = rng.normal(1.0, 1.0)
+    c = rng.uniform(-3.0, 0.0)
+    return {"a": a, "b": b, "c": c}
+def weibull(x, alpha, beta, kappa, delta):
+    """
+    Weibull modell
+    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
+    alpha: upper asymptote
+    beta: lower asymptote
+    k: growth rate
+    delta: controls the x-ordinate for the point of inflection
+    """
+    return alpha - (alpha - beta) * np.exp(-((kappa * x) ** delta))
+def prior_weibull(rng):
+    alpha = rng.uniform(0.0, 1.5)
+    beta = rng.uniform(0.0, 1)
+    kappa = np.exp(rng.normal(-2.0, 1.0))
+    delta = np.exp(rng.normal(0, 0.5))
+    return {"alpha": alpha, "beta": beta, "kappa": kappa, "delta": delta}
+def mmf(x, alpha, beta, kappa, delta):
+    """
+    Morgan-Mercer-Flodin
+    description:
+    Nonlinear Regression page 342
+    http://bit.ly/1jodG17
+    http://www.pisces-conservation.com/growthhelp/index.html?morgan_mercer_floden.htm
+    alpha: upper asymptote
+    kappa: growth rate
+    beta: initial value
+    delta: controls the point of inflection
+    """
+    return alpha - (alpha - beta) / (1.0 + (kappa * x) ** delta)
+def prior_mmf(rng):
+    # alpha ~ N(0.8,0.1)
+    # beta ~ N(0.2,0.1)
+    # ln(kappa) ~ N(0,2)
+    # ln(delta) ~ N(0,1)
+    alpha = rng.normal(0.8, 0.1)
+    beta = rng.normal(0.2, 0.1)
+    kappa = np.exp(rng.normal(0, 2))
+    delta = np.exp(rng.normal(0, 1))
+    return {"alpha": alpha, "beta": beta, "kappa": kappa, "delta": delta}
+def vap(x, a, b, c):
+    """Vapor pressure model"""
+    # no upper bound if c > 0
+    # a = ln(upper bound) for c=0
+    # a+b = ln(initial)
+    return np.exp(a + b / x + c * np.log(x))
+def prior_vap(rng):
+    a = rng.uniform(-2.0, 0.0)  # @heri: range check
+    b = rng.uniform(-4.0, 0.0)  # @heri: range check
+    c = np.exp(rng.uniform(-8.0, 0.0))  # @heri: same as weights
+    return {"a": a, "b": b, "c": c}
+def loglog_linear(x, a, b):
+    x = np.log(x)
+    return np.log(a * x + b)
+def prior_loglog_linear(rng):
+    # ln(a) ~ N(-2, 1)
+    # ln(b) ~ U(0, 1)
+    a = np.exp(rng.normal(-2.0, 1.0))
+    b = np.exp(rng.uniform(0.0, 1.0))
+    return {"a": a, "b": b}
+def exp4(x, c, a, b, alpha):
+    return c - np.exp(-a * (x**alpha) + b)
+def prior_exp4(rng):
+    # c ~ N(0.8,0.1)
+    c = rng.normal(0.8, 0.1)
+    # ln(a) ~ N(-2,1)
+    a = np.exp(rng.normal(-2, 1))
+    # ln(alpha) ~ N(0,1)
+    alpha = np.exp(rng.normal(0, 1))
+    # ln(b) ~ N(0,0.5)
+    b = np.exp(rng.normal(0, 0.5))
+    return {"a": a, "b": b, "c": c, "alpha": alpha}
+def pow4(x, c, a, b, alpha):
+    return c - (a * x + b) ** -alpha
+def prior_pow4(rng):
+    # ln(1 - c) ~ U(-5, 0)
+    c = 1 - np.exp(rng.uniform(-5.0, 0))
+    # ln(a) ~ N(-3, 2)
+    a = np.exp(rng.normal(-3.0, 2))
+    # ln(alpha) ~ N(0,1)
+    alpha = np.exp(rng.normal(0, 1))
+    # ln(b) ~ U(0, 1)
+    b = np.exp(rng.uniform(0, 1))
+    return {"a": a, "b": b, "c": c, "alpha": alpha}
+def dr_hill_zero_background(x, theta, eta, kappa):
+    # theta: upper bound
+    # eta: growth rate
+    # initial = theta/(kappa^eta + 1)
+    return (theta * x**eta) / (kappa**eta + x**eta)
+def prior_dr_hill_zero_background(rng):
+    # theta ~ U(1,0) N(0.8,0.1)
+    # ln(eta) ~ N(1,1)
+    # ln(kappa) ~ N(1,2)
+    theta = rng.normal(0.8, 0.1)
+    eta = np.exp(rng.normal(1.0, 1.0))
+    kappa = np.exp(rng.normal(1.0, 2.0))
+    return {"theta": theta, "eta": eta, "kappa": kappa}

lcpfn/decoders.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torch
+from torch import nn
+import random
+class ScaledDecoder(nn.Module):
+    def __init__(self, ninp, nhid, nout):
+        super().__init__()
+        self.linear = nn.Linear(ninp, nhid)
+        self.linear1 = nn.Linear(nhid, nout)
+        self.linear2 = nn.Linear(nhid, 10)
+    def forward(self, x):
+        #return torch.cat([self.linear1(x), self.linear2(x)], -1)
+        x = self.linear(x)
+        x = nn.GELU()(x)
+        temps = self.linear2(x).softmax(-1) @ torch.tensor([1.,1.4,1.7,2.,5.,10.,20.,40.,80.,160.], device=x.device)
+        if random.random() > .99:
+            print(temps.shape,temps[:,:2])
+        return self.linear1(x) / temps.unsqueeze(-1)
+class FixedScaledDecoder(nn.Module):
+    def __init__(self, ninp, nhid, nout):
+        super().__init__()
+        self.mapper = nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, nout))
+        self.T = nn.Parameter(torch.ones(10000)/10000)
+    def forward(self, x):
+        return self.mapper(x)/self.T.sum()

lcpfn/domhan_prior.py ADDED Viewed

	@@ -0,0 +1,195 @@

+from functools import partial
+import torch
+import numpy as np
+from lcpfn.curves import (
+    pow3,
+    ilog2,
+    janoschek,
+    log_power,
+    prior_ilog2,
+    uniform_prior_pow3,
+    weibull,
+    mmf,
+    vap,
+    loglog_linear,
+    exp4,
+    pow4,
+    dr_hill_zero_background,
+)
+from lcpfn.curves import (
+    prior_pow3,
+    prior_janoschek,
+    prior_log_power,
+    prior_weibull,
+    prior_mmf,
+    prior_vap,
+    prior_loglog_linear,
+    prior_exp4,
+    prior_pow4,
+    prior_dr_hill_zero_background,
+)
+from lcpfn.curves import (
+    uniform_prior_pow3,
+    uniform_prior_ilog2,
+    uniform_prior_janoschek,
+)
+def prior_weights(
+    rng,
+    components=[
+        "pow3",
+        "ilog2",
+        "janoschek",
+        "log_power",
+        "weibull",
+        "mmf",
+        "vap",
+        "loglog_linear",
+        "exp4",
+        "pow4",
+        "dr_hill_zero_background",
+    ],
+):
+    K = len(components)
+    weights = rng.uniform(0.0, 1, size=(K,))
+    return {f: weights[i] for i, f in enumerate(components)}
+def sample_from_prior(rng, seq_len=100):
+    return sample_prior_comb(
+        rng=rng, seq_len=seq_len, components=["pow3", "ilog2", "janoschek"], distribution="peaked"
+    )
+def sample_prior_comb(
+    rng,
+    components,
+    distribution,
+    var_lnloc=-4,
+    var_lnscale=1,
+    range_constraint=True,
+    seq_len=100,
+):
+    f_components = {
+        "pow3": pow3,
+        "ilog2": ilog2,
+        "janoschek": janoschek,
+        "log_power": log_power,
+        "weibull": weibull,
+        "mmf": mmf,
+        "vap": vap,
+        "loglog_linear": loglog_linear,
+        "exp4": exp4,
+        "pow4": pow4,
+        "dr_hill_zero_background": dr_hill_zero_background,
+    }
+    if distribution == "peaked":
+        f_priors = {
+            "pow3": prior_pow3,
+            "ilog2": prior_ilog2,
+            "janoschek": prior_janoschek,
+            "log_power": prior_log_power,
+            "weibull": prior_weibull,
+            "mmf": prior_mmf,
+            "vap": prior_vap,
+            "loglog_linear": prior_loglog_linear,
+            "exp4": prior_exp4,
+            "pow4": prior_pow4,
+            "dr_hill_zero_background": prior_dr_hill_zero_background,
+        }
+    elif distribution == "uniform":
+        f_priors = {
+            "pow3": uniform_prior_pow3,
+            "ilog2": uniform_prior_ilog2,
+            "janoschek": uniform_prior_janoschek
+        }
+    else:
+        raise NotImplemented()
+    x = np.arange(1, seq_len + 1)
+    while True:
+        # sample the noiseless curve
+        weights = prior_weights(rng, components=components)
+        y = np.zeros(x.shape, dtype="float")
+        kwargs = 0
+        for f, w in weights.items():
+            kwargs = f_priors[f](rng)
+            # print(f_components[f](x, **kwargs))
+            y += w * f_components[f](x, **kwargs)
+        # add noise (can exceed [0,1], but afaik no way to implement this prior in Tobis work)
+        var = np.exp(
+            rng.normal(var_lnloc, var_lnscale)
+        )  # @heri: ln_prob =+ log(normal.pdf(log(var), loc=var_lnloc, scale=var_lnscale))
+        # reject any curves that are non-increasing, exceed the [0,1] range
+        if (
+            y[-1] <= y[0]
+            or (range_constraint and (np.any(y < 0) or np.any(y > 1)))
+            or np.isnan(y).any()
+        ):
+            continue
+        else:
+            break
+    def curve():  # generates a sample from the same model, but with independent noise
+        y_noisy = y + rng.normal(np.zeros_like(y), var)
+        return y, y_noisy
+    return curve
+def generate_prior_dataset(n, prior=sample_prior_comb, seed=42):
+    """
+    Returns a fixed sample from the prior (with fixed seq_len) as an n x seq_len np.ndarray
+    """
+    rng = np.random.RandomState(seed)
+    prior_data = np.stack([prior(rng)()[1] for _ in range(n)])
+    return prior_data
+def create_get_batch_func(prior):
+    return partial(get_batch_domhan, prior=prior)
+# function producing batches for PFN training
+def get_batch_domhan(
+    batch_size,
+    seq_len,
+    num_features,
+    prior,
+    device="cpu",
+    noisy_target=True,
+    **_,
+):
+    assert num_features == 1
+    x = np.arange(1, seq_len + 1)
+    y_target = np.empty((batch_size, seq_len), dtype=float)
+    y_noisy = np.empty((batch_size, seq_len), dtype=float)
+    for i in range(batch_size):
+        curve_func = prior(np.random, seq_len=seq_len)  # uses numpy rng
+        if noisy_target:
+            _, y_noisy[i] = curve_func()
+            y_target[i] = y_noisy[i]
+        else:
+            y_target[i], y_noisy[i] = curve_func()
+    # turn numpy arrays into correctly shaped torch tensors & move them to device
+    x = (
+        torch.arange(1, seq_len + 1)
+        .repeat((num_features, batch_size, 1))
+        .transpose(2, 0)
+        .to(device)
+    )
+    y_target = torch.from_numpy(y_target).transpose(1, 0).to(device)
+    y_noisy = torch.from_numpy(y_noisy).transpose(1, 0).to(device)
+    # changes
+    x = x.float()
+    y_target = y_target.float()
+    y_noisy = y_noisy.float()
+    return x, y_noisy, y_target

lcpfn/encoders.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import math
+import torch
+import torch.nn as nn
+from lcpfn.utils import normalize_data
+import torch.nn.functional as F
+from torch.nn import TransformerEncoder, TransformerEncoderLayer
+class StyleEncoder(nn.Module):
+    def __init__(self, em_size, hyperparameter_definitions):
+        super().__init__()
+        self.em_size = em_size
+        self.embedding = nn.Linear(hyperparameter_definitions.shape[0], self.em_size)
+    def forward(self, hyperparameters):  # T x B x num_hps
+        return self.embedding(hyperparameters)
+class _PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        self.d_model = d_model
+        self.device_test_tensor = nn.Parameter(torch.tensor(1.))
+    def forward(self, x):# T x B x num_features
+        assert self.d_model % x.shape[-1]*2 == 0
+        d_per_feature = self.d_model // x.shape[-1]
+        pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
+        #position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        interval_size = 10
+        div_term = (1./interval_size) * 2*math.pi*torch.exp(torch.arange(0, d_per_feature, 2, device=self.device_test_tensor.device).float()*math.log(math.sqrt(2)))
+        #print(div_term/2/math.pi)
+        pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
+        pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
+        return self.dropout(pe).view(x.shape[0],x.shape[1],self.d_model)
+Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)
+class EmbeddingEncoder(nn.Module):
+    def __init__(self, num_features, em_size, num_embs=100):
+        super().__init__()
+        self.num_embs = num_embs
+        self.embeddings = nn.Embedding(num_embs * num_features, em_size, max_norm=True)
+        self.init_weights(.1)
+        self.min_max = (-2,+2)
+    @property
+    def width(self):
+        return self.min_max[1] - self.min_max[0]
+    def init_weights(self, initrange):
+        self.embeddings.weight.data.uniform_(-initrange, initrange)
+    def discretize(self, x):
+        split_size = self.width / self.num_embs
+        return (x - self.min_max[0] // split_size).int().clamp(0, self.num_embs - 1)
+    def forward(self, x):  # T x B x num_features
+        x_idxs = self.discretize(x)
+        x_idxs += torch.arange(x.shape[-1], device=x.device).view(1, 1, -1) * self.num_embs
+        # print(x_idxs,self.embeddings.weight.shape)
+        return self.embeddings(x_idxs).mean(-2)
+class Normalize(nn.Module):
+    def __init__(self, mean, std):
+        super().__init__()
+        self.mean = mean
+        self.std = std
+    def forward(self, x):
+        return (x-self.mean)/self.std
+def get_normalized_uniform_encoder(encoder_creator):
+    """
+    This can be used to wrap an encoder that is fed uniform samples in [0,1] and normalizes these to 0 mean and 1 std.
+    For example, it can be used as `encoder_creator = get_normalized_uniform_encoder(encoders.Linear)`, now this can
+    be initialized with `encoder_creator(feature_dim, in_dim)`.
+    :param encoder:
+    :return:
+    """
+    return lambda in_dim, out_dim: nn.Sequential(Normalize(.5, math.sqrt(1/12)), encoder_creator(in_dim, out_dim))
+Linear = nn.Linear
+MLP = lambda num_features, emsize: nn.Sequential(nn.Linear(num_features+1,emsize*2),
+                                                 nn.ReLU(),
+                                                 nn.Linear(emsize*2,emsize))
+class NanHandlingEncoder(nn.Module):
+    def __init__(self, num_features, emsize, keep_nans=True):
+        super().__init__()
+        self.num_features = 2 * num_features if keep_nans else num_features
+        self.emsize = emsize
+        self.keep_nans = keep_nans
+        self.layer = nn.Linear(self.num_features, self.emsize)
+    def forward(self, x):
+        if self.keep_nans:
+            x = torch.cat([torch.nan_to_num(x, nan=0.0), normalize_data(torch.isnan(x) * -1
+                                                          + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
+                                                          + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2
+                                                          )], -1)
+        else:
+            x = torch.nan_to_num(x, nan=0.0)
+        return self.layer(x)
+class Linear(nn.Linear):
+    def __init__(self, num_features, emsize):
+        super().__init__(num_features, emsize)
+        self.num_features = num_features
+        self.emsize = emsize
+    def forward(self, x):
+        x = torch.nan_to_num(x, nan=0.0)
+        return super().forward(x)
+class Conv(nn.Module):
+    def __init__(self, input_size, emsize):
+        super().__init__()
+        self.convs = torch.nn.ModuleList([nn.Conv2d(64 if i else 1, 64, 3) for i in range(5)])
+        self.linear = nn.Linear(64,emsize)
+    def forward(self, x):
+        size = math.isqrt(x.shape[-1])
+        assert size*size == x.shape[-1]
+        x = x.reshape(*x.shape[:-1], 1, size, size)
+        for conv in self.convs:
+            if x.shape[-1] < 4:
+                break
+            x = conv(x)
+            x.relu_()
+        x = nn.AdaptiveAvgPool2d((1,1))(x).squeeze(-1).squeeze(-1)
+        return self.linear(x)
+class CanEmb(nn.Embedding):
+    def __init__(self, num_features, num_embeddings: int, embedding_dim: int, *args, **kwargs):
+        assert embedding_dim % num_features == 0
+        embedding_dim = embedding_dim // num_features
+        super().__init__(num_embeddings, embedding_dim, *args, **kwargs)
+    def forward(self, x):
+        lx = x.long()
+        assert (lx == x).all(), "CanEmb only works with tensors of whole numbers"
+        x = super().forward(lx)
+        return x.view(*x.shape[:-2], -1)
+def get_Canonical(num_classes):
+    return lambda num_features, emsize: CanEmb(num_features, num_classes, emsize)
+def get_Embedding(num_embs_per_feature=100):
+    return lambda num_features, emsize: EmbeddingEncoder(num_features, emsize, num_embs=num_embs_per_feature)

lcpfn/initializers.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from torch import nn
+def get_NormalInitializer(std):
+    def initializer(m):
+        if isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, 0, std)
+            nn.init.normal_(m.bias, 0, std)
+    return initializer

lcpfn/layer.py ADDED Viewed

	@@ -0,0 +1,126 @@

+from functools import partial
+from typing import Optional
+from torch import Tensor
+from torch import nn
+from torch.nn.modules.transformer import *
+from torch.nn.modules.transformer import _get_activation_fn
+from torch.utils.checkpoint import checkpoint
+class TransformerEncoderLayer(nn.Module):
+    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
+    This standard encoder layer is based on the paper "Attention Is All You Need".
+    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
+    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
+    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
+    in a different way during application.
+    Args:
+        d_model: the number of expected features in the input (required).
+        nhead: the number of heads in the multiheadattention models (required).
+        dim_feedforward: the dimension of the feedforward network model (default=2048).
+        dropout: the dropout value (default=0.1).
+        activation: the activation function of intermediate layer, relu or gelu (default=relu).
+        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False``.
+    Examples::
+        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+        >>> src = torch.rand(10, 32, 512)
+        >>> out = encoder_layer(src)
+    Alternatively, when ``batch_first`` is ``True``:
+        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
+        >>> src = torch.rand(32, 10, 512)
+        >>> out = encoder_layer(src)
+    """
+    __constants__ = ['batch_first']
+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
+                 layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
+                 device=None, dtype=None, recompute_attn=False) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
+                                            **factory_kwargs)
+        # Implementation of Feedforward model
+        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
+        self.dropout = Dropout(dropout)
+        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)
+        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
+        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
+        self.dropout1 = Dropout(dropout)
+        self.dropout2 = Dropout(dropout)
+        self.pre_norm = pre_norm
+        self.recompute_attn = recompute_attn
+        self.activation = _get_activation_fn(activation)
+    def __setstate__(self, state):
+        if 'activation' not in state:
+            state['activation'] = F.relu
+        super().__setstate__(state)
+    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
+        r"""Pass the input through the encoder layer.
+        Args:
+            src: the sequence to the encoder layer (required).
+            src_mask: the mask for the src sequence (optional).
+            src_key_padding_mask: the mask for the src keys per batch (optional).
+        Shape:
+            see the docs in Transformer class.
+        """
+        if self.pre_norm:
+            src_ = self.norm1(src)
+        else:
+            src_ = src
+        if isinstance(src_mask, tuple):
+            # global attention setup
+            assert not self.self_attn.batch_first
+            assert src_key_padding_mask is None
+            global_src_mask, trainset_src_mask, valset_src_mask = src_mask
+            num_global_tokens = global_src_mask.shape[0]
+            num_train_tokens = trainset_src_mask.shape[0]
+            global_tokens_src = src_[:num_global_tokens]
+            train_tokens_src = src_[num_global_tokens:num_global_tokens+num_train_tokens]
+            global_and_train_tokens_src = src_[:num_global_tokens+num_train_tokens]
+            eval_tokens_src = src_[num_global_tokens+num_train_tokens:]
+            attn = partial(checkpoint, self.self_attn) if self.recompute_attn else self.self_attn
+            global_tokens_src2 = attn(global_tokens_src, global_and_train_tokens_src, global_and_train_tokens_src, None, True, global_src_mask)[0]
+            train_tokens_src2 = attn(train_tokens_src, global_tokens_src, global_tokens_src, None, True, trainset_src_mask)[0]
+            eval_tokens_src2 = attn(eval_tokens_src, src_, src_,
+                                    None, True, valset_src_mask)[0]
+            src2 = torch.cat([global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0)
+        else:
+            if self.recompute_attn:
+                src2 = checkpoint(self.self_attn, src_, src_, src_, src_key_padding_mask, True, src_mask)[0]
+            else:
+                src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask,
+                                      key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        if not self.pre_norm:
+            src = self.norm1(src)
+        if self.pre_norm:
+            src_ = self.norm2(src)
+        else:
+            src_ = src
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src_))))
+        src = src + self.dropout2(src2)
+        if not self.pre_norm:
+            src = self.norm2(src)
+        return src

lcpfn/model.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import torch
+import lcpfn
+class LCPFN(torch.nn.Module):
+    def __init__(self, model_name="EMSIZE512_NLAYERS12_NBUCKETS1000"):
+        super(LCPFN, self).__init__()
+        self.model = torch.load(getattr(lcpfn, model_name) if model_name in lcpfn.model_dict else model_name)
+        self.model.eval()
+    @torch.no_grad()
+    def predict_mean(self, x_train, y_train, x_test):
+        logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
+        return self.model.criterion.mean(logits)
+    @torch.no_grad()
+    def predict_quantiles(self, x_train, y_train, x_test, qs):
+        logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
+        return torch.cat([self.model.criterion.icdf(logits, q) for q in qs], dim=1)
+    @torch.no_grad()
+    def nll_loss(self, x_train, y_train, x_test, y_test):
+        logits = self(x_train=x_train, y_train=y_train, x_test=x_test)
+        return self.model.criterion(logits, y_test)
+    def forward(self, x_train, y_train, x_test):
+        single_eval_pos = x_train.shape[0]
+        x = torch.cat([x_train, x_test], dim=0).unsqueeze(1)
+        y = y_train.unsqueeze(1)
+        return self.model((x, y), single_eval_pos=single_eval_pos)

lcpfn/positional_encodings.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import math
+import torch
+from torch import nn
+# Protocol for positional encodings.
+# __init__(d_model, max_len=..[, more optionals])
+# forward(x: (seq_len, bs, d_model)) -> Tensor of shape (*x.shape[:2],d_model) containing pos. embeddings
+class NoPositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=None):
+        super(NoPositionalEncoding, self).__init__()
+        pass
+    def forward(self, x):
+        return x #* math.sqrt(x.shape[-1])
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=5000):
+        super(PositionalEncoding, self).__init__()
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+        self.register_buffer('pe', pe)
+    def forward(self, x):
+        x = self.pe[:x.size(0), :] + x # * math.sqrt(x.shape[-1])
+        return x
+class LearnedPositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=5000):
+        super(LearnedPositionalEncoding, self).__init__()
+        self.max_seq_len = max_len
+        #self.positional_embeddings = nn.Embedding(max_len, d_model)
+        self.positional_embeddings = nn.Parameter(torch.empty(max_len, d_model))
+        nn.init.normal_(self.positional_embeddings, mean=0, std=d_model ** -0.5)
+    def forward(self, x):
+        seq_len, bs, d_model = x.shape
+        assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
+        pos_emb = self.positional_embeddings[:seq_len]
+        return pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x #* math.sqrt(x.shape[-1])
+class PairedScrambledPositionalEncodings(LearnedPositionalEncoding):
+    # TODO check whether it is a problem to use the same perm. for full batch
+    def forward(self, x):
+        seq_len, bs, d_model = x.shape
+        assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
+        assert len(self.positional_embeddings) % 2 == 0, 'Please specify an even max_len.'
+        paired_embs = self.positional_embeddings.view(len(self.positional_embeddings), -1, 2)
+        pos_emb = paired_embs[torch.randperm(len(paired_embs))].view(*self.positional_embeddings.shape)[:seq_len]
+        return pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x #* math.sqrt(x.shape[-1])

lcpfn/priors/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from . import gp, ridge

lcpfn/priors/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (216 Bytes). View file

lcpfn/priors/__pycache__/gp.cpython-310.pyc ADDED Viewed

Binary file (2.17 kB). View file

lcpfn/priors/__pycache__/prior.cpython-310.pyc ADDED Viewed

Binary file (1.11 kB). View file

lcpfn/priors/__pycache__/ridge.cpython-310.pyc ADDED Viewed

Binary file (1.44 kB). View file

lcpfn/priors/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (6.26 kB). View file

lcpfn/priors/binarized_regression.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from . import fast_gp, fast_gp_mix
+from .utils import get_batch_to_dataloader
+def regression_prior_to_binary(get_batch_function):
+    def binarized_get_batch_function(*args, assert_on=False, **kwargs):
+        x, y, target_y = get_batch_function(*args, **kwargs)
+        if assert_on:
+            assert y is target_y, "y == target_y is assumed by this function"
+        y = y.sigmoid().bernoulli()
+        return x, y, y
+    return binarized_get_batch_function
+# Data loaders for the binarized priors: the get_batch functions of fast_gp and
+# fast_gp_mix are wrapped by regression_prior_to_binary and then passed to
+# get_batch_to_dataloader.
+Binarized_fast_gp_dataloader = get_batch_to_dataloader(regression_prior_to_binary(fast_gp.get_batch))
+Binarized_fast_gp_mix_dataloader = get_batch_to_dataloader(regression_prior_to_binary(fast_gp_mix.get_batch))

lcpfn/priors/fast_gp.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import time
+import torch
+from torch import nn
+import gpytorch
+from .utils import get_batch_to_dataloader
+from utils import default_device
+# We will use the simplest form of GP model, exact inference
+class ExactGPModel(gpytorch.models.ExactGP):
+    def __init__(self, train_x, train_y, likelihood):
+        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+        self.mean_module = gpytorch.means.ConstantMean()
+        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+    def forward(self, x):
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+def get_model(x, y, hyperparameters):
+    likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.GreaterThan(1.e-9))
+    model = ExactGPModel(x, y, likelihood)
+    model.likelihood.noise = torch.ones_like(model.likelihood.noise) * hyperparameters["noise"]
+    model.covar_module.outputscale = torch.ones_like(model.covar_module.outputscale) * hyperparameters["outputscale"]
+    model.covar_module.base_kernel.lengthscale = torch.ones_like(model.covar_module.base_kernel.lengthscale) * \
+                                                 hyperparameters["lengthscale"]
+    return model, likelihood
+@torch.no_grad()
+def get_batch(batch_size, seq_len, num_features, device=default_device, hyperparameters=None,
+              equidistant_x=False, fix_x=None, **kwargs):
+    if isinstance(hyperparameters, (tuple, list)):
+        hyperparameters = {"noise": hyperparameters[0]
+            , "outputscale": hyperparameters[1]
+            , "lengthscale": hyperparameters[2]
+            , "is_binary_classification": hyperparameters[3]
+            # , "num_features_used": hyperparameters[4]
+            , "normalize_by_used_features": hyperparameters[5]
+            , "order_y": hyperparameters[6]
+            , "sampling": hyperparameters[7]
+                           }
+    elif hyperparameters is None:
+        hyperparameters = {"noise": .1, "outputscale": .1, "lengthscale": .1}
+    if 'verbose' in hyperparameters and hyperparameters['verbose']:
+        print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
+                  , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size, 'sampling': hyperparameters['sampling']})
+    # hyperparameters = {k: hyperparameters[k]() if callable(hyperparameters[k]) else hyperparameters[k] for k in
+    #      hyperparameters.keys()}
+    assert not (equidistant_x and (fix_x is not None))
+    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations', (True, True, True))):
+        if equidistant_x:
+            assert num_features == 1
+            x = torch.linspace(0, 1., seq_len).unsqueeze(0).repeat(batch_size, 1).unsqueeze(-1)
+        elif fix_x is not None:
+            assert fix_x.shape == (seq_len, num_features)
+            x = fix_x.unsqueeze(0).repeat(batch_size, 1, 1).to(device)
+        else:
+            if hyperparameters.get('sampling','uniform') == 'uniform':
+                x = torch.rand(batch_size, seq_len, num_features, device=device)
+            else:
+                x = torch.randn(batch_size, seq_len, num_features, device=device)
+        model, likelihood = get_model(x, torch.Tensor(), hyperparameters)
+        model.to(device)
+        # trained_model = ExactGPModel(train_x, train_y, likelihood).cuda()
+        # trained_model.eval()
+        successful_sample = False
+        while not successful_sample:
+            try:
+                with gpytorch.settings.prior_mode(True):
+                    model, likelihood = get_model(x, torch.Tensor(), hyperparameters)
+                    model.to(device)
+                    d = model(x)
+                    sample_wo_noise = d.sample().transpose(0, 1)  # this will be the target for the loss
+                    sample = likelihood(sample_wo_noise).sample()  # this will be the input to the Transformer
+                    successful_sample = True
+            except RuntimeError: # This can happen when torch.linalg.eigh fails. Restart with new init resolves this.
+                print('GP Sampling unsuccessful, retrying.. ')
+                print(x)
+                print(hyperparameters)
+    if bool(torch.any(torch.isnan(x)).detach().cpu().numpy()):
+        print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
+                  , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size})
+    # TODO: Multi output
+    return x.transpose(0, 1), sample, sample if hyperparameters.get("observation_noise", True) else sample_wo_noise
+# Data loader for this prior, built by passing get_batch to get_batch_to_dataloader.
+DataLoader = get_batch_to_dataloader(get_batch)
+def get_model_on_device(x,y,hyperparameters,device):
+    model, likelihood = get_model(x, y, hyperparameters)
+    model.to(device)
+    return model, likelihood
+@torch.no_grad()
+def evaluate(x, y, y_non_noisy, use_mse=False, hyperparameters={}, get_model_on_device=get_model_on_device, device=default_device, step_size=1, start_pos=0):
+    start_time = time.time()
+    losses_after_t = [.0] if start_pos == 0 else []
+    all_losses_after_t = []
+    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))), gpytorch.settings.fast_pred_var(False):
+        for t in range(max(start_pos, 1), len(x), step_size):
+            loss_sum = 0.
+            model, likelihood = get_model_on_device(x[:t].transpose(0, 1), y[:t].transpose(0, 1), hyperparameters, device)
+            model.eval()
+            # print([t.shape for t in model.train_inputs])
+            # print(x[:t].transpose(0,1).shape, x[t].unsqueeze(1).shape, y[:t].transpose(0,1).shape)
+            f = model(x[t].unsqueeze(1))
+            l = likelihood(f)
+            means = l.mean.squeeze()
+            varis = l.covariance_matrix.squeeze()
+            # print(l.variance.squeeze(), l.mean.squeeze(), y[t])
+            assert len(means.shape) == len(varis.shape) == 1
+            assert len(means) == len(varis) == x.shape[1]
+            if use_mse:
+                c = nn.MSELoss(reduction='none')
+                ls = c(means, y[t])
+            else:
+                ls = -l.log_prob(y[t].unsqueeze(1))
+            losses_after_t.append(ls.mean())
+            all_losses_after_t.append(ls.flatten())
+        return torch.stack(all_losses_after_t).to('cpu'), torch.tensor(losses_after_t).to('cpu'), time.time() - start_time
+if __name__ == '__main__':
+    hps = (.1,.1,.1)
+    for redo_idx in range(1):
+        print(
+            evaluate(*get_batch(1000, 10, hyperparameters=hps, num_features=10), use_mse=False, hyperparameters=hps))

lcpfn/priors/fast_gp_mix.py ADDED Viewed

	@@ -0,0 +1,394 @@

+import time
+import functools
+import random
+import math
+import traceback
+import numpy as np
+import torch
+from torch import nn
+import gpytorch
+from botorch.models import SingleTaskGP
+from botorch.models.gp_regression import MIN_INFERRED_NOISE_LEVEL
+from botorch.fit import fit_gpytorch_model
+from gpytorch.mlls import ExactMarginalLogLikelihood
+from gpytorch.likelihoods import GaussianLikelihood
+from gpytorch.priors.torch_priors import GammaPrior, UniformPrior
+from gpytorch.constraints import GreaterThan
+from bar_distribution import BarDistribution
+from utils import default_device
+from .utils import get_batch_to_dataloader
+from . import fast_gp
+def get_model(x, y, hyperparameters: dict, sample=True):
+    """Build a GP whose hyperparameters carry priors, optionally sampled from those priors.
+
+    :param x: training inputs handed to the GP.
+    :param y: training targets; the non-handmade branch passes ``y.unsqueeze(-1)`` to ``SingleTaskGP``.
+    :param hyperparameters: dict of options. ``'handmade'`` selects the
+        hand-written ``ExactGP`` with uniform priors; otherwise a ``SingleTaskGP``
+        with Gamma priors is built, configurable through ``'noise_concentration'``,
+        ``'noise_rate'``, ``'nu'``, ``'lengthscale_concentration'``,
+        ``'lengthscale_rate'``, ``'outputscale_concentration'`` and ``'outputscale_rate'``.
+    :param sample: if true, return ``model.pyro_sample_from_prior()`` and its
+        likelihood; otherwise return the unsampled model and likelihood.
+    :return: tuple ``(model, likelihood)``.
+    """
+    if hyperparameters.get('handmade', False):
+        # We will use the simplest form of GP model, exact inference
+        class ExactGPModel(gpytorch.models.ExactGP):
+            def __init__(self, train_x, train_y, likelihood):
+                super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+                self.mean_module = gpytorch.means.ConstantMean()
+                self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel())
+                # Uniform priors on mean constant, lengthscale, outputscale and noise.
+                self.mean_module.register_prior("mean_prior", UniformPrior(-1, 1), "constant")
+                self.covar_module.base_kernel.register_prior("lengthscale_prior", UniformPrior(0.01, 0.5),
+                                                              "lengthscale")
+                # model.covar_module.base_kernel.register_prior("period_length_prior", UniformPrior(0.05, 2.5), "period_length")
+                self.covar_module.register_prior("outputscale_prior", UniformPrior(1, 2), "outputscale")
+                likelihood.register_prior("noise_prior", UniformPrior(0.001, 0.01), "noise")
+                # `x` here is the argument of the enclosing get_model (closure), not train_x.
+                self.to(x)
+            def forward(self, x):
+                mean_x = self.mean_module(x)
+                covar_x = self.covar_module(x)
+                return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+        likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.Positive())
+        model = ExactGPModel(x, y, likelihood)
+    else:
+        # A throwaway SingleTaskGP is built only to read the batch shape it derives from (x, y).
+        aug_batch_shape = SingleTaskGP(x,y.unsqueeze(-1))._aug_batch_shape
+        noise_prior = GammaPrior(hyperparameters.get('noise_concentration',1.1), hyperparameters.get('noise_rate',0.05))
+        # Mode of a Gamma(concentration, rate) distribution; used as the initial noise value.
+        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
+        likelihood = GaussianLikelihood(
+            noise_prior=noise_prior,
+            batch_shape=aug_batch_shape,
+            noise_constraint=GreaterThan(
+                MIN_INFERRED_NOISE_LEVEL,
+                transform=None,
+                initial_value=noise_prior_mode,
+            ),
+        )
+        model = SingleTaskGP(x, y.unsqueeze(-1),
+                             covar_module=gpytorch.kernels.ScaleKernel(
+                                gpytorch.kernels.MaternKernel(
+                                    nu=hyperparameters.get('nu',2.5),
+                                    ard_num_dims=x.shape[-1],
+                                    batch_shape=aug_batch_shape,
+                                    lengthscale_prior=gpytorch.priors.GammaPrior(hyperparameters.get('lengthscale_concentration',3.0), hyperparameters.get('lengthscale_rate',6.0)),
+                                ),
+                                batch_shape=aug_batch_shape,
+                                outputscale_prior=gpytorch.priors.GammaPrior(hyperparameters.get('outputscale_concentration',.5), hyperparameters.get('outputscale_rate',0.15)),
+                            ), likelihood=likelihood)
+        likelihood = model.likelihood
+        model.to(x.device)
+    if sample:
+        sampled_model = model.pyro_sample_from_prior()
+        return sampled_model, sampled_model.likelihood
+    else:
+        # The options 'sigmoid' and 'y_minmax_norm' are rejected when no model is sampled.
+        assert not(hyperparameters.get('sigmoid', False)) and not(hyperparameters.get('y_minmax_norm', False)), "Sigmoid and y_minmax_norm can only be used to sample models..."
+        return model, likelihood
+@torch.no_grad()
+def get_batch(batch_size, seq_len, num_features, device=default_device, hyperparameters=None,
+              batch_size_per_gp_sample=None,
+              fix_to_range=None, equidistant_x=False, **kwargs):
+    '''
+    This function is very similar to the equivalent in .fast_gp. The only difference is that this function operates over
+    a mixture of GP priors.
+    :param batch_size:
+    :param seq_len:
+    :param num_features:
+    :param device:
+    :param hyperparameters:
+    :param for_regression:
+    :return:
+    '''
+    hyperparameters = hyperparameters or {}
+    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))):
+        batch_size_per_gp_sample = (batch_size_per_gp_sample or max(batch_size // 10,1))
+        assert batch_size % batch_size_per_gp_sample == 0
+        total_num_candidates = batch_size*(2**(fix_to_range is not None))
+        num_candidates = batch_size_per_gp_sample * (2**(fix_to_range is not None))
+        if equidistant_x:
+            assert num_features == 1
+            x = torch.linspace(0,1.,seq_len).unsqueeze(0).repeat(total_num_candidates,1).unsqueeze(-1)
+        else:
+            x = torch.rand(total_num_candidates, seq_len, num_features, device=device)
+        samples = []
+        samples_wo_noise = []
+        for i in range(0,total_num_candidates,num_candidates):
+            model, likelihood = get_model(x[i:i+num_candidates], torch.zeros(num_candidates,x.shape[1]).to(device), hyperparameters)
+            model.to(device)
+            likelihood.to(device)
+            if hyperparameters.get('handmade', False):
+                model.covar_module.base_kernel.lengthscale = model.covar_module.base_kernel.lengthscale.to(device)
+                model.covar_module.outputscale = model.covar_module.outputscale.to(device)
+                likelihood.noise = likelihood.noise.to(device)
+                model.mean_module.constant = model.mean_module.constant.to(device)
+            # trained_model = ExactGPModel(train_x, train_y, likelihood).cuda()
+            # trained_model.eval()
+            successful_sample = 0
+            throwaway_share = 0.
+            sampling_with_observation_noise =  hyperparameters.get("observation_noise", True)
+            while successful_sample < 1:
+                with gpytorch.settings.prior_mode(True):
+                    #print(x.device, device, f'{model.covar_module.base_kernel.lengthscale=}, {model.covar_module.base_kernel.lengthscale.device=}')
+                    if sampling_with_observation_noise :
+                        d = model(x[i:i+num_candidates])
+                        d = likelihood(d)
+                        sample = d.sample() # bs_per_gp_s x T
+                    else:
+                        d = model(x[i:i+num_candidates])
+                        sample_wo_noise = d.sample()
+                        sample = likelihood(sample_wo_noise).sample()
+                    if hyperparameters.get('y_minmax_norm'):
+                        sample = ((sample - sample.min(1)[0]) / (sample.max(1)[0] - sample.min(1)[0]))
+                    if hyperparameters.get('sigmoid'):
+                        sample = sample.sigmoid()
+                    if not sampling_with_observation_noise:
+                        if hyperparameters.get('y_minmax_norm'):
+                            sample_wo_noise = ((sample_wo_noise - sample_wo_noise.min(1)[0]) / (sample_wo_noise.max(1)[0] - sample_wo_noise.min(1)[0]))
+                        if hyperparameters.get('sigmoid'):
+                            sample_wo_noise = sample_wo_noise.sigmoid()
+                    if fix_to_range is None:
+                        samples.append(sample.transpose(0, 1))
+                        if not sampling_with_observation_noise: samples_wo_noise.append(sample_wo_noise.transpose(0,1))
+                        successful_sample = True
+                        continue
+                    smaller_mask = sample < fix_to_range[0]
+                    larger_mask = sample >= fix_to_range[1]
+                    in_range_mask = ~ (smaller_mask | larger_mask).any(1)
+                    throwaway_share += (~in_range_mask[:batch_size_per_gp_sample]).sum()/batch_size_per_gp_sample
+                    if in_range_mask.sum() < batch_size_per_gp_sample:
+                        successful_sample -= 1
+                        if successful_sample < 100:
+                            print("Please change hyper-parameters (e.g. decrease outputscale_mean) it"
+                                "seems like the range is set to tight for your hyper-parameters.")
+                        continue
+                    x[i:i+batch_size_per_gp_sample] = x[i:i+num_candidates][in_range_mask][:batch_size_per_gp_sample]
+                    sample = sample[in_range_mask][:batch_size_per_gp_sample]
+                    samples.append(sample.transpose(0,1))
+                    if not sampling_with_observation_noise: samples_wo_noise.append(sample_wo_noise.transpose(0,1))
+                    successful_sample = True
+        if random.random() < .01:
+            print('throwaway share', throwaway_share/(batch_size//batch_size_per_gp_sample))
+        #print(f'took {time.time() - start}')
+        x = x.view(-1,batch_size,seq_len,num_features)[0]
+        # TODO think about enabling the line below
+        #sample = sample - sample[0, :].unsqueeze(0).expand(*sample.shape)
+        x = x.transpose(0,1)
+        sample = torch.cat(samples, 1)
+        if sampling_with_observation_noise:
+            target_sample = sample
+        else:
+            target_sample = torch.cat(samples_wo_noise, 1)
+        assert x.shape[:2] == sample.shape[:2]
+    return x, sample, target_sample # x.shape = (T,B,H)
+class DataLoader(get_batch_to_dataloader(get_batch)):
+    @torch.no_grad()
+    def validate(self, model, step_size=1, start_pos=0):
+        if isinstance(model.criterion, BarDistribution):
+            (_, x,y), target_y, eval_pos = self.gbm(**self.get_batch_kwargs)
+            model.eval()
+            losses = []
+            for eval_pos in range(start_pos, len(x), step_size):
+                logits = model((x,y), single_eval_pos=eval_pos)
+                means = model.criterion.mean(logits) # num_evals x batch_size
+                mse = nn.MSELoss()
+                losses.append(mse(means[0], target_y[eval_pos]))
+            model.train()
+            return torch.stack(losses)
+        else:
+            return 123.
+@torch.enable_grad()
+def get_fitted_model(x, y, hyperparameters, device):
+    # fit the gaussian process
+    model, likelihood = get_model(x,y,hyperparameters,sample=False)
+    #print(model.covar_module.base_kernel.lengthscale)
+    model.to(device)
+    mll = ExactMarginalLogLikelihood(likelihood, model)
+    model.train()
+    fit_gpytorch_model(mll)
+    #print(model.covar_module.base_kernel.lengthscale)
+    return model, likelihood
+# Same evaluation loop as fast_gp.evaluate, but the model for each step comes from get_fitted_model.
+evaluate = functools.partial(fast_gp.evaluate, get_model_on_device=get_fitted_model)
+def get_mcmc_model(x, y, hyperparameters, device, num_samples, warmup_steps, obs=True):
+    from pyro.infer.mcmc import NUTS, MCMC, HMC
+    import pyro
+    x = x.to(device)
+    y = y.to(device)
+    model, likelihood = get_model(x, y, hyperparameters, sample=False)
+    model.to(device)
+    def pyro_model(x, y):
+        sampled_model = model.pyro_sample_from_prior()
+        output = sampled_model.likelihood(sampled_model(x))
+        if obs:
+            return pyro.sample("obs", output, obs=y)
+    nuts_kernel = NUTS(pyro_model)
+    mcmc_run = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=warmup_steps, num_chains=1)
+    #print(x.shape)
+    mcmc_run.run(x, y)
+    #print(mcmc_run.get_samples())
+    model.pyro_load_from_samples(mcmc_run.get_samples()) # pyro.infer wie noah?
+    model.eval()
+    #print(mcmc_run.diagnostics())
+    # test_x = torch.linspace(0, 1, 101).unsqueeze(-1)
+    # test_y = torch.sin(test_x * (2 * math.pi))
+    # expanded_test_x = test_x.unsqueeze(0).repeat(num_samples, 1, 1)
+    # output = model(expanded_test_x)
+    #print(x.shape)
+    return model, likelihood
+    # output = model(x[-1].unsqueeze(1).repeat(1, num_samples 1))
+    # return output.mean
+def get_mean_logdensity(dists, x: torch.Tensor, full_range=None):
+    means = torch.cat([d.mean.squeeze() for d in dists], 0)
+    vars = torch.cat([d.variance.squeeze() for d in dists], 0)
+    assert len(means.shape) == 1 and len(vars.shape) == 1
+    dist = torch.distributions.Normal(means, vars.sqrt())
+    #logprobs = torch.cat([d.log_prob(x) for d in dists], 0)
+    logprobs = dist.log_prob(x)
+    if full_range is not None:
+        used_weight = 1. - (dist.cdf(torch.tensor(full_range[0])) + (1.-dist.cdf(torch.tensor(full_range[1]))))
+        if torch.isinf(-torch.log(used_weight)).any() or torch.isinf(torch.log(used_weight)).any():
+            print('factor is inf', -torch.log(used_weight))
+        logprobs -= torch.log(used_weight)
+    assert len(logprobs.shape) == 1
+    #print(logprobs)
+    return torch.logsumexp(logprobs, 0) - math.log(len(logprobs))
+def evaluate_(x, y, y_non_noisy, hyperparameters=None, device=default_device, num_samples=100, warmup_steps=300,
+              full_range=None, min_seq_len=0, use_likelihood=False, obs=True):
+    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))), gpytorch.settings.fast_pred_var(False):
+        x = x.to(device).double()
+        y = y.to(device).double()
+        start_time = time.time()
+        losses_after_t = [.0] if min_seq_len == 0 else []
+        all_losses = []
+        for t in range(max(min_seq_len,1), len(x)):
+            #print('Timestep', t)
+            loss_sum = 0.
+            step_losses = []
+            start_step = time.time()
+            print(x.shape, y.shape)
+            for b_i in range(x.shape[1]):
+                x_train = x[:t,b_i]
+                y_train = y[:t,b_i]
+                from pyro.infer.mcmc import NUTS, MCMC, HMC
+                import pyro
+                x_train = x_train.to(device)
+                y_train = y_train.to(device)
+                print(x_train.shape, y_train.shape)
+                model, likelihood = get_model(x_train, y_train, hyperparameters, sample=False)
+                model.to(device)
+                def pyro_model(x, y):
+                    sampled_model = model.pyro_sample_from_prior()
+                    output = sampled_model.likelihood(sampled_model(x))
+                    if obs:
+                        return pyro.sample("obs", output, obs=y)
+                nuts_kernel = NUTS(pyro_model)
+                mcmc_run = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=warmup_steps, num_chains=1, disable_progbar=True)
+                # print(x.shape)
+                mcmc_run.run(x_train, y_train)
+                # print(mcmc_run.get_samples())
+                model.pyro_load_from_samples(mcmc_run.get_samples())
+                model.eval()
+                with torch.no_grad():
+                    dists = model(x[t, b_i, :].unsqueeze(
+                        0).repeat(num_samples, 1, 1))
+                    if use_likelihood:
+                        dists = likelihood(dists)
+                    l = -get_mean_logdensity([dists], y[t, b_i].repeat(num_samples), full_range)
+                    print(l)
+                step_losses.append(l.item())
+                #print('loss',l.item())
+                print(f'current average loss at step {t} is {sum(step_losses)/len(step_losses)} with {(time.time()-start_step)/len(step_losses)} s per eval.')
+                loss_sum += l
+            loss_sum /= x.shape[1]
+            all_losses.append(step_losses)
+            print(f'loss after step {t} is {loss_sum}')
+            losses_after_t.append(loss_sum)
+            print(f'losses so far {torch.tensor(losses_after_t)}')
+        return torch.tensor(losses_after_t), time.time() - start_time, all_losses
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batch_size', type=int)
+    parser.add_argument('--seq_len', type=int)
+    parser.add_argument('--min_seq_len', type=int, default=0)
+    parser.add_argument('--warmup_steps', type=int)
+    parser.add_argument('--num_samples', type=int)
+    parser.add_argument('--min_y', type=int)
+    parser.add_argument('--max_y', type=int)
+    parser.add_argument('--dim', type=int, default=1)
+    parser.add_argument('--use_likelihood', action='store_true')
+    parser.add_argument('--device', default='cpu')
+    parser.add_argument('--outputscale_concentraion', default=2., type=float)
+    parser.add_argument('--noise_concentration', default=1.1, type=float)
+    parser.add_argument('--noise_rate', default=.05, type=float)
+    parser.add_argument('--handmade', action='store_true')
+    parser.add_argument('--no_obs', action='store_true')
+    parser.add_argument('--seed', type=int, default=0)
+    args = parser.parse_args()
+    import pyro
+    import gpytorch
+    print(pyro.__version__)
+    print(gpytorch.__version__)
+    print('min_y:', args.min_y)
+    full_range = (None if args.min_y is None else (args.min_y,args.max_y))
+    hps = {'handmade': args.handmade, 'outputscale_concentration': args.outputscale_concentraion, 'noise_concentration': args.noise_concentration,
+           'noise_rate': args.noise_rate, 'fast_computations': (False,False,False)}
+    if args.seed:
+        torch.manual_seed(args.seed)
+        np.random.seed(args.seed)
+        random.seed(args.seed)
+    x, y, _ = get_batch(args.batch_size, args.seq_len, args.dim, fix_to_range=full_range, hyperparameters=hps)
+    #assert args.seq_len == 7 and args.min_seq_len == 6
+    #x = torch.cat([torch.linspace(0, 1, 6), torch.tensor([.33])]).unsqueeze(1).repeat(1,args.batch_size).unsqueeze(-1)
+    #y = torch.sin(x * (2 * math.pi)).squeeze(-1)
+    print('RESULT:', evaluate_(x, y, y, device=args.device, warmup_steps=args.warmup_steps,
+                               num_samples=args.num_samples, full_range=full_range, min_seq_len=args.min_seq_len,
+                               hyperparameters=hps, use_likelihood=args.use_likelihood, obs=not args.no_obs))

lcpfn/priors/gp.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import time
+import random
+import numpy as np
+import torch
+from torch import nn
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel
+from .utils import get_batch_to_dataloader
+# RBF length scale used by get_gp when none is given; also the default length scale of evaluate.
+length_scale_sampling_gp = .6
+def get_gp(length_scale=None):
+    return GaussianProcessRegressor(
+        kernel=RBF(length_scale=length_scale or length_scale_sampling_gp, length_scale_bounds='fixed'),
+        random_state=0, optimizer=None)
+def get_batch(batch_size, seq_len, num_features, noisy_std=None):
+    # m = torch.normal(0.,.1,size=(batch_size,num_features))
+    # m2 = torch.rand(batch_size,num_features)
+    # b = 0 # torch.rand(batch_size)
+    x_t = torch.rand(batch_size, seq_len, num_features)
+    # gp_b = TensorGP(kernel=TensorRBF(noisy_std))
+    # y_t = gp_b.sample_from_GP_prior(x_t).detach()
+    gpr = get_gp(noisy_std)
+    y_t = torch.zeros(batch_size, seq_len)
+    for i in range(len(y_t)):
+        y_t[i] += gpr.sample_y(x_t[i], random_state=random.randint(0, 2 ** 32)).squeeze()
+    x, y = x_t.transpose(0, 1), y_t.transpose(0, 1)
+    # x, _ = torch.sort(x,dim=0)
+    return x, y, y
+# Data loader for this prior, built by passing get_batch to get_batch_to_dataloader.
+DataLoader = get_batch_to_dataloader(get_batch)
+def evaluate(x, y, y_non_noisy, use_mse=False, length_scale=length_scale_sampling_gp):
+    start_time = time.time()
+    losses_after_t = [.0]
+    for t in range(1, len(x)):
+        loss_sum = 0.
+        for b_i in range(x.shape[1]):
+            gpr = get_gp(length_scale).fit(x[:t, b_i], y[:t, b_i])
+            means, stds = gpr.predict(x[t, b_i].unsqueeze(0), return_std=True)
+            assert len(means) == 1 == len(stds)
+            if use_mse:
+                c = nn.MSELoss()
+                l = c(torch.tensor(means), y[t, b_i].unsqueeze(-1))
+            else:
+                c = nn.GaussianNLLLoss(full=True)
+                l = c(torch.tensor(means), y[t, b_i].unsqueeze(-1),
+                      var=torch.tensor(stds) ** 2)
+            loss_sum += l
+        losses_after_t.append(loss_sum / x.shape[1])
+    return torch.tensor(losses_after_t), time.time()-start_time
+if __name__ == '__main__':
+    ls = .1
+    for alpha in set([ls, ls * 1.1, ls * .9]):
+        print(alpha)
+        for redo_idx in range(1):
+            print(
+                evaluate(*get_batch(1000, 10, noisy_std=ls, num_features=10), use_mse=False, length_scale=alpha))

lcpfn/priors/prior.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from abc import ABCMeta, abstractmethod
+from torch.utils.data import DataLoader
+class PriorDataLoader(DataLoader, metaclass=ABCMeta):
+    """Abstract interface for data loaders that sample training batches from a prior."""
+    @abstractmethod
+    def __init__(self, num_steps, batch_size, eval_pos_seq_len_sampler, seq_len_maximum, device, **kwargs):
+        """
+        :param num_steps: int, first argument, the number of steps to take per epoch, i.e. iteration of the DataLoader
+        :param batch_size: int, number of datasets per batch
+        :param eval_pos_seq_len_sampler: callable, it takes no arguments and returns a tuple (single eval pos, bptt)
+        :param seq_len_maximum: presumably the largest sequence length the sampler can return
+            (`priors/utils.py` reads it to scale the batch size) - TODO confirm
+        :param device: device handed over by the training loop; how it is used is up to the subclass
+        :param kwargs: for future compatibility it is good to have a final catch-all, as new kwargs might be introduced
+        """
+        pass
+    # A class or object variable `num_features`: int
+    # Optional: `validate` function that accepts a transformer model
+    # The DataLoader iter should return batches of the form ([style], x, y), target_y, single_eval_pos
+    # We follow sequence len (s) first, batch size (b) second. So x: (s,b,num_features), y,target_y: (s,b)
+    # and style: Optional[(b,num_style_params)], style can be omitted or set to None, if it is not intended to be used.
+    # For more references, see `priors/utils.py` for a pretty general implementation of a DataLoader
+    # and `train.py` for the only call of it.

lcpfn/priors/pyro.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import random
+import torch
+from torch import nn
+from utils import default_device
+from .utils import get_batch_to_dataloader
+def get_batch(batch_size, seq_len, batch_size_per_gp_sample=None, **config):
+    batch_size_per_gp_sample = batch_size_per_gp_sample or batch_size // 16
+    assert batch_size % batch_size_per_gp_sample == 0, 'Please choose a batch_size divisible by batch_size_per_gp_sample.'
+    num_models = batch_size // batch_size_per_gp_sample
+    # standard kaiming uniform init currently...
+    models = [config['model']() for _ in range(num_models)]
+    sample = sum([[model(seq_len=seq_len) for _ in range(0,batch_size_per_gp_sample)] for model in models],[])
+    def normalize_data(data):
+        mean = data.mean(0)
+        std = data.std(0) + .000001
+        eval_xs = (data - mean) / std
+        return eval_xs
+    x, y = zip(*sample)
+    y = torch.stack(y, 1).squeeze(-1).detach()
+    x = torch.stack(x, 1).detach()
+    if 'normalize_y' in config and config['normalize_y']:
+        x, y = normalize_data(x), normalize_data(y)
+    elif 'normalize_y' in config and config['normalize']:
+        x, y = normalize_data(x), y
+    return x, y, y
+# DataLoader class obtained by wrapping `get_batch` with `get_batch_to_dataloader`.
+DataLoader = get_batch_to_dataloader(get_batch)

lcpfn/priors/ridge.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import random
+import time
+import numpy as np
+import torch
+from torch import nn
+from sklearn.linear_model import Ridge
+from .utils import get_batch_to_dataloader
+def get_batch(batch_size, seq_len, num_features, noisy_std = .1):
+    m = torch.normal(0., .1, size=(batch_size,num_features))
+    b = 0 # torch.rand(batch_size)
+    x = torch.rand(seq_len, batch_size,num_features)
+    y_non_noisy = torch.einsum('bf,tbf->tb',m,x)
+    y = y_non_noisy + torch.normal(torch.zeros_like(y_non_noisy),noisy_std) # noisy_std is alpha
+    return x, y, y_non_noisy
+# DataLoader class obtained by wrapping `get_batch` with `get_batch_to_dataloader`.
+DataLoader = get_batch_to_dataloader(get_batch)
+def evaluate(x,y,y_non_noisy, alpha=0.):
+    start_time = time.time()
+    losses_after_t = [.0]
+    for t in range(1,len(x)):
+        loss_sum = 0.
+        for b_i in range(x.shape[1]):
+            clf = Ridge(alpha=alpha)
+            clf.fit(x[:t,b_i],y[:t,b_i])
+            y_ = clf.predict(x[t,b_i].unsqueeze(0))
+            l = nn.MSELoss()(y_non_noisy[t,b_i].unsqueeze(0),torch.tensor(y_))
+            loss_sum += l
+        losses_after_t.append(loss_sum/x.shape[1])
+    return torch.tensor(losses_after_t), time.time()-start_time
+if __name__ == '__main__':
+    for alpha in [.001,.01,.5,1.]:
+        print(alpha, evaluate(*get_batch(1000,10,noisy_std=.01),alpha=alpha))

lcpfn/priors/stroke.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from PIL import Image, ImageDraw, ImageFilter
+import random
+import math
+import torch
+import numpy as np
+from .utils import get_batch_to_dataloader
+def mnist_prior(num_classes=2, size=28, min_max_strokes=(1,3), min_max_len=(5/28,20/28), min_max_start=(2/28,25/28),
+                min_max_width=(1/28,4/28), max_offset=4/28, max_target_offset=2/28):
+    classes = []
+    for i in range(num_classes):
+        num_strokes = random.randint(*min_max_strokes)
+        len_strokes = [random.randint(int(size * min_max_len[0]), int(size * min_max_len[1])) for i in range(num_strokes)]
+        stroke_start_points = [
+            (random.randint(int(size * min_max_start[0]), int(size * min_max_start[1])), random.randint(int(size * min_max_start[0]), int(size * min_max_start[1]))) for i in
+            range(num_strokes)]
+        stroke_directions = []
+        # i = Image.fromarray(np.zeros((28,28),dtype=np.uint8))
+        # draw = ImageDraw.Draw(i)
+        for i in range(num_strokes):
+            sp, length = stroke_start_points[i], len_strokes[i]
+            counter = 0
+            while True:
+                if counter % 3 == 0:
+                    length = random.randint(int(size * min_max_len[0]), int(size * min_max_len[1]))
+                    sp = (
+                    random.randint(int(size * min_max_start[0]), int(size * min_max_start[1])), random.randint(int(size * min_max_start[0]), int(size * min_max_start[1])))
+                    stroke_start_points[i], len_strokes[i] = sp, length
+                radians = random.random() * 2 * math.pi
+                x_vel = math.cos(radians) * length
+                y_vel = math.sin(radians) * length
+                new_p = (sp[0] + x_vel, sp[1] + y_vel)
+                # print(math.degrees(radians),sp,new_p)
+                if not any(n > size - 1 or n < 0 for n in new_p):
+                    break
+                counter += 1
+            stroke_directions.append(radians)
+            # print([round(x) for x in sp+new_p])
+            # draw.line([round(x) for x in sp+new_p], fill=128, width=3)
+        classes.append((len_strokes, stroke_start_points, stroke_directions))
+    generator_functions = []
+    for c in classes:
+        def g(c=c):
+            len_strokes, stroke_start_points, stroke_directions = c
+            i = Image.fromarray(np.zeros((size, size), dtype=np.uint8))
+            draw = ImageDraw.Draw(i)
+            width = random.randint(int(size * min_max_width[0]), int(size * min_max_width[1]))
+            offset = random.randint(int(-size * max_offset), int(size * max_offset)), random.randint(int(- size * max_offset), int(size * max_offset))
+            for sp, length, radians in zip(stroke_start_points, len_strokes, stroke_directions):
+                sp = (sp[0] + offset[0], sp[1] + offset[1])
+                x_vel = math.cos(radians) * length + random.randint(int(-size * max_target_offset), int(size * max_target_offset))
+                y_vel = math.sin(radians) * length + random.randint(int(-size * max_target_offset), int(size * max_target_offset))
+                new_p = (sp[0] + x_vel, sp[1] + y_vel)
+                stroke_directions.append(radians)
+                draw.line([round(x) for x in sp + new_p], fill=128, width=width)
+            a_i = np.array(i)
+            a_i[a_i == 128] = np.random.randint(200, 255, size=a_i.shape)[a_i == 128]
+            return Image.fromarray(a_i).filter(ImageFilter.GaussianBlur(.2))
+        generator_functions.append(g)
+    return generator_functions
+# g1,g2 = mnist_prior(2)
+# for i in [g1() for _ in range(10)]:
+#    display(i.resize((200,200)))
+from torchvision.transforms import ToTensor, ToPILImage
+def normalize(x):
+    return (x-x.mean())/(x.std()+.000001)
+from os import path, listdir
+import random
+def get_batch(batch_size, seq_len, num_features=None, noisy_std=None, only_train_for_last_idx=False, normalize_x=False, num_outputs=2, use_saved_from=None, **kwargs):  # num_features = 28*28=784
+    if use_saved_from is not None:
+        directory = path.join(use_saved_from, f'len_{seq_len}_out_{num_outputs}_features_{num_features}_bs_{batch_size}')
+        filename = random.choice(listdir(directory))
+        return torch.load(path.join(directory,filename))
+    size = math.isqrt(num_features)
+    assert size * size == num_features, 'num_features needs to be the square of an integer.'
+    if only_train_for_last_idx:
+        assert (seq_len-1) % num_outputs == 0
+    # assert seq_len % 2 == 0, "assert seq_len % 2 == 0"
+    batch = []
+    y = []
+    target_y = []
+    for b_i in range(batch_size):
+        gs = mnist_prior(num_outputs, size, **kwargs)
+        if only_train_for_last_idx:
+            generators = [i for i in range(len(gs)) for _ in range((seq_len-1) // num_outputs)]
+            random.shuffle(generators)
+            generators += [random.randint(0, len(gs) - 1)]
+            target = [-100 for _ in generators]
+            target[-1] = generators[-1]
+        else:
+            generators = [random.randint(0, len(gs) - 1) for _ in range(seq_len)]
+            target = generators
+        normalize_or_not = lambda x: normalize(x) if normalize_x else x
+        s = torch.cat([normalize_or_not(ToTensor()(gs[f_i]())) for f_i in generators], 0)
+        batch.append(s)
+        y.append(torch.tensor(generators))
+        target_y.append(torch.tensor(target))
+    x = torch.stack(batch, 1).view(seq_len, batch_size, -1)
+    y = torch.stack(y, 1)
+    target_y = torch.stack(target_y, 1)
+    return x,y,target_y
+# DataLoader class obtained by wrapping `get_batch` with `get_batch_to_dataloader`.
+DataLoader = get_batch_to_dataloader(get_batch)
+# Same value as the `num_outputs=2` default of `get_batch`.
+DataLoader.num_outputs = 2
+if __name__ == '__main__':
+    g1, g2 = mnist_prior(2, size=3)
+    # for i in range(10):
+    # print(PILToTensor()(g1()))
+    # display(ToPILImage()(PILToTensor()(g1())).resize((200,200)))
+    # display(g2().resize((200,200)))
+    size = 10
+    x, y = get_batch(1, 10, num_features=size * size)
+    x_ = x[..., :-1].squeeze(1)
+    last_y = x[..., -1].squeeze(1)
+    y = y.squeeze(1)
+    # print(y)
+    for i, y_, last_y_, x__ in zip(x_, y, last_y, x.squeeze(1)):
+        # print(y_)
+        # print(i.shape)
+        # print(x__)
+        img = ToPILImage()(i.view(size, size))
+        # display(img.resize((200,200)))
+    print(y, last_y)

lcpfn/priors/utils.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import random
+import pandas as pd
+import torch
+from lcpfn.utils import set_locals_in_self
+from itertools import repeat
+from .prior import PriorDataLoader
+from torch import nn
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import scipy.stats as stats
+import math
+def get_batch_to_dataloader(get_batch_method_):
+    """Wrap a `get_batch` function into a `PriorDataLoader` subclass.
+    :param get_batch_method_: function returning (x, y, target_y) or (x, y, target_y, style)
+    :return: the class `DL` (not an instance); iterating an instance yields `num_steps` batches
+    """
+    class DL(PriorDataLoader):
+        get_batch_method = get_batch_method_
+        # Caution, you might need to set self.num_features manually if it is not part of the args.
+        def __init__(self, num_steps, **get_batch_kwargs):
+            # NOTE(review): presumably stores `num_steps` and `get_batch_kwargs` on `self`
+            # (both are read back in `__len__` / `__iter__`) - confirm in `lcpfn.utils`.
+            set_locals_in_self(locals())
+            # The stuff outside the or is set as class attribute before instantiation.
+            self.num_features = get_batch_kwargs.get('num_features') or self.num_features
+            print('DataLoader.__dict__', self.__dict__)
+        @staticmethod
+        def gbm(*args, eval_pos_seq_len_sampler, **kwargs):
+            """Sample one batch and return it as ((style, x, y), target_y, single_eval_pos)."""
+            kwargs['single_eval_pos'], kwargs['seq_len'] = eval_pos_seq_len_sampler()
+            # Scales the batch size dynamically with the power of 'dynamic_batch_size'.
+            # A transformer with quadratic memory usage in the seq len would need a power of 2 to keep memory constant.
+            if 'dynamic_batch_size' in kwargs and kwargs['dynamic_batch_size'] > 0:
+                kwargs['batch_size'] = kwargs['batch_size'] * math.floor(math.pow(kwargs['seq_len_maximum'], kwargs['dynamic_batch_size']) / math.pow(kwargs['seq_len'], kwargs['dynamic_batch_size']))
+            # All kwargs, including 'single_eval_pos' and 'seq_len', are forwarded, so the
+            # wrapped function has to accept them.
+            batch = get_batch_method_(*args, **kwargs)
+            # A fourth element, if present, is the style; otherwise style is None.
+            x, y, target_y, style = batch if len(batch) == 4 else (batch[0], batch[1], batch[2], None)
+            return (style, x, y), target_y, kwargs['single_eval_pos']
+        def __len__(self):
+            return self.num_steps
+        def __iter__(self):
+            # A new batch is sampled lazily for every step.
+            return iter(self.gbm(**self.get_batch_kwargs) for _ in range(self.num_steps))
+    return DL
+"""
+import seaborn as sns
+def plot_features(data, targets, fig=None):
+    if torch.is_tensor(data):
+        data = data.detach().cpu().numpy()
+        targets = targets.detach().cpu().numpy()
+    fig2 = plt.figure(figsize=(8, 8))
+    spec2 = gridspec.GridSpec(ncols=data.shape[1], nrows=data.shape[1], figure=fig2)
+    for d in range(0, data.shape[1]):
+        for d2 in range(0, data.shape[1]):
+            sub_ax = fig2.add_subplot(spec2[d, d2])
+            if d == d2:
+                sns.kdeplot(data[:, d],hue=targets[:],ax=sub_ax,legend=False, palette="deep")
+                sub_ax.set(ylabel=None)
+            else:
+                sns.scatterplot(data[:, d], data[:, d2],
+                           hue=targets[:],legend=False, palette="deep")
+                #plt.scatter(data[:, d], data[:, d2],
+                #               c=targets[:])
+            sub_ax.get_xaxis().set_ticks([])
+            sub_ax.get_yaxis().set_ticks([])
+    plt.subplots_adjust(wspace=0.05, hspace=0.05)
+    fig2.show()
+def plot_prior(prior):
+    s = np.array([prior() for _ in range(0, 1000)])
+    count, bins, ignored = plt.hist(s, 50, density=True)
+    print(s.min())
+    plt.show()
+"""
+trunc_norm_sampler_f = lambda mu, sigma : lambda: stats.truncnorm((0 - mu) / sigma, (1000000 - mu) / sigma, loc=mu, scale=sigma).rvs(1)[0]
+beta_sampler_f = lambda a, b : lambda : np.random.beta(a, b)
+gamma_sampler_f = lambda a, b : lambda : np.random.gamma(a, b)
+uniform_sampler_f = lambda a, b : lambda : np.random.uniform(a, b)
+uniform_int_sampler_f = lambda a, b : lambda : round(np.random.uniform(a, b))
+def zipf_sampler_f(a, b, c):
+    x = np.arange(b, c)
+    weights = x ** (-a)
+    weights /= weights.sum()
+    return lambda : stats.rv_discrete(name='bounded_zipf', values=(x, weights)).rvs(1)
+scaled_beta_sampler_f = lambda a, b, scale, minimum : lambda : minimum + round(beta_sampler_f(a, b)() * (scale - minimum))
+def normalize_by_used_features_f(x, num_features_used, num_features, normalize_with_sqrt=False):
+    if normalize_with_sqrt:
+        return x / (num_features_used / num_features)**(1 / 2)
+    return x / (num_features_used / num_features)
+def order_by_y(x, y):
+    order = torch.argsort(y if random.randint(0, 1) else -y, dim=0)[:, 0, 0]
+    order = order.reshape(2, -1).transpose(0, 1).reshape(-1)#.reshape(seq_len)
+    x = x[order]  # .reshape(2, -1).transpose(0, 1).reshape(-1).flip([0]).reshape(seq_len, 1, -1)
+    y = y[order]  # .reshape(2, -1).transpose(0, 1).reshape(-1).reshape(seq_len, 1, -1)
+    return x, y
+def randomize_classes(x, num_classes):
+    classes = torch.arange(0, num_classes, device=x.device)
+    random_classes = torch.randperm(num_classes, device=x.device).type(x.type())
+    x = ((x.unsqueeze(-1) == classes) * random_classes).sum(-1)
+    return x
+class CategoricalActivation(nn.Module):
+    """Activation that turns a random subset of hidden features into discrete class values."""
+    def __init__(self, categorical_p=0.1, ordered_p=0.7
+                 , keep_activation_size=False
+                 , num_classes_sampler=zipf_sampler_f(0.8, 1, 10)):
+        # categorical_p: probability that a (batch, hidden) feature is discretized.
+        # ordered_p: probability that a discretized feature is also passed to `randomize_classes`.
+        # keep_activation_size: rescale the output by the mean absolute input activation.
+        # num_classes_sampler: zero-argument callable returning the number of classes; the
+        # default sampler is created once, when the class body is evaluated.
+        self.categorical_p = categorical_p
+        self.ordered_p = ordered_p
+        self.keep_activation_size = keep_activation_size
+        self.num_classes_sampler = num_classes_sampler
+        super().__init__()
+    def forward(self, x):
+        # x shape: T, B, H
+        x = nn.Softsign()(x)
+        num_classes = self.num_classes_sampler()
+        # Mean absolute activation over dimension 0, used at the end to restore the scale.
+        hid_strength = torch.abs(x).mean(0).unsqueeze(0) if self.keep_activation_size else None
+        # Boolean mask over (B, H) selecting the features that become categorical.
+        categorical_classes = torch.rand((x.shape[1], x.shape[2])) < self.categorical_p
+        class_boundaries = torch.zeros((num_classes - 1, x.shape[1], x.shape[2]), device=x.device, dtype=x.dtype)
+        # Sample a different index for each hidden dimension, but shared for all batches
+        # NOTE(review): the loops below draw a separate index set for every (batch, hidden)
+        # pair, which does not match "shared for all batches" - confirm the intent.
+        for b in range(x.shape[1]):
+            for h in range(x.shape[2]):
+                ind = torch.randint(0, x.shape[0], (num_classes - 1,))
+                class_boundaries[:, b, h] = x[ind, b, h]
+        for b in range(x.shape[1]):
+            x_rel = x[:, b, categorical_classes[b]]
+            boundaries_rel = class_boundaries[:, b, categorical_classes[b]].unsqueeze(1)
+            # Class value = number of boundaries below the activation, shifted by num_classes / 2.
+            x[:, b, categorical_classes[b]] = (x_rel > boundaries_rel).sum(dim=0).float() - num_classes / 2
+        ordered_classes = torch.rand((x.shape[1],x.shape[2])) < self.ordered_p
+        # Only features that were made categorical above can be selected here.
+        ordered_classes = torch.logical_and(ordered_classes, categorical_classes)
+        x[:, ordered_classes] = randomize_classes(x[:, ordered_classes], num_classes)
+        x = x * hid_strength if self.keep_activation_size else x
+        return x

lcpfn/train.py ADDED Viewed

	@@ -0,0 +1,602 @@

+import os
+import itertools
+import argparse
+import time
+import datetime
+import yaml
+from contextlib import nullcontext
+import pickle
+import torch
+from torch import nn
+from lcpfn import utils
+from lcpfn.transformer import TransformerModel
+from lcpfn.bar_distribution import (
+    BarDistribution,
+    FullSupportBarDistribution,
+    get_bucket_limits,
+)
+from lcpfn.utils import (
+    get_cosine_schedule_with_warmup,
+    get_openai_lr,
+    StoreDictKeyPair,
+    get_weighted_single_eval_pos_sampler,
+    get_uniform_single_eval_pos_sampler,
+)
+from lcpfn import priors
+from lcpfn import encoders
+from lcpfn import positional_encodings
+from lcpfn.utils import init_dist
+from torch.cuda.amp import autocast, GradScaler
+class Losses:
+    """Namespace with ready-made loss criteria; all of them use `reduction="none"`."""
+    gaussian = nn.GaussianNLLLoss(full=True, reduction="none")
+    mse = nn.MSELoss(reduction="none")
+    # `ce` is a factory: call it with the number of classes to get a cross-entropy loss
+    # with uniform class weights.
+    ce = lambda num_classes: nn.CrossEntropyLoss(
+        reduction="none", weight=torch.ones(num_classes)
+    )
+    bce = nn.BCEWithLogitsLoss(reduction="none")
+    # Alias for the `BarDistribution` class itself (not an instance).
+    get_BarDistribution = BarDistribution
+def train(
+    priordataloader_class,
+    criterion,
+    encoder_generator,
+    emsize=200,
+    nhid=200,
+    nlayers=6,
+    nhead=2,
+    dropout=0.2,
+    epochs=10,
+    steps_per_epoch=100,
+    batch_size=200,
+    bptt=10,
+    lr=None,
+    weight_decay=0.0,
+    warmup_epochs=10,
+    input_normalization=False,
+    y_encoder_generator=None,
+    pos_encoder_generator=None,
+    decoder=None,
+    extra_prior_kwargs_dict={},
+    scheduler=get_cosine_schedule_with_warmup,
+    load_weights_from_this_state_dict=None,
+    validation_period=10,
+    single_eval_pos_gen=None,
+    bptt_extra_samples=None,
+    gpu_device="cuda:0",
+    aggregate_k_gradients=1,
+    verbose=True,
+    style_encoder_generator=None,
+    epoch_callback=None,
+    initializer=None,
+    initialize_with_model=None,
+    train_mixed_precision=False,
+    saving_period=10,
+    checkpoint_file=None,
+    load_optimizer_from_this_state_dict=None,
+    output_path=None,
+    **model_extra_args,
+):
+    device = gpu_device if torch.cuda.is_available() else "cpu:0"
+    print(f"Using {device} device")
+    using_dist, rank, device = init_dist(device)
+    single_eval_pos_gen = (
+        single_eval_pos_gen
+        if callable(single_eval_pos_gen)
+        else lambda: single_eval_pos_gen
+    )
+    def eval_pos_seq_len_sampler():
+        single_eval_pos = single_eval_pos_gen()
+        if bptt_extra_samples:
+            return single_eval_pos, single_eval_pos + bptt_extra_samples
+        else:
+            return single_eval_pos, bptt
+    dl = priordataloader_class(
+        num_steps=steps_per_epoch,
+        batch_size=batch_size,
+        eval_pos_seq_len_sampler=eval_pos_seq_len_sampler,
+        seq_len_maximum=bptt + (bptt_extra_samples if bptt_extra_samples else 0),
+        device=device,
+        **extra_prior_kwargs_dict,
+    )
+    encoder = encoder_generator(dl.num_features, emsize)
+    style_def = next(iter(dl))[0][
+        0
+    ]  # This is (style, x, y), target with x and y with batch size
+    print(f"Style definition: {style_def}")
+    style_encoder = (
+        style_encoder_generator(hyperparameter_definitions=style_def[0], em_size=emsize)
+        if (style_def is not None)
+        else None
+    )
+    if isinstance(criterion, nn.GaussianNLLLoss):
+        n_out = 2
+    elif (
+        isinstance(criterion, BarDistribution)
+        or "BarDistribution" in criterion.__class__.__name__
+    ):  # TODO remove this fix (only for dev)
+        n_out = criterion.num_bars
+    elif isinstance(criterion, nn.CrossEntropyLoss):
+        n_out = criterion.weight.shape[0]
+    else:
+        n_out = 1
+    model = TransformerModel(
+        encoder,
+        n_out,
+        emsize,
+        nhead,
+        nhid,
+        nlayers,
+        dropout,
+        style_encoder=style_encoder,
+        y_encoder=y_encoder_generator(1, emsize),
+        input_normalization=input_normalization,
+        pos_encoder=(
+            pos_encoder_generator or positional_encodings.NoPositionalEncoding
+        )(emsize, bptt * 2),
+        decoder=decoder,
+        init_method=initializer,
+        **model_extra_args,
+    )
+    model.criterion = criterion
+    if load_weights_from_this_state_dict is not None:
+        model.load_state_dict(load_weights_from_this_state_dict)
+    if initialize_with_model is not None:
+        model.init_from_small_model(initialize_with_model)
+    print(
+        f"Using a Transformer with {sum(p.numel() for p in model.parameters())/1000/1000:.{2}f} M parameters"
+    )
+    try:
+        for (k, v), (k2, v2) in zip(
+            model.state_dict().items(), initialize_with_model.state_dict().items()
+        ):
+            print(k, ((v - v2) / v).abs().mean(), v.shape)
+    except Exception:
+        pass
+    model.to(device)
+    if using_dist:
+        print("Distributed training")
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[rank], output_device=rank, broadcast_buffers=False
+        )
+    # learning rate
+    if lr is None:
+        lr = get_openai_lr(model)
+        print(f"Using OpenAI max lr of {lr}.")
+    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
+    scheduler = scheduler(
+        optimizer, warmup_epochs, epochs if epochs is not None else 100
+    )  # when training for fixed time lr schedule takes 100 steps
+    if load_optimizer_from_this_state_dict is not None:
+        optimizer.load_state_dict(load_optimizer_from_this_state_dict)
+    scaler = GradScaler() if train_mixed_precision else None
+    # check that everything uses up-to-date APIs
+    utils.check_compatibility(dl)
+    def train_epoch():
+        model.train()  # Turn on the train mode
+        total_loss = 0.0
+        total_positional_losses = 0.0
+        total_positional_losses_recorded = 0
+        before_get_batch = time.time()
+        assert (
+            len(dl) % aggregate_k_gradients == 0
+        ), "Please set the number of steps per epoch s.t. `aggregate_k_gradients` divides it."
+        for batch, (data, targets, single_eval_pos) in enumerate(dl):
+            if using_dist and not (
+                batch % aggregate_k_gradients == aggregate_k_gradients - 1
+            ):
+                cm = model.no_sync()
+            else:
+                cm = nullcontext()
+            with cm:
+                time_to_get_batch = time.time() - before_get_batch
+                before_forward = time.time()
+                with autocast(enabled=scaler is not None):
+                    # If style is set to None, it should not be transferred to device
+                    output = model(
+                        tuple(e.to(device) if torch.is_tensor(e) else e for e in data)
+                        if isinstance(data, tuple)
+                        else data.to(device),
+                        single_eval_pos=single_eval_pos,
+                    )
+                    forward_time = time.time() - before_forward
+                    if single_eval_pos is not None:
+                        targets = targets[single_eval_pos:]
+                    if isinstance(criterion, nn.GaussianNLLLoss):
+                        assert (
+                            output.shape[-1] == 2
+                        ), "need to write a little bit of code to handle multiple regression targets at once"
+                        mean_pred = output[..., 0]
+                        var_pred = output[..., 1].abs()
+                        losses = criterion(
+                            mean_pred.flatten(),
+                            targets.to(device).flatten(),
+                            var=var_pred.flatten(),
+                        )
+                    elif isinstance(criterion, (nn.MSELoss, nn.BCEWithLogitsLoss)):
+                        losses = criterion(
+                            output.flatten(), targets.to(device).flatten()
+                        )
+                    elif isinstance(criterion, nn.CrossEntropyLoss):
+                        losses = criterion(
+                            output.reshape(-1, n_out),
+                            targets.to(device).long().flatten(),
+                        )
+                    else:
+                        losses = criterion(output, targets)
+                    losses = losses.view(*output.shape[0:2])
+                    loss = losses.mean() / aggregate_k_gradients
+                if scaler:
+                    loss = scaler.scale(loss)
+                loss.backward()
+                if batch % aggregate_k_gradients == aggregate_k_gradients - 1:
+                    if scaler:
+                        scaler.unscale_(optimizer)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                    try:
+                        if scaler:
+                            scaler.step(optimizer)
+                            scaler.update()
+                        else:
+                            optimizer.step()
+                    except:
+                        print("Invalid optimization step encountered")
+                    optimizer.zero_grad()
+                step_time = time.time() - before_forward
+                if not torch.isnan(loss):
+                    total_loss += losses.mean().cpu().detach()
+                    total_positional_losses += (
+                        losses.mean(1).cpu().detach()
+                        if single_eval_pos is None
+                        else nn.functional.one_hot(torch.tensor(single_eval_pos), bptt)
+                        * losses[: bptt - single_eval_pos].mean().cpu().detach()
+                    )
+                    total_positional_losses_recorded += (
+                        torch.ones(bptt)
+                        if single_eval_pos is None
+                        else nn.functional.one_hot(torch.tensor(single_eval_pos), bptt)
+                    )
+            before_get_batch = time.time()
+        return (
+            total_loss / steps_per_epoch,
+            (total_positional_losses / total_positional_losses_recorded).tolist(),
+            time_to_get_batch,
+            forward_time,
+            step_time,
+        )
+    total_loss = float("inf")
+    total_positional_losses = float("inf")
+    list_losses = []
+    try:
+        for epoch in range(1, epochs + 1) if epochs is not None else itertools.count(1):
+            epoch_start_time = time.time()
+            (
+                total_loss,
+                total_positional_losses,
+                time_to_get_batch,
+                forward_time,
+                step_time,
+            ) = train_epoch()
+            list_losses.append(total_loss.item())
+            if hasattr(dl, "validate") and epoch % validation_period == 0:
+                with torch.no_grad():
+                    val_score = dl.validate(model)
+            else:
+                val_score = None
+            if epoch % saving_period == 0 and checkpoint_file is not None:
+                checkpoint = {
+                    "model_state_dict": model.state_dict(),
+                    "optimizer_state_dict": optimizer.state_dict(),
+                    "epoch": epoch,
+                }
+                torch.save(checkpoint, checkpoint_file)
+                full_model_path = checkpoint_file.split(".")[0] + "_full_model.pt"
+                torch.save(model, full_model_path)
+            if verbose:
+                print("-" * 89)
+                print(
+                    f"| end of epoch {epoch:3d} | time: {(time.time() - epoch_start_time):5.2f}s | mean loss {total_loss:5.2f} | "
+                    f"pos losses {','.join([f'{l:5.2f}' for l in total_positional_losses])}, lr {scheduler.get_last_lr()[0]}"
+                    f" data time {time_to_get_batch:5.2f} step time {step_time:5.2f}"
+                    f" forward time {forward_time:5.2f}"
+                    + (f"val score {val_score}" if val_score is not None else "")
+                )
+                print("-" * 89)
+            # stepping with wallclock time based scheduler
+            if epoch_callback is not None and rank == 0:
+                epoch_callback(model, epoch / epochs)
+            scheduler.step()
+    except KeyboardInterrupt:
+        pass
+    if rank == 0:  # trivially true for non-parallel training
+        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
+            model = model.module
+            dl = None
+        if output_path is not None:
+            torch.save(model.to("cpu"), output_path)
+            print("Checkpoint stored at ", output_path)
+        return total_loss, total_positional_losses, model.to("cpu"), dl
+def _parse_args(config_parser, parser):
+    # Do we have a config file to parse?
+    args_config, remaining = config_parser.parse_known_args()
+    if args_config.config:
+        with open(args_config.config, "r") as f:
+            cfg = yaml.safe_load(f)
+            parser.set_defaults(**cfg)
+    # The main arg parser parses the rest of the args, the usual
+    # defaults will have been overridden if config file specified.
+    args = parser.parse_args(remaining)
+    # Cache the args as a text string to save them in the output dir later
+    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
+    return args, args_text
+if __name__ == "__main__":
+    config_parser = argparse.ArgumentParser(
+        description="Only used as a first parser for the config file path."
+    )
+    config_parser.add_argument("--config")
+    parser = argparse.ArgumentParser()
+    parser.add_argument("prior")
+    parser.add_argument("--loss_function", default="barnll")
+    # Optional Arg's for `--loss_function barnll`
+    parser.add_argument(
+        "--min_y",
+        type=float,
+        help="barnll can only model y in strict ranges, this is the minimum y can take.",
+    )
+    parser.add_argument(
+        "--max_y",
+        type=float,
+        help="barnll can only model y in strict ranges, this is the maximum y can take.",
+    )
+    parser.add_argument("--num_buckets", default=100, type=int)
+    # parser.add_argument('--num_features', default=None, type=int, help='Specify depending on the prior.')
+    parser.add_argument(
+        "--extra_prior_kwargs_dict",
+        default={},
+        dest="extra_prior_kwargs_dict",
+        action=StoreDictKeyPair,
+        nargs="+",
+        metavar="KEY=VAL",
+        help="Specify depending on the prior.",
+    )
+    parser.add_argument(
+        "--encoder", default="linear", type=str, help="Specify depending on the prior."
+    )
+    parser.add_argument(
+        "--y_encoder",
+        default="linear",
+        type=str,
+        help="Specify depending on the prior. You should specify this if you do not fuse x and y.",
+    )
+    parser.add_argument(
+        "--pos_encoder",
+        default="none",
+        type=str,
+        help="Specify depending on the prior.",
+    )
+    parser.add_argument("--bptt", default=10, type=int)
+    parser.add_argument("--epochs", default=200, type=int)
+    parser.add_argument("--warmup_epochs", default=50, type=int)
+    parser.add_argument("--validation_period", default=10, type=int)
+    parser.add_argument(
+        "--permutation_invariant_max_eval_pos",
+        default=None,
+        type=int,
+        help="Set this to an int to ",
+    )
+    parser.add_argument(
+        "--permutation_invariant_sampling",
+        default="weighted",
+        help="Only relevant if --permutation_invariant_max_eval_pos is set.",
+    )
+    parser.add_argument("--train_mixed_precision", action="store_true")
+    # these can likely be mostly left at defaults
+    parser.add_argument(
+        "--emsize", default=512, type=int
+    )  # sometimes even larger is better e.g. 1024
+    parser.add_argument("--nlayers", default=6, type=int)
+    parser.add_argument("--nhid", default=None, type=int)  # 2*emsize is the default
+    parser.add_argument(
+        "--nhead", default=4, type=int
+    )  # nhead = emsize / 64 in the original paper
+    parser.add_argument("--dropout", default=0.0, type=float)
+    parser.add_argument("--steps_per_epoch", default=10, type=int)
+    parser.add_argument("--batch_size", default=1000, type=int)
+    parser.add_argument(
+        "--lr", "--learning_rate", default=0.001, type=float
+    )  # try also .0003, .0001, go lower with lower batch size
+    parser.add_argument("--gpu_device", default="cuda", type=str)
+    # for model checkpointing
+    parser.add_argument(
+        "--checkpoint_file",
+        help="absolute or relative-to-the-project-rootdir path to the file storing the state dicts.",
+        default=None,
+        type=str,
+    )
+    parser.add_argument("--saving_period", default=10, type=str)
+    args, _ = _parse_args(config_parser, parser)
+    if args.nhid is None:
+        args.nhid = 2 * args.emsize
+    prior = args.__dict__.pop("prior")
+    if prior == "gp":
+        prior = priors.fast_gp.DataLoader
+    elif prior == "ridge":
+        prior = priors.ridge.DataLoader
+    elif prior == "stroke":
+        prior = priors.stroke.DataLoader
+    elif prior == "mix_gp":
+        prior = priors.fast_gp_mix.DataLoader
+    else:
+        raise NotImplementedError(f"Prior == {prior}.")
+    loss_function = args.__dict__.pop("loss_function")
+    criterion = nn.GaussianNLLLoss(reduction="none", full=True)
+    classificiation_criterion = nn.CrossEntropyLoss(reduction="none")
+    num_buckets = args.__dict__.pop("num_buckets")
+    max_y = args.__dict__.pop("max_y")
+    min_y = args.__dict__.pop("min_y")
+    # criterion = nn.MSELoss(reduction='none')
+    device = args.gpu_device if torch.cuda.is_available() else "cpu:0"
+    def get_y_sample():
+        args.__dict__["extra_prior_kwargs_dict"]["eval_pos_seq_len_sampler"] = lambda: (
+            args.bptt,
+            args.bptt,
+        )
+        dl = prior(
+            num_steps=1,
+            batch_size=args.batch_size * args.steps_per_epoch,
+            seq_len=args.bptt,
+            device=device,
+            **args.extra_prior_kwargs_dict,
+        )
+        args.__dict__["extra_prior_kwargs_dict"].pop("eval_pos_seq_len_sampler")
+        y_sample = next(iter(dl))[-2]
+        print(
+            f"Creating Bar distribution with borders from y sample of size {y_sample.numel()}"
+        )
+        return y_sample
+    if loss_function == "ce":
+        criterion = nn.CrossEntropyLoss(reduction="none")
+    elif loss_function == "gaussnll":
+        criterion = nn.GaussianNLLLoss(reduction="none", full=True)
+    elif loss_function == "mse":
+        criterion = nn.MSELoss(reduction="none")
+    elif loss_function == "barnll":
+        criterion = BarDistribution(
+            borders=get_bucket_limits(num_buckets, full_range=(min_y, max_y))
+        )
+    elif loss_function == "adaptivebarnll":
+        borders = get_bucket_limits(
+            num_buckets, ys=get_y_sample(), full_range=(min_y, max_y)
+        )
+        criterion = BarDistribution(borders=borders)
+    elif loss_function == "adaptivefullsupportbarnll":
+        assert (
+            min_y is None and max_y is None
+        ), "Please do not specify `min_y` and `max_y` with `unboundedadaptivebarnll`."
+        borders = get_bucket_limits(num_buckets, ys=get_y_sample())
+        criterion = FullSupportBarDistribution(borders=borders)
+    else:
+        raise NotImplementedError(f"loss_function == {loss_function}.")
+    encoder = args.__dict__.pop("encoder")
+    y_encoder = args.__dict__.pop("y_encoder")
+    def get_encoder_generator(encoder):
+        if encoder == "linear":
+            encoder_generator = encoders.Linear
+        elif encoder == "mlp":
+            encoder_generator = encoders.MLP
+        elif encoder == "positional":
+            encoder_generator = encoders.Positional
+        else:
+            raise NotImplementedError(f"A {encoder} encoder is not valid.")
+        return encoder_generator
+    encoder_generator = get_encoder_generator(encoder)
+    y_encoder_generator = get_encoder_generator(y_encoder)
+    pos_encoder = args.__dict__.pop("pos_encoder")
+    if pos_encoder == "none":
+        pos_encoder_generator = None
+    elif pos_encoder == "sinus":
+        pos_encoder_generator = positional_encodings.PositionalEncoding
+    elif pos_encoder == "learned":
+        pos_encoder_generator = positional_encodings.LearnedPositionalEncoding
+    elif pos_encoder == "paired_scrambled_learned":
+        pos_encoder_generator = positional_encodings.PairedScrambledPositionalEncodings
+    else:
+        raise NotImplementedError(f"pos_encoer == {pos_encoder} is not valid.")
+    permutation_invariant_max_eval_pos = args.__dict__.pop(
+        "permutation_invariant_max_eval_pos"
+    )
+    permutation_invariant_sampling = args.__dict__.pop("permutation_invariant_sampling")
+    if permutation_invariant_max_eval_pos is not None:
+        if permutation_invariant_sampling == "weighted":
+            get_sampler = get_weighted_single_eval_pos_sampler
+        elif permutation_invariant_sampling == "uniform":
+            get_sampler = get_uniform_single_eval_pos_sampler
+        else:
+            raise ValueError()
+        args.__dict__["single_eval_pos_gen"] = get_sampler(
+            permutation_invariant_max_eval_pos
+        )
+    print("ARGS for `train`:", args.__dict__)
+    if args.__dict__["checkpoint_file"] is not None:
+        rootdir = os.path.dirname(os.path.realpath(__file__))
+        args.__dict__["checkpoint_file"] = os.path.join(
+            rootdir, args.__dict__["checkpoint_file"]
+        )
+        if os.path.exists(args.__dict__["checkpoint_file"]):
+            state_dicts = torch.load(args.__dict__["checkpoint_file"])
+            args.__dict__["load_weights_from_this_state_dict"] = state_dicts[
+                "model_state_dict"
+            ]
+            args.__dict__["load_optimizer_from_this_state_dict"] = state_dicts[
+                "optimizer_state_dict"
+            ]
+        else:
+            args.__dict__["load_weights_from_this_state_dict"] = None
+            args.__dict__["load_optimizer_from_this_state_dict"] = None
+    train(
+        prior,
+        criterion,
+        encoder_generator,
+        y_encoder_generator=y_encoder_generator,
+        pos_encoder_generator=pos_encoder_generator,
+        **args.__dict__,
+    )

lcpfn/train_lcpfn.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import math
+from torch import nn
+from lcpfn import bar_distribution, encoders, priors, train
+from lcpfn import utils
+def train_lcpfn(
+    get_batch_func,
+    seq_len: int = 100,
+    emsize: int = 512,
+    nlayers: int = 12,
+    num_borders: int = 1000,
+    lr: float = 0.001,
+    batch_size: int = 100,
+    epochs: int = 1000,
+):
+    """
+    Train a LCPFN model using the specified hyperparameters.
+    Args:
+        get_batch_func (callable): A function that returns a batch of learning curves.
+        seq_len (int, optional): The length of the input sequence. Defaults to 100.
+        emsize (int, optional): The size of the embedding layer. Defaults to 512.
+        nlayers (int, optional): The number of layers in the model. Defaults to 12.
+        num_borders_choices (int, optional): The number of borders to use. Defaults to 1000.
+        lr (float, optional): The learning rate for the optimizer. Defaults to 0.001.
+        batch_size (int, optional): The batch size for training. Defaults to 100.
+        epochs (int, optional): The number of epochs to train for. Defaults to 1000.
+    Returns:
+        torch.module: The trained model.
+    """
+    hps = {}
+    # PFN training hyperparameters
+    dataloader = priors.utils.get_batch_to_dataloader(get_batch_func)  # type: ignore
+    num_features = 1
+    ys = get_batch_func(
+        10_000,
+        seq_len,
+        num_features,
+        hyperparameters=hps,
+        single_eval_pos=seq_len,
+    )
+    bucket_limits = bar_distribution.get_bucket_limits(num_borders, ys=ys[2])
+    # Discretization of the predictive distributions
+    criterions = {
+        num_features: {
+            num_borders: bar_distribution.FullSupportBarDistribution(bucket_limits)
+        }
+    }
+    config = dict(
+        nlayers=nlayers,
+        priordataloader_class=dataloader,
+        criterion=criterions[num_features][num_borders],
+        encoder_generator=lambda in_dim, out_dim: nn.Sequential(
+            encoders.Normalize(0.0, 101.0),
+            encoders.Normalize(0.5, math.sqrt(1 / 12)),
+            encoders.Linear(in_dim, out_dim),
+        ),
+        emsize=emsize,
+        nhead=(emsize // 128),
+        warmup_epochs=(epochs // 4),
+        y_encoder_generator=encoders.get_normalized_uniform_encoder(encoders.Linear),
+        batch_size=batch_size,
+        scheduler=utils.get_cosine_schedule_with_warmup,
+        extra_prior_kwargs_dict={
+            # "num_workers": 10,
+            "num_features": num_features,
+            "hyperparameters": {
+                **hps,
+            },
+        },
+        epochs=epochs,
+        lr=lr,
+        bptt=seq_len,
+        single_eval_pos_gen=utils.get_uniform_single_eval_pos_sampler(seq_len, min_len=1),
+        aggregate_k_gradients=1,
+        nhid=(emsize * 2),
+        steps_per_epoch=100,
+        train_mixed_precision=False,
+    )
+    return train.train(**config)

lcpfn/transformer.py ADDED Viewed

	@@ -0,0 +1,226 @@

+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import Module, TransformerEncoder
+from lcpfn.layer import TransformerEncoderLayer, _get_activation_fn
+from lcpfn.utils import SeqBN, bool_mask_to_att_mask
+class TransformerModel(nn.Module):
+    def __init__(self, encoder, n_out, ninp, nhead, nhid, nlayers, dropout=0.0, style_encoder=None, y_encoder=None,
+                 pos_encoder=None, decoder=None, input_normalization=False, init_method=None, pre_norm=False,
+                 activation='gelu', recompute_attn=False, num_global_att_tokens=0, full_attention=False,
+                 all_layers_same_init=True):
+        super().__init__()
+        self.model_type = 'Transformer'
+        encoder_layer_creator = lambda: TransformerEncoderLayer(ninp, nhead, nhid, dropout, activation=activation,
+                                                                pre_norm=pre_norm, recompute_attn=recompute_attn)
+        self.transformer_encoder = TransformerEncoder(encoder_layer_creator(), nlayers)\
+            if all_layers_same_init else TransformerEncoderDiffInit(encoder_layer_creator, nlayers)
+        self.ninp = ninp
+        self.encoder = encoder
+        self.y_encoder = y_encoder
+        self.pos_encoder = pos_encoder
+        self.decoder = decoder(ninp, nhid, n_out) if decoder is not None else nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, n_out))
+        self.input_ln = SeqBN(ninp) if input_normalization else None
+        self.style_encoder = style_encoder
+        self.init_method = init_method
+        if num_global_att_tokens is not None:
+            assert not full_attention
+        self.global_att_embeddings = nn.Embedding(num_global_att_tokens, ninp) if num_global_att_tokens else None
+        self.full_attention = full_attention
+        self.n_out = n_out
+        self.nhid = nhid
+        self.init_weights()
+    @staticmethod
+    def generate_square_subsequent_mask(sz):
+        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
+        return bool_mask_to_att_mask(mask)
+    @staticmethod
+    def generate_D_q_matrix(sz, query_size):
+        train_size = sz-query_size
+        mask = torch.zeros(sz,sz) == 0
+        mask[:,train_size:].zero_()
+        mask |= torch.eye(sz) == 1
+        return bool_mask_to_att_mask(mask)
+    @staticmethod
+    def generate_global_att_query_matrix(num_global_att_tokens, seq_len, num_query_tokens):
+        train_size = seq_len + num_global_att_tokens - num_query_tokens
+        sz = seq_len + num_global_att_tokens
+        mask = torch.zeros(num_query_tokens, sz) == 0
+        mask[:,train_size:].zero_()
+        mask[:,train_size:] |= torch.eye(num_query_tokens) == 1
+        return bool_mask_to_att_mask(mask)
+    @staticmethod
+    def generate_global_att_trainset_matrix(num_global_att_tokens, seq_len, num_query_tokens):
+        train_size = seq_len + num_global_att_tokens - num_query_tokens
+        trainset_size = seq_len - num_query_tokens
+        mask = torch.zeros(trainset_size, num_global_att_tokens) == 0
+        #mask[:,num_global_att_tokens:].zero_()
+        #mask[:,num_global_att_tokens:] |= torch.eye(trainset_size) == 1
+        return bool_mask_to_att_mask(mask)
+    @staticmethod
+    def generate_global_att_globaltokens_matrix(num_global_att_tokens, seq_len, num_query_tokens):
+        mask = torch.zeros(num_global_att_tokens, num_global_att_tokens+seq_len-num_query_tokens) == 0
+        return bool_mask_to_att_mask(mask)
+    def init_weights(self):
+        initrange = 1.
+        # if isinstance(self.encoder,EmbeddingEncoder):
+        #    self.encoder.weight.data.uniform_(-initrange, initrange)
+        # self.decoder.bias.data.zero_()
+        # self.decoder.weight.data.uniform_(-initrange, initrange)
+        if self.init_method is not None:
+            self.apply(self.init_method)
+        for layer in self.transformer_encoder.layers:
+            nn.init.zeros_(layer.linear2.weight)
+            nn.init.zeros_(layer.linear2.bias)
+            attns = layer.self_attn if isinstance(layer.self_attn, nn.ModuleList) else [layer.self_attn]
+            for attn in attns:
+                nn.init.zeros_(attn.out_proj.weight)
+                nn.init.zeros_(attn.out_proj.bias)
+    def forward(self, src, src_mask=None, single_eval_pos=None):
+        assert isinstance(src, tuple), 'inputs (src) have to be given as (x,y) or (style,x,y) tuple'
+        if len(src) == 2: # (x,y) and no style
+            src = (None,) + src
+        style_src, style_src_size = (src[0], (0 if (src[0] is None) else 1))
+        if src_mask is not None: assert self.global_att_embeddings is None or isinstance(src_mask, tuple)
+        if src_mask is None:
+            x_src = src[1]
+            if self.global_att_embeddings is None:
+                full_len = len(x_src) + style_src_size
+                if self.full_attention:
+                    src_mask = bool_mask_to_att_mask(torch.ones((full_len, full_len), dtype=torch.bool)).to(x_src.device)
+                else:
+                    src_mask = self.generate_D_q_matrix(len(x_src) + style_src_size, len(x_src) + style_src_size -single_eval_pos).to(x_src.device)
+            else:
+                src_mask_args = (self.global_att_embeddings.num_embeddings,
+                                 len(x_src) + style_src_size,
+                                 len(x_src) + style_src_size - single_eval_pos)
+                src_mask = (self.generate_global_att_globaltokens_matrix(*src_mask_args).to(x_src.device),
+                            self.generate_global_att_trainset_matrix(*src_mask_args).to(x_src.device),
+                            self.generate_global_att_query_matrix(*src_mask_args).to(x_src.device))
+        style_src, x_src, y_src = src
+        x_src = self.encoder(x_src)
+        y_src = self.y_encoder(y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src)
+        style_src = self.style_encoder(style_src).unsqueeze(0) if self.style_encoder else torch.tensor([], device=x_src.device)
+        global_src = torch.tensor([], device=x_src.device) if self.global_att_embeddings is None else \
+            self.global_att_embeddings.weight.unsqueeze(1).repeat(1, x_src.shape[1], 1)
+        train_x = x_src[:single_eval_pos] + y_src[:single_eval_pos]
+        src = torch.cat([global_src, style_src, train_x, x_src[single_eval_pos:]], 0)
+        if self.input_ln is not None:
+            src = self.input_ln(src)
+        if self.pos_encoder is not None:
+            src = self.pos_encoder(src)
+        # If we have style input, drop its output
+        output = self.transformer_encoder(src, src_mask)[style_src_size:]
+        output = self.decoder(output)
+        return output[single_eval_pos+(self.global_att_embeddings.num_embeddings if self.global_att_embeddings else 0):]
+    @torch.no_grad()
+    def init_from_small_model(self, small_model):
+        assert isinstance(self.decoder, nn.Linear) and isinstance(self.encoder, (nn.Linear, nn.Sequential)) \
+               and isinstance(self.y_encoder, (nn.Linear, nn.Sequential))
+        def set_encoder_weights(my_encoder, small_model_encoder):
+            my_encoder_linear, small_encoder_linear = (my_encoder, small_model_encoder) \
+                if isinstance(my_encoder, nn.Linear) else (my_encoder[-1], small_model_encoder[-1])
+            small_in_dim = small_encoder_linear.out_features
+            my_encoder_linear.weight.zero_()
+            my_encoder_linear.bias.zero_()
+            my_encoder_linear.weight[:small_in_dim] = small_encoder_linear.weight
+            my_encoder_linear.bias[:small_in_dim] = small_encoder_linear.bias
+        set_encoder_weights(self.encoder, small_model.encoder)
+        set_encoder_weights(self.y_encoder, small_model.y_encoder)
+        small_in_dim = small_model.decoder.in_features
+        self.decoder.weight[:, :small_in_dim] = small_model.decoder.weight
+        self.decoder.bias = small_model.decoder.bias
+        for my_layer, small_layer in zip(self.transformer_encoder.layers, small_model.transformer_encoder.layers):
+            small_hid_dim = small_layer.linear1.out_features
+            my_in_dim = my_layer.linear1.in_features
+            # packed along q,k,v order in first dim
+            my_in_proj_w = my_layer.self_attn.in_proj_weight
+            small_in_proj_w = small_layer.self_attn.in_proj_weight
+            my_in_proj_w.view(3, my_in_dim, my_in_dim)[:, :small_in_dim, :small_in_dim] = small_in_proj_w.view(3,
+                                                                                                               small_in_dim,
+                                                                                                               small_in_dim)
+            my_layer.self_attn.in_proj_bias.view(3, my_in_dim)[:,
+            :small_in_dim] = small_layer.self_attn.in_proj_bias.view(3, small_in_dim)
+            my_layer.self_attn.out_proj.weight[:small_in_dim, :small_in_dim] = small_layer.self_attn.out_proj.weight
+            my_layer.self_attn.out_proj.bias[:small_in_dim] = small_layer.self_attn.out_proj.bias
+            my_layer.linear1.weight[:small_hid_dim, :small_in_dim] = small_layer.linear1.weight
+            my_layer.linear1.bias[:small_hid_dim] = small_layer.linear1.bias
+            my_layer.linear2.weight[:small_in_dim, :small_hid_dim] = small_layer.linear2.weight
+            my_layer.linear2.bias[:small_in_dim] = small_layer.linear2.bias
+            my_layer.norm1.weight[:small_in_dim] = math.sqrt(small_in_dim / my_in_dim) * small_layer.norm1.weight
+            my_layer.norm2.weight[:small_in_dim] = math.sqrt(small_in_dim / my_in_dim) * small_layer.norm2.weight
+            my_layer.norm1.bias[:small_in_dim] = small_layer.norm1.bias
+            my_layer.norm2.bias[:small_in_dim] = small_layer.norm2.bias
+class TransformerEncoderDiffInit(Module):
+    r"""TransformerEncoder is a stack of N encoder layers
+    Args:
+        encoder_layer_creator: a function generating objects of TransformerEncoderLayer class without args (required).
+        num_layers: the number of sub-encoder-layers in the encoder (required).
+        norm: the layer normalization component (optional).
+    """
+    __constants__ = ['norm']
+    def __init__(self, encoder_layer_creator, num_layers, norm=None):
+        super().__init__()
+        self.layers = nn.ModuleList([encoder_layer_creator() for _ in range(num_layers)])
+        self.num_layers = num_layers
+        self.norm = norm
+    def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
+        r"""Pass the input through the encoder layers in turn.
+        Args:
+            src: the sequence to the encoder (required).
+            mask: the mask for the src sequence (optional).
+            src_key_padding_mask: the mask for the src keys per batch (optional).
+        Shape:
+            see the docs in Transformer class.
+        """
+        output = src
+        for mod in self.layers:
+            output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
+        if self.norm is not None:
+            output = self.norm(output)
+        return output

lcpfn/utils.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import os
+import math
+import argparse
+import random
+import datetime
+import torch
+from torch import nn
+from torch.optim.lr_scheduler import LambdaLR
+import numpy as np
+# copied from huggingface
+def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases following the
+    values of the cosine function between 0 and `pi * cycles` after a warmup
+    period during which it increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+# copied from huggingface
+def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
+    """
+    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
+    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
+    Args:
+        optimizer (:class:`~torch.optim.Optimizer`):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (:obj:`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (:obj:`int`):
+            The total number of training steps.
+        last_epoch (:obj:`int`, `optional`, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    def lr_lambda(current_step: int):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        return max(
+            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
+        )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def get_openai_lr(transformer_model):
+    num_params = sum(p.numel() for p in transformer_model.parameters())
+    return 0.003239 - 0.0001395 * math.log(num_params)
+def get_weighted_single_eval_pos_sampler(max_len):
+    """
+    This gives a sampler that can be used for `single_eval_pos` which yields good performance for all positions p,
+    where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
+    :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
+    """
+    return lambda: random.choices(range(max_len), [1 / (max_len - i) for i in range(max_len)])[0]
+def get_uniform_single_eval_pos_sampler(max_len, min_len=0):
+    """
+    Just sample any evaluation position with the same weight
+    :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
+    """
+    return lambda: random.choices(range(min_len, max_len))[0]
+class SeqBN(nn.Module):
+    def __init__(self, d_model):
+        super().__init__()
+        self.bn = nn.BatchNorm1d(d_model)
+        self.d_model = d_model
+    def forward(self, x):
+        assert self.d_model == x.shape[-1]
+        flat_x = x.view(-1, self.d_model)
+        flat_x = self.bn(flat_x)
+        return flat_x.view(*x.shape)
+def set_locals_in_self(locals):
+    """
+    Call this function like `set_locals_in_self(locals())` to set all local variables as object variables.
+    Especially useful right at the beginning of `__init__`.
+    :param locals: `locals()`
+    """
+    self = locals['self']
+    for var_name, val in locals.items():
+        if var_name != 'self': setattr(self, var_name, val)
+# Default device string: 'cuda:0' when torch reports CUDA as available, else 'cpu:0'.
+default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'
+# Copied from StackOverflow, but we do an eval on the values additionally
+class StoreDictKeyPair(argparse.Action):
+    def __init__(self, option_strings, dest, nargs=None, **kwargs):
+        self._nargs = nargs
+        super(StoreDictKeyPair, self).__init__(option_strings, dest, nargs=nargs, **kwargs)
+    def __call__(self, parser, namespace, values, option_string=None):
+        my_dict = {}
+        for kv in values:
+            k, v = kv.split("=")
+            try:
+                my_dict[k] = eval(v)
+            except NameError:
+                my_dict[k] = v
+        setattr(namespace, self.dest, my_dict)
+        print("dict values: {}".format(my_dict))
+def get_nan_value(v, set_value_to_nan=0.0):
+    if random.random() < set_value_to_nan:
+        return v
+    else:
+        return random.choice([-999, 0, 1, 999])
+def to_ranking(data):
+    x = (data >= data.unsqueeze(-3))
+    x = x.sum(0)
+    return x
+# TODO: Is there a better way to do this?
+#   1. Comparing to unique elements: When all values are different we still get quadratic blowup
+#   2. Argsort(Argsort()) returns ranking, but with duplicate values there is an ordering which is problematic
+#   3. Argsort(Argsort(Unique))->Scatter seems a bit complicated, doesn't have quadratic blowup, but how fast?
+def to_ranking_low_mem(data):
+    x = torch.zeros_like(data)
+    for col in range(data.shape[-1]):
+        x_ = (data[:, :, col] >= data[:, :, col].unsqueeze(-2))
+        x_ = x_.sum(0)
+        x[:, :, col] = x_
+    return x
+def nan_handling_missing_for_unknown_reason_value(set_value_to_nan=0.0):
+    return get_nan_value(float('nan'), set_value_to_nan)
+def nan_handling_missing_for_no_reason_value(set_value_to_nan=0.0):
+    return get_nan_value(float('-inf'), set_value_to_nan)
+def nan_handling_missing_for_a_reason_value(set_value_to_nan=0.0):
+    return get_nan_value(float('inf'), set_value_to_nan)
+def torch_nanmean(x, axis=0):
+    num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(axis=axis)
+    value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
+    return value / num
+def torch_nanstd(x, axis=0):
+    num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(axis=axis)
+    value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
+    mean = value / num
+    mean_broadcast = torch.repeat_interleave(mean.unsqueeze(axis), x.shape[axis], dim=axis)
+    return torch.sqrt(torch.nansum(torch.square(mean_broadcast - x), axis=axis) / (num - 1))
+def normalize_data(data, normalize_positions=-1):
+    if normalize_positions > 0:
+        mean = torch_nanmean(data[:normalize_positions], axis=0)
+        std = torch_nanstd(data[:normalize_positions], axis=0) + .000001
+    else:
+        mean = torch_nanmean(data, axis=0)
+        std = torch_nanstd(data, axis=0) + .000001
+    data = (data - mean) / std
+    data = torch.clip(data, min=-100, max=100)
+    return data
+def remove_outliers(X, n_sigma=4):
+    # Expects T, B, H
+    assert len(X.shape) == 3, "X must be T,B,H"
+    #for b in range(X.shape[1]):
+        #for col in range(X.shape[2]):
+    data = X
+    data_mean, data_std = torch_nanmean(data, axis=0), torch_nanstd(data, axis=0)
+    cut_off = data_std * n_sigma
+    lower, upper = data_mean - cut_off, data_mean + cut_off
+    data_clean = X[:].clone()
+    data_clean[torch.logical_or(data > upper, data < lower)] = np.nan
+    data_mean, data_std = torch_nanmean(data_clean, axis=0), torch_nanstd(data_clean, axis=0)
+    cut_off = data_std * n_sigma
+    lower, upper = data_mean - cut_off, data_mean + cut_off
+    X = torch.maximum(-torch.log(1+torch.abs(X)) + lower, X)
+    X = torch.minimum(torch.log(1+torch.abs(X)) + upper, X)
+            # print(ds[1][data < lower, col], ds[1][data > upper, col], ds[1][~np.isnan(data), col].shape, data_mean, data_std)
+    return X
+def bool_mask_to_att_mask(mask):
+    return mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+def print_on_master_only(is_master):
+    import builtins as __builtin__
+    builtin_print = __builtin__.print
+    def print(*args, **kwargs):
+        force = kwargs.pop("force", False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+    __builtin__.print = print
+def init_dist(device):
+    """Initialize torch.distributed when a multi-process launch is detected.
+    Two launch modes are recognized from the environment: `LOCAL_RANK` set, or
+    `SLURM_PROCID` set together with more than one visible CUDA device. In both,
+    an NCCL process group is created with world size `torch.cuda.device_count()`
+    and `print` is restricted to rank 0. Otherwise nothing is initialized.
+    :param device: device string returned unchanged in the non-distributed case.
+    :return: tuple `(using_dist, rank, device)`.
+    """
+    print('init dist')
+    if 'LOCAL_RANK' in os.environ:
+        # launched with torch.distributed.launch
+        rank = int(os.environ["LOCAL_RANK"])
+        print('torch.distributed.launch and my rank is', rank)
+        torch.cuda.set_device(rank)
+        # NOTE(review): CUDA_VISIBLE_DEVICES is written after set_device has already
+        # been called - confirm this ordering is intended.
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
+        # Short 20 second timeout for process group operations.
+        torch.distributed.init_process_group(backend="nccl", init_method="env://", timeout=datetime.timedelta(seconds=20),
+                                             world_size=torch.cuda.device_count(), rank=rank)
+        torch.distributed.barrier()
+        print_on_master_only(rank == 0)
+        print(f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
+              "only I can print, but when using print(..., force=True) it will print on all ranks.")
+        return True, rank, f'cuda:{rank}'
+    elif 'SLURM_PROCID' in os.environ and torch.cuda.device_count() > 1:
+        # this is for multi gpu when starting with submitit
+        assert device != 'cpu:0'
+        rank = int(os.environ['SLURM_PROCID'])
+        # Rendezvous address is fixed to localhost:12355 for the env:// init method.
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = '12355'
+        torch.cuda.set_device(rank)
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
+        print('distributed submitit launch and my rank is', rank)
+        torch.distributed.init_process_group(backend="nccl", init_method="env://", timeout=datetime.timedelta(seconds=20),
+                                             world_size=torch.cuda.device_count(), rank=rank)
+        torch.distributed.barrier()
+        print_on_master_only(rank == 0)
+        print(f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
+              "only I can print, but when using print(..., force=True) it will print on all ranks.")
+        return True, rank, f'cuda:{rank}'
+    else:
+        print('Not using distributed')
+        # will not change any of the behavior of print, but allows putting the force=True in the print calls
+        print_on_master_only(True)
+        return False, 0, device
+def check_compatibility(dl):
+    if hasattr(dl, 'num_outputs'):
+        print('`num_outputs` for the DataLoader is deprecated. It is assumed to be 1 from now on.')
+        assert dl.num_outputs != 1, "We assume num_outputs to be 1. Instead of the num_ouputs change your loss." \
+                                    "We specify the number of classes in the CE loss."

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch==1.11.0
+numpy>=1.21.2
+# lcpfn @ git+https://github.com/automl/lcpfn.git