In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import os
import time

import torch

import numpy as np

import matplotlib.pyplot as plt

from model_builder import get_model, get_default_spec, save_model, load_model

from scripts.model_configs import *

# Setting params

In [6]:
# Notebook-wide settings: `device` is read as a global by the training and
# workflow functions; `base_path` is handed to save_model/load_model together
# with a relative checkpoint path.
device = 'cuda'
base_path = os.curdir

In [7]:
def train_function(config_sample, i, add_name='', maximum_runtime=None):
    """Train a model from ``config_sample`` and write time-based checkpoints.

    Training itself is delegated to ``get_model(..., should_train=True)``. The
    callback passed as ``epoch_callback`` saves a checkpoint (via ``save_model``)
    each time the elapsed wall-clock time has passed the next of
    ``N_epochs_to_save`` evenly spaced marks of the runtime budget.

    Reads the notebook globals ``device`` and ``base_path``.

    Args:
        config_sample: Configuration dict handed to ``get_model``. Its
            ``'epoch_in_training'`` entry is overwritten whenever a checkpoint
            is written, and the dict is stored alongside the checkpoint.
        i: Run index embedded in the checkpoint file name.
        add_name: Extra tag embedded in the checkpoint file name.
        maximum_runtime: Runtime budget. It is multiplied by 60 and compared
            with elapsed seconds, i.e. it is interpreted as minutes. When
            omitted, the module-level ``maximum_runtime`` is used (the original
            behaviour).

    Returns:
        None. The trained model is only available through the checkpoints.

    Raises:
        NameError: If no budget is given and no global ``maximum_runtime`` exists.
        ValueError: If the budget is not strictly positive.
    """
    # Resolve the budget up front: previously a missing global only surfaced as a
    # NameError inside the callback, i.e. after training had already started.
    if maximum_runtime is None:
        try:
            maximum_runtime = globals()['maximum_runtime']
        except KeyError:
            raise NameError(
                "maximum_runtime is not defined; pass maximum_runtime=<minutes> "
                "or define it before calling train_function"
            ) from None
    if maximum_runtime <= 0:
        raise ValueError(f'maximum_runtime must be positive, got {maximum_runtime!r}')

    start_time = time.time()
    N_epochs_to_save = 50
    # Seconds of wall-clock time between two consecutive checkpoints.
    checkpoint_interval = maximum_runtime * 60 / N_epochs_to_save

    def save_callback(model, epoch):
        # `last_saved_epoch` counts checkpoints written so far, not epochs.
        if not hasattr(model, 'last_saved_epoch'):
            model.last_saved_epoch = 0
        if (time.time() - start_time) / checkpoint_interval > model.last_saved_epoch:
            print('Saving model..')
            config_sample['epoch_in_training'] = epoch
            save_model(model, base_path, f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{model.last_saved_epoch}.cpkt',
                       config_sample)
            model.last_saved_epoch = model.last_saved_epoch + 1  # TODO: Rename to checkpoint

    get_model(config_sample,
              device,
              should_train=True,
              verbose=1,
              epoch_callback=save_callback)

    return

# Check synthetic data fitting

#### Workflow functions

In [8]:
def generate_test_data(test_gp_params):
    """Draw a single batch from an untrained model built from ``test_gp_params``.

    A copy of ``test_gp_params`` is used with ``'verbose'`` and
    ``'differentiable'`` forced to ``False``; the caller's mapping is left
    untouched. ``get_model`` is called with ``should_train=False`` and the first
    item yielded by element 3 of its return value is used as the batch
    (presumably that element is the prior's DataLoader -- confirm).

    Reads the notebook global ``device``.

    Returns:
        ``((hp_embedding, data, targets_), targets)`` where ``data``,
        ``targets_`` and ``targets`` have been moved to ``device`` and
        ``hp_embedding`` is passed through unchanged.
    """
    prior_config = dict(test_gp_params)
    prior_config.update(verbose=False, differentiable=False)
    #prior_config['bptt'] = prior_config['bptt_in_training']

    untrained = get_model(prior_config, device, should_train=False, verbose=True)
    batch_source = untrained[3]
    (hp_embedding, data, targets_), targets = next(iter(batch_source))

    inputs_on_device = (hp_embedding, data.to(device), targets_.to(device))
    return inputs_on_device, targets.to(device)

def evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size,
                      hparam_range=(-1.74, 1.74)):
    """Sweep one hyperparameter over a grid and record the BCE loss at each value.

    For every grid value, a copy of ``hparam_true`` with entry
    ``vary_hparam_ind`` replaced is repeated ``data.shape[1]`` times and fed,
    together with ``data`` and ``targets``, to ``model[2]``. The sigmoid of the
    output is scored with binary cross entropy against ``targets[eval_pos:]``.

    Reads the notebook globals ``device`` and ``diff_hparams_f``.

    Args:
        model: Indexable whose element 2 is called as
            ``model[2]((style, data, targets), single_eval_pos=eval_pos)``.
        hparam_true: Sequence of hyperparameter values in the representation the
            model consumes; it is not modified.
        vary_hparam_ind: Index of the entry of ``hparam_true`` that is swept.
        data: Input tensor; ``data.shape[1]`` is used as the repeat count of the
            hyperparameter vector (presumably the batch dimension -- confirm).
        targets: Target tensor; rows from ``eval_pos`` on are the loss targets.
        eval_pos: Position passed as ``single_eval_pos`` and used to slice targets.
        plot_step_size: Grid spacing of the sweep.
        hparam_range: ``(start, stop)`` of the sweep, passed to ``np.arange``
            (``stop`` is excluded). Defaults to the previously hard-coded
            ``(-1.74, 1.74)``.

    Returns:
        ``(losses, hparams)`` as NumPy arrays: one loss per grid point, and per
        grid point the hyperparameter vector after applying
        ``diff_hparams_f[i][1]`` to entry ``i``.
    """
    sweep_start, sweep_stop = hparam_range

    # Loop invariants: the loss module, the float view of the targets fed to the
    # model, and the flattened evaluation targets do not depend on the grid value.
    loss_fn = torch.nn.BCELoss()
    targets_float = targets.float()
    eval_targets = targets[eval_pos:].flatten()

    losses, hparams = [], []
    for value in np.arange(sweep_start, sweep_stop, plot_step_size):
        hparam = [*hparam_true]
        hparam[vary_hparam_ind] = value
        hp_embedding_used = torch.tensor(hparam).to(device).float()
        with torch.inference_mode():
            style = hp_embedding_used.repeat(data.shape[1], 1)
            outputs = torch.sigmoid(model[2]((style, data, targets_float), single_eval_pos=eval_pos)).squeeze(-1)

        loss = loss_fn(outputs.flatten(), eval_targets).detach().cpu()
        losses.append(loss)
        hparam_real = [diff_hparams_f[i][1](hp) for i, hp in enumerate(hparam)]
        hparams.append(hparam_real)

        print(loss, hparam_real, hparam, outputs.shape)
    return np.array(losses), np.array(hparams)

In [9]:
def differentiable_hparam_tuning_workflow(config_sample, hparam_label, batch_size=4, N_grad_steps=50, plot_step_size=0.1):
    """Recover one GP hyperparameter by gradient descent through the model.

    Steps:
      1. Generate a test batch from fixed GP parameters (lengthscale 1.0,
         noise 0.2, outputscale 1.0) via ``generate_test_data``.
      2. Sweep the selected hyperparameter with ``evaluate_hp_range`` to get a
         loss curve.
      3. Start from the true hyperparameter vector with the selected entry
         offset by 0.1 and run ``N_grad_steps`` Adam steps (lr 0.1) on the BCE
         loss; gradients of all other entries are masked to zero.

    Reads the notebook globals ``model``, ``eval_pos``, ``device``,
    ``diff_hparams_keys`` and ``diff_hparams_f``.

    Args:
        config_sample: Base configuration. It is no longer modified; the test
            parameters are merged into a copy.
        hparam_label: Iterable of ``(index, name)`` pairs. Only the first pair is
            processed (the function returns from inside the loop), so call once
            per hyperparameter.
        batch_size: Batch size written into the test-data configuration.
        N_grad_steps: Number of optimiser steps.
        plot_step_size: Grid spacing forwarded to ``evaluate_hp_range``.

    Returns:
        ``(hparams, losses, inferred_param, vary_hparam_ind, hparam_true)``:
        the sweep grid and losses from ``evaluate_hp_range``, the optimised
        value mapped through ``diff_hparams_f[index][1]``, the index that was
        varied, and the true hyperparameter vector after
        ``diff_hparams_f[i][0]``.

    Raises:
        ValueError: If ``hparam_label`` is empty.
    """
    test_gp_params = {
        "lengthscale": 1.0,
        #"lengthscale_mean": true_lengthscale,
        #"lengthscale_std": 0.5,
        "noise": 0.2,
        "outputscale": 1.0,
        'batch_size': batch_size,
    }
    # Merge into a copy: updating `config_sample` in place leaked the evaluation
    # batch size and GP parameters into the caller's (training) configuration.
    eval_config = {**config_sample, **test_gp_params}
    (_, data, _), targets = generate_test_data(eval_config)
    hparam_true = [diff_hparams_f[i][0](test_gp_params[hp]) for i, hp in enumerate(diff_hparams_keys)]
    #hparam_true = [test_gp_params[hp] for i, hp in enumerate(diff_hparams_keys)]

    # Invariant across optimisation steps, so built once.
    loss_fn = torch.nn.BCELoss()
    targets_float = targets.float()
    eval_targets = targets[eval_pos:].flatten()

    for vary_hparam_ind, vary_hparam_name in hparam_label:
        print(vary_hparam_name)
        to_real = diff_hparams_f[vary_hparam_ind][1]
        true_real = to_real(hparam_true[vary_hparam_ind])

        losses, hparams = evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size=plot_step_size)

        # TODO: Make only one parameter diffable
        hparam = torch.tensor([*hparam_true]).to(device).float()
        hparam[vary_hparam_ind] = hparam[vary_hparam_ind] + 0.1  #random.random() * 2 - 1
        hparam = torch.nn.Parameter(hparam, requires_grad=True)
        # 1 for the entry being tuned, 0 elsewhere; applied to the gradient so
        # the optimiser leaves the other hyperparameters at their true values.
        hparam_grad_mask = torch.zeros_like(hparam)
        hparam_grad_mask[vary_hparam_ind] = 1

        optimizer = torch.optim.Adam([hparam], lr=0.1)

        for _ in range(N_grad_steps):
            style = hparam.repeat(data.shape[1], 1)
            outputs = torch.sigmoid(model[2]((style, data, targets_float), single_eval_pos=eval_pos)).squeeze(-1)
            loss = loss_fn(outputs.flatten(), eval_targets)
            optimizer.zero_grad()
            loss.backward()
            with torch.no_grad():
                hparam.grad *= hparam_grad_mask
            optimizer.step()
            print('loss:', loss, 'hparams', to_real(hparam[vary_hparam_ind]), 'true', true_real)
        inferred_param = to_real(hparam[vary_hparam_ind].cpu().detach().numpy())
        return hparams, losses, inferred_param, vary_hparam_ind, hparam_true

    raise ValueError('hparam_label must contain at least one (index, name) pair')
 

#### Fitting a PFN with a hyperparameter-differentiable GP prior

In [10]:
# Values forwarded to the config helpers below; `eval_positions` is reused
# later when the pretrained checkpoint is loaded.
num_features, bptt, eval_positions = 5, 200, [100]

config_general = get_general_config(num_features, bptt, eval_positions)
config_flexible_categorical = get_flexible_categorical_config(num_features)

# Fixed GP hyperparameter values.
config_gp = dict(noise=0.2, lengthscale=1.0, outputscale=1.0)

# Uniform ranges for the hyperparameters that are treated as differentiable.
config_diff_gp = {
    'differentiable_hyperparameters': {
        hp_name: {'distribution': 'uniform', 'min': lower, 'max': upper}
        for hp_name, lower, upper in (
            ('outputscale', 0.0, 10.0),
            ('lengthscale', 0.0, 10.0),
            ('noise', 1e-7, 0.5),
        )
    }
}

# Later sources win on key clashes: general < flexible-categorical < diff-GP < GP.
config = dict(config_general)
config.update(config_flexible_categorical)
config.update(config_diff_gp)
config.update(config_gp)

# Notebook-specific overrides, applied in the same order as before.
config.update({
    'prior_type': 'gp',
    'differentiable': True,
    'flexible': True,
    'num_features': num_features,
    'num_features_used': num_features,
    'epochs': 500,
    'num_steps': 100,
    'verbose': False,
    'lr': 1e-5,
    'dropout': 0,
    'emsize': 512,
    'batch_size': 128,
    'aggregate_k_gradients': 1,
    'set_value_to_nan': 0.0,
    'output_multiclass_ordered_p': 1.0,
    'categorical_feature_p': 0.0,
    'nan_prob_a_reason': 0.0,
    'nan_prob_no_reason': 0.0,
    'nan_prob_unknown_reason': 0.0,
    'nlayers': 8,
})

# TODO: This should not be sampled, but be one config
# TODO: This uses old hyperparam sampler throws error
config_sample = evaluate_hypers(config)

In [11]:
# `device` is a notebook-wide global read inside train_function; the evaluation
# section further down rebinds it to 'cpu', so it is set again here to make this
# cell independent of execution order.
# NOTE(review): the recorded output below reports "Using cpu:0 device" despite
# this setting -- confirm which device training actually used.
device = 'cuda'
# Run index 0; `add_name` becomes part of every checkpoint file name.
train_function(config_sample, 0, add_name='gp_experiments_diff_with_noise_no_meta_new')

Using style prior: True
Using cpu:0 device
Not using distributed
DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 128, 'seq_len': 200, 'seq_len_maximum': 200, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 128, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 200, 'eval_positions': None, 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': 5, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.

#### Evaluating a PFN (with pretrained model)

In [13]:
# Evaluation uses the CPU. This rebinds the notebook-wide `device` global that
# the workflow functions read.
device = 'cpu'

# `model` is the loaded model tuple (its element 2 is called for forward passes
# below); `c` is the configuration returned with the checkpoint.
model, c = load_model(
    base_path,
    'models_diff/gp_ablation_model.cpkt',
    device,
    eval_positions,
    verbose=False,
)

Using style prior: True
Using cpu:0 device
Not using distributed
DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 1, 'seq_len': 10, 'seq_len_maximum': 10, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 1, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 10, 'eval_positions': [190], 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'output_multiclass_ordered_p': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'multiclass_type': 'rank', 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': . at 0x7f39ad8534

In [14]:
from priors.differentiable_prior import DifferentiableHyperparameterList
# Build the differentiable-hyperparameter description from the loaded
# checkpoint's config `c`.
# NOTE(review): 512 presumably mirrors config["emsize"] -- confirm.
diff_list = DifferentiableHyperparameterList(c['differentiable_hyperparameters'], 512, device)
# `diff_hparams_keys` is used as the list of hyperparameter names;
# `diff_hparams_f[i]` is indexed with [0] and [1] by the workflow functions:
# [0] is applied to the raw test parameter values, [1] to the values fed to the
# model before they are reported/plotted (presumably forward/inverse
# normalisation -- confirm).
diff_hparams_keys, diff_hparams_f = diff_list.get_hyperparameter_info()

In [None]:
# Call .eval() on element 2 of the loaded model tuple (the part used for the
# forward passes in the workflow functions).
model[2].eval()
eval_pos = 100

# One single-entry [(index, name)] list per hyperparameter to analyse.
#hparam_labels = [[(2, 'noise')]]
hparam_labels = [
    [(1, 'outputscale')],
    [(2, 'noise')],
    [(0, 'lengthscale')],
]

# Result stores, each keyed by hyperparameter name.
hparams, losses, inferred_param, vary_hparam_ind, hparam_true = {}, {}, {}, {}, {}

for hparam_label in hparam_labels:
    hparam_name = hparam_label[0][1]
    workflow_result = differentiable_hparam_tuning_workflow(
        config_sample,
        hparam_label=hparam_label,
        batch_size=256,
        N_grad_steps=50,
        plot_step_size=0.05,
    )
    (hparams[hparam_name], losses[hparam_name], inferred_param[hparam_name],
     vary_hparam_ind[hparam_name], hparam_true[hparam_name]) = workflow_result


In [None]:
# Loss curve of the sweep for one hyperparameter, with the value found by
# gradient descent and the true value marked as vertical lines.
label = 'lengthscale'

#import tikzplotlib

swept_index = vary_hparam_ind[label]
# Map the true value through diff_hparams_f[...][1], the same mapping applied to
# the swept grid and to the inferred value, so all three share one x-axis scale.
true = diff_hparams_f[swept_index][1](hparam_true[label][swept_index])

fig, ax = plt.subplots()
ax.plot(hparams[label][:, swept_index], losses[label])
ax.axvline(x=inferred_param[label], linestyle='solid', color='red', label='inferred')
ax.axvline(x=true, linestyle='dashed', label='true')

ax.set_ylabel('Cross entropy Loss')
ax.set_xlabel(label)
# Without a legend the two reference lines cannot be told apart by a reader.
ax.legend()

#tikzplotlib.save(f'diff_inferred_params_{label}.tex', axis_height='5.2cm', axis_width='5.2cm', strict=True)

plt.show()