{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "\n", "import torch\n", "\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from model_builder import get_model, get_default_spec, save_model, load_model\n", "\n", "from scripts.model_configs import *" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "# Setting params" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "device = 'cuda'\n", "base_path = os.path.join('.')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def train_function(config_sample, i, add_name=''):\n", " start_time = time.time()\n", " N_epochs_to_save = 50\n", " \n", " def save_callback(model, epoch):\n", " if not hasattr(model, 'last_saved_epoch'):\n", " model.last_saved_epoch = 0\n", " if ((time.time() - start_time) / (maximum_runtime * 60 / N_epochs_to_save)) > model.last_saved_epoch:\n", " print('Saving model..')\n", " config_sample['epoch_in_training'] = epoch\n", " save_model(model, base_path, f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{model.last_saved_epoch}.cpkt',\n", " config_sample)\n", " model.last_saved_epoch = model.last_saved_epoch + 1 # TODO: Rename to checkpoint\n", " \n", " model = get_model(config_sample\n", " , device\n", " , should_train=True\n", " , verbose=1\n", " , epoch_callback = save_callback)\n", " \n", " return" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "tags": [] }, "source": [ "# Check synthetic data fitting" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "#### Workflow functions" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "hidden": true, "tags": [] }, "outputs": [], "source": [ "def generate_test_data(test_gp_params):\n", " # Generate test data\n", " config = {**test_gp_params}\n", "\n", " config['verbose'] = False\n", " config['differentiable'] = False\n", " #config['bptt'] = config['bptt_in_training']\n", "\n", " model_test_data = get_model(config, device, should_train=False, verbose=True)\n", " (hp_embedding, data, targets_), targets = next(iter(model_test_data[3]))\n", " (hp_embedding, data, targets_), targets = (hp_embedding, data.to(device), targets_.to(device)), targets.to(device)\n", " \n", " return (hp_embedding, data, targets_), targets\n", "\n", "def evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size):\n", " losses, hparams = [], []\n", " for l in np.arange(-1.74, 1.74, plot_step_size):\n", " hparam = [*hparam_true]\n", " hparam[vary_hparam_ind] = l\n", " hp_embedding_used = torch.tensor(hparam).to(device).float()\n", " with torch.inference_mode():\n", " outputs = torch.sigmoid(model[2]((hp_embedding_used.repeat(data.shape[1], 1), data, targets.float()), single_eval_pos=eval_pos)).squeeze(-1)\n", " \n", " loss = torch.nn.BCELoss()(outputs.flatten(), targets[eval_pos:].flatten()).detach().cpu()\n", " losses += [loss]\n", " hparam_real = [diff_hparams_f[i][1](hp) for i, hp in enumerate(hparam)]\n", " hparams += [hparam_real]\n", " \n", " print(loss, hparam_real, hparam, outputs.shape)\n", " return np.array(losses), np.array(hparams)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def differentiable_hparam_tuning_workflow(config_sample, hparam_label, batch_size=4, N_grad_steps=50, plot_step_size=0.1):\n", " test_gp_params = {\n", " \"lengthscale\": 1.0,\n", " #\"lengthscale_mean\": true_lengthscale,\n", " #\"lengthscale_std\": 0.5,\n", " \"noise\": 0.2,\n", " \"outputscale\": 1.0,\n", " 'batch_size': batch_size\n", " }\n", " config_sample.update(test_gp_params)\n", " (hp_embedding, data, targets_), targets = generate_test_data(config_sample)\n", " hparam_true = [diff_hparams_f[i][0](test_gp_params[hp]) for i, hp in enumerate(diff_hparams_keys)]\n", " #hparam_true = [test_gp_params[hp] for i, hp in enumerate(diff_hparams_keys)]\n", "\n", " for vary_hparam_ind, vary_hparam_name in hparam_label:\n", " print(vary_hparam_name)\n", "\n", " losses, hparams = evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size=plot_step_size)\n", "\n", " # TODO: Make only one parameter diffable\n", " hparam = torch.tensor([*hparam_true]).to(device).float()\n", " hparam[vary_hparam_ind] = hparam[vary_hparam_ind] + 0.1 #random.random() * 2 - 1\n", " hparam = torch.nn.Parameter(hparam, requires_grad=True)\n", " hparam_grad_mask = torch.zeros_like(hparam)\n", " hparam_grad_mask[vary_hparam_ind] = 1\n", "\n", " optimizer = torch.optim.Adam([hparam], lr=0.1)\n", " \n", " for t in range(N_grad_steps):\n", " style = hparam.repeat(data.shape[1], 1)\n", " outputs = torch.sigmoid(model[2]((style, data, targets.float()), single_eval_pos=eval_pos)).squeeze(-1)\n", " loss = torch.nn.BCELoss()(outputs.flatten(), targets[eval_pos:].flatten())\n", " optimizer.zero_grad()\n", " loss.backward()\n", " with torch.no_grad():\n", " hparam.grad *= hparam_grad_mask\n", " optimizer.step()\n", " print('loss:', loss, 'hparams', diff_hparams_f[vary_hparam_ind][1](hparam[vary_hparam_ind]), 'true', diff_hparams_f[vary_hparam_ind][1](hparam_true[vary_hparam_ind]))\n", " inferred_param = diff_hparams_f[vary_hparam_ind][1](hparam[vary_hparam_ind].cpu().detach().numpy())\n", " return hparams, losses, inferred_param, vary_hparam_ind, hparam_true\n", " " ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "#### Fitting a PFN with HP-Diffable GP Prior" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "hidden": true, "tags": [] }, "outputs": [], "source": [ "num_features = 5\n", "bptt = 200\n", "eval_positions = [100]\n", "\n", "config_general = get_general_config(num_features, bptt, eval_positions)\n", "config_flexible_categorical = get_flexible_categorical_config(num_features)\n", "\n", "config_gp = {'noise': 0.2, \"lengthscale\": 1.0, \"outputscale\": 1.0}\n", "config_diff_gp = {'differentiable_hyperparameters': {\n", " 'outputscale': {'distribution': 'uniform', 'min': 0., 'max': 10.0},\n", " 'lengthscale': {'distribution': 'uniform', 'min': 0., 'max': 10.0},\n", " 'noise': {'distribution': 'uniform', 'min': 0.0000001, 'max': 0.5},\n", " }\n", "}\n", "\n", "config = {**config_general, **config_flexible_categorical, **config_diff_gp, **config_gp}\n", "\n", "config['prior_type'], config['differentiable'], config['flexible'] = 'gp', True, True\n", "config['num_features'], config['num_features_used'] = num_features, num_features\n", "config['epochs'], config['num_steps'], config['verbose'] = 500, 100, False\n", "config[\"lr\"] = 0.00001\n", "config[\"dropout\"] = 0\n", "config[\"emsize\"] = 512\n", "config[\"batch_size\"] = 128\n", "config[\"aggregate_k_gradients\"] = 1\n", "config['set_value_to_nan'] = 0.0\n", "config['output_multiclass_ordered_p'] = 1.0\n", "config['categorical_feature_p'] = 0.0\n", "config['nan_prob_a_reason'] = 0.0\n", "config['nan_prob_no_reason'] = 0.0\n", "config['nan_prob_unknown_reason'] = 0.0\n", "config[\"nlayers\"] = 8\n", "\n", "# TODO: This should not be sampled, but be one config\n", "# TODO: This uses old hyperparam sampler throws error\n", "config_sample = evaluate_hypers(config)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "hidden": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using style prior: True\n", "Using cpu:0 device\n", "Not using distributed\n", "DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 128, 'seq_len': 200, 'seq_len_maximum': 200, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 128, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 200, 'eval_positions': None, 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': 5, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': 0.2, 'lengthscale': 1.0, 'outputscale': 1.0, 'prior_type': 'gp', 'differentiable': True, 'flexible': True, 'aggregate_k_gradients': 1, 'output_multiclass_ordered_p': 1.0, 'recompute_attn': False}, 'num_outputs': 1, 'dynamic_batch_size': 2, 'get_batch': .make_get_batch.. at 0x7f39ad8dcf80>, 'differentiable_hyperparameters': {'outputscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'lengthscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': {'distribution': 'uniform', 'min': 1e-07, 'max': 0.5}}}, 'num_features': 5, 'num_outputs': 1}\n", "Using a Transformer with 17.35 M parameters\n" ] } ], "source": [ "device = 'cuda'\n", "train_function(config_sample, 0, add_name='gp_experiments_diff_with_noise_no_meta_new')" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "#### Evaluating a PFN (with pretrained model)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "hidden": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using style prior: True\n", "Using cpu:0 device\n", "Not using distributed\n", "DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 1, 'seq_len': 10, 'seq_len_maximum': 10, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 1, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 10, 'eval_positions': [190], 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'output_multiclass_ordered_p': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'multiclass_type': 'rank', 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': . at 0x7f39ad8534d0>, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': 0.03, 'lengthscale': 1.0, 'outputscale': 1.0, 'prior_type': 'gp', 'differentiable': True, 'flexible': True, 'aggregate_k_gradients': 1, 'recompute_attn': False, 'bptt_extra_samples': None, 'epoch_in_training': 0.998, 'categorical_features_sampler': . at 0x7f39ad853680>, 'num_features_used_in_training': 5, 'num_classes_in_training': 2, 'batch_size_in_training': 128, 'bptt_in_training': 200, 'bptt_extra_samples_in_training': None}, 'num_outputs': 1, 'dynamic_batch_size': 2, 'get_batch': .make_get_batch.. at 0x7f39ad81ab90>, 'differentiable_hyperparameters': {'outputscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'lengthscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': {'distribution': 'uniform', 'min': 1e-07, 'max': 0.5}}}, 'num_features': 5, 'num_outputs': 1}\n", "Using a Transformer with 17.35 M parameters\n" ] } ], "source": [ "device = 'cpu'\n", "model, c = load_model(base_path, f'models_diff/gp_ablation_model.cpkt', device, eval_positions, verbose=False)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from priors.differentiable_prior import DifferentiableHyperparameterList\n", "diff_list = DifferentiableHyperparameterList(c['differentiable_hyperparameters'], 512, device)\n", "diff_hparams_keys, diff_hparams_f = diff_list.get_hyperparameter_info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [ "model[2].eval()\n", "eval_pos = 100\n", "\n", "hparam_label = [(1, 'outputscale')]\n", "hparam_label = [(0, 'lengthscale')]\n", "hparam_label = [(2, 'noise')]\n", "hparam_labels = [[(1, 'outputscale')], [(2, 'noise')], [(0, 'lengthscale')]]\n", "#hparam_labels = [[(2, 'noise')]]\n", "\n", "hparams, losses, inferred_param, vary_hparam_ind, hparam_true = {}, {}, {}, {}, {}\n", "\n", "for hparam_label in hparam_labels:\n", " (hparams[hparam_label[0][1]], losses[hparam_label[0][1]], inferred_param[hparam_label[0][1]], vary_hparam_ind[hparam_label[0][1]], \n", " hparam_true[hparam_label[0][1]]) = differentiable_hparam_tuning_workflow(config_sample, \n", " hparam_label=hparam_label, \n", " batch_size=256, \n", " N_grad_steps=50,\n", " plot_step_size=0.05)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "label = 'lengthscale'\n", "\n", "#import tikzplotlib\n", "\n", "inferred = losses[label]\n", "\n", "plt.plot(hparams[label][:, vary_hparam_ind[label]], losses[label])\n", "true = diff_hparams_f[vary_hparam_ind[label]][1](hparam_true[label][vary_hparam_ind[label]])\n", "plt.axvline(x=inferred_param[label], linestyle='solid', color='red')\n", "plt.axvline(x=true, linestyle='dashed')\n", "\n", "plt.ylabel('Cross entropy Loss')\n", "plt.xlabel(label)\n", "\n", "#tikzplotlib.save(f'diff_inferred_params_{label}.tex', axis_height='5.2cm', axis_width='5.2cm', strict=True)\n", "\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" } }, "nbformat": 4, "nbformat_minor": 4 }