{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "MNIST DP - Model Grid", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "ddMfyZma6xFi" }, "source": [ "import tensorflow as tf\n", "\n", "tf.compat.v1.disable_v2_behavior()\n", "tf.get_logger().setLevel('ERROR')\n", "\n", "import json\n", "import math\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SmrgFLqK7W8D" }, "source": [ "!pip install tensorflow_privacy\n", "\n", "from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy\n", "from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "EFlKcATk7ajD" }, "source": [ "train, test = tf.keras.datasets.mnist.load_data()\n", "train_data, train_labels = train\n", "test_data, test_labels = test\n", "\n", "train_data = np.array(train_data, dtype=np.float32) / 255\n", "test_data = np.array(test_data, dtype=np.float32) / 255\n", "\n", "train_data = train_data.reshape(train_data.shape[0], 28, 28, 1)\n", "test_data = test_data.reshape(test_data.shape[0], 28, 28, 1)\n", "\n", "train_labels = np.array(train_labels, dtype=np.int32)\n", "test_labels = np.array(test_labels, dtype=np.int32)\n", "\n", "train_labels = tf.keras.utils.to_categorical(train_labels, num_classes=10)\n", "test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=10)\n", "\n", "assert train_data.min() == 0.\n", "assert train_data.max() == 1.\n", "assert test_data.min() == 0.\n", "assert test_data.max() == 1." 
], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pBq8XXp2eMhz" }, "source": [ "!mkdir -p model-grid\n", "rootdir = 'model-grid/'\n", "batch_size = 250" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5Zup2D8dDNSs" }, "source": [ "model_grid = []\n", "\n", "dataset_sizes = [math.ceil(60000/d/batch_size)*batch_size for d in [1, 2, 4, 8, 16, 32, 64]]\n", "aVals = [1/32, 1/16, 1/8, .25, .5, 1, 2, 4, 8, 16, 32]\n", "\n", "for dataset_size in dataset_sizes:\n", " for aVal in aVals:\n", " model_grid.append({\n", " 'dataset_size': dataset_size,\n", " 'aVal': aVal,\n", " 'l2_norm_clip': 1.5*aVal,\n", " 'noise_multiplier': 1.3/aVal,\n", " 'epochs': 4,\n", " })\n", "\n", "for m in model_grid:\n", " m['slug'] = 'grid__datasetsize_' + str(m['dataset_size']) + '__l2_norm_clip_' + str(m['l2_norm_clip']) + '__noise_multiplier_' + str(m['noise_multiplier']) + '__epochs_' + str(m['epochs'])" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "luNJq3H0KssP" }, "source": [ "def calc_model(m):\n", " path = rootdir + m['slug']\n", "\n", " ## skip models with existing test_predictions\n", " try:\n", " test_path = path + '___test_predictions.npy'\n", " print(test_path)\n", " with open(test_path, 'r') as fh:\n", " return\n", " except Exception as e:\n", " print('no cache, training')\n", " print(m)\n", "\n", " batch_size = 250\n", " learning_rate = 0.25\n", "\n", " if batch_size % num_microbatches != 0:\n", " raise ValueError('Batch size should be an integer multiple of the number of microbatches')\n", "\n", " model = tf.keras.Sequential([\n", " tf.keras.layers.Conv2D(16, 8,\n", " strides=2,\n", " padding='same',\n", " activation='relu',\n", " input_shape=(28, 28, 1)),\n", " tf.keras.layers.MaxPool2D(2, 1),\n", " tf.keras.layers.Conv2D(32, 4,\n", " strides=2,\n", " padding='valid',\n", " activation='relu'),\n", " tf.keras.layers.MaxPool2D(2, 1),\n", " tf.keras.layers.Flatten(),\n", " tf.keras.layers.Dense(32, activation='relu'),\n", " tf.keras.layers.Dense(10)\n", " ])\n", "\n", " optimizer = DPKerasSGDOptimizer(\n", " l2_norm_clip=m['l2_norm_clip'],\n", " noise_multiplier=m['noise_multiplier'],\n", " num_microbatches=num_microbatches,\n", " learning_rate=learning_rate)\n", "\n", " loss = tf.keras.losses.CategoricalCrossentropy(\n", " from_logits=True, reduction=tf.losses.Reduction.NONE)\n", " \n", " model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])\n", "\n", " model.fit(train_data[0: m['dataset_size']], train_labels[0: m['dataset_size']],\n", " epochs=m['epochs'],\n", " validation_data=(test_data, test_labels),\n", " batch_size=batch_size)\n", " \n", " predictions = model.predict(train_data)\n", " percents = tf.compat.v2.nn.softmax(predictions)\n", " percents = percents.eval(session=tf.compat.v1.Session())\n", " with open(path + '___train_predictions.npy', 'w') as fh:\n", " np.save(fh, percents)\n", "\n", " predictions = model.predict(test_data)\n", " percents = tf.compat.v2.nn.softmax(predictions)\n", " percents = percents.eval(session=tf.compat.v1.Session())\n", " with open(path + '___test_predictions.npy', 'w') as fh:\n", " np.save(fh, percents)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "XUhyFcExSCVs" }, "source": [ "# for m in model_grid:\n", "# calc_model(m)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "h5deGSGxJQXn" }, "source": [ "for m in model_grid:\n", " eps, delta = 
{ "cell_type": "code", "metadata": { "id": "h5deGSGxJQXn" }, "source": [ "for m in model_grid:\n", "  # compute_dp_sgd_privacy returns epsilon and the optimal RDP order\n", "  eps, opt_order = compute_dp_sgd_privacy.compute_dp_sgd_privacy(\n", "      n=m['dataset_size'],\n", "      batch_size=batch_size,\n", "      noise_multiplier=m['noise_multiplier'],\n", "      epochs=m['epochs'],\n", "      delta=1e-5)\n", "\n", "  m['epsilon'] = eps\n", "  with open(rootdir + m['slug'] + '___metadata.json', 'w') as fh:\n", "    json.dump(m, fh)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "zvFMboYi5As1" }, "source": [ "# Save overall test and train accuracy stats" ] }, { "cell_type": "code", "metadata": { "id": "ZF2KOcWPuz0e" }, "source": [ "for m in model_grid:\n", "  with open(rootdir + m['slug'] + '___test_predictions.npy', 'rb') as fh:\n", "    test_probs = np.load(fh)\n", "\n", "  test_predictions = np.argmax(test_probs, axis=1)\n", "  labels = np.argmax(test_labels, axis=1)\n", "  test_prediction_diff = test_predictions - labels\n", "\n", "  m['accuracy'] = 1 - np.count_nonzero(test_prediction_diff)/10000  # 10,000 test images\n", "  for i in range(10):\n", "    indices = np.where(labels == i)\n", "    accuracy = 1 - np.count_nonzero(test_prediction_diff[indices])/indices[0].size\n", "    m['accuracy_' + str(i)] = accuracy\n", "\n", "with open(rootdir + 'model_grid_test_accuracy.json', 'w') as fh:\n", "  json.dump(model_grid, fh)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mkBsiCrA6bXS" }, "source": [ "for m in model_grid:\n", "  with open(rootdir + m['slug'] + '___train_predictions.npy', 'rb') as fh:\n", "    train_probs = np.load(fh)\n", "\n", "  train_predictions = np.argmax(train_probs, axis=1)\n", "  labels = np.argmax(train_labels, axis=1)\n", "  train_prediction_diff = train_predictions - labels\n", "\n", "  m['accuracy'] = 1 - np.count_nonzero(train_prediction_diff)/60000  # 60,000 training images\n", "  for i in range(10):\n", "    indices = np.where(labels == i)\n", "    accuracy = 1 - np.count_nonzero(train_prediction_diff[indices])/indices[0].size\n", "    m['accuracy_' + str(i)] = accuracy\n", "\n", "with open(rootdir + 'model_grid_train_accuracy.json', 'w') as fh:\n", "  json.dump(model_grid, fh)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "_HxlZq005F7c" }, "source": [ "# Save true confidence labels" ] }, { "cell_type": "code", "metadata": { "id": "e9_MrWBt8qpc" }, "source": [ "import pandas as pd\n", "\n", "%load_ext google.colab.data_table" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "QM-k-krN6bbw" }, "source": [ "def saveTestLabels(dataset_size):\n", "  tidyTest = []\n", "  for i, label in enumerate(np.argmax(test_labels, axis=1)):\n", "    rv = {\n", "      'i': i,\n", "      'label': label,\n", "    }\n", "    tidyTest.append(rv)\n", "\n", "  for m in [m for m in model_grid if m['dataset_size'] == dataset_size]:\n", "    with open(rootdir + m['slug'] + '___test_predictions.npy', 'rb') as fh:\n", "      test_probs = np.load(fh)\n", "\n", "    # confidence each model assigns to the true label\n", "    for i, rv in enumerate(tidyTest):\n", "      rv['aVal_' + str(m['aVal'])] = test_probs[i][rv['label']]\n", "\n", "  df = pd.DataFrame(tidyTest)\n", "\n", "  with open(rootdir + 'grid_' + str(dataset_size) + 'trainpoints_test_labels.csv', 'w') as outfile:\n", "    df.to_csv(outfile)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "uODblIrH9HBM" }, "source": [ "for dataset_size in dataset_sizes:\n", "  saveTestLabels(dataset_size)" ], "execution_count": null, "outputs": [] },
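{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick usage check, as a sketch assuming the loop above has completed: read one per-dataset-size CSV back with pandas. Each row holds a test index, its true label, and one confidence column per `aVal` (60000 is the largest entry in `dataset_sizes`)." ] }, { "cell_type": "code", "metadata": {}, "source": [ "df = pd.read_csv(rootdir + 'grid_60000trainpoints_test_labels.csv')\n", "df.head()" ], "execution_count": null, "outputs": [] },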
pixels by digit" ] }, { "cell_type": "code", "metadata": { "id": "SM8JWfYZlqvk" }, "source": [ "train_data_orig, train_labels_orig = train" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "1Yym3eVge3U2" }, "source": [ "labels = np.argmax(train_labels, axis=1)\n", "\n", "f, axarr = plt.subplots(1, 8) \n", "for i in range(8):\n", " axarr[i].imshow(train_data_orig[labels == 2][i])" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mffTPetDdvsH" }, "source": [ "for i in range(10):\n", " indices = np.where(labels == i)\n", " with open(rootdir + 'mnist_train_raw_' + str(i) + '.npy', 'w') as fh:\n", " np.save(fh, train_data_orig[labels == i])" ], "execution_count": null, "outputs": [] } ] }