{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "MNIST DP - Rotated", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "Zu3JJna0joDF" }, "source": [ "# Load packages" ] }, { "cell_type": "code", "metadata": { "id": "hQDkXr219_Be" }, "source": [ "import tensorflow as tf\n", "\n", "tf.compat.v1.disable_v2_behavior()\n", "tf.get_logger().setLevel('ERROR')\n", "\n", "import json\n", "import math\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import gc" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kXGpiaEA-AOT" }, "source": [ "!pip install tensorflow_privacy\n", "\n", "from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy\n", "from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ljBUcuXujsun" }, "source": [ "# Load MNIST digits and rotate" ] }, { "cell_type": "code", "metadata": { "id": "hhVshX5g-CIc" }, "source": [ "train, test = tf.keras.datasets.mnist.load_data()\n", "train_data, train_labels = train\n", "test_data, test_labels = test\n", "\n", "train_data = np.array(train_data, dtype=np.float32) / 255\n", "test_data = np.array(test_data, dtype=np.float32) / 255\n", "\n", "train_data = train_data.reshape(train_data.shape[0], 28, 28, 1)\n", "test_data = test_data.reshape(test_data.shape[0], 28, 28, 1)\n", "\n", "train_labels = np.array(train_labels, dtype=np.int32)\n", "test_labels = np.array(test_labels, dtype=np.int32)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "N4amBtp2F1cF" }, "source": [ "# Grab the 6000 train examples for each digit\n", "train_data_1, train_data_1_rot = np.split(train_data[np.where(train_labels == 1)][0:6000], 2)\n", "train_data_7, train_data_7_rot = np.split(train_data[np.where(train_labels == 7)][0:6000], 2)\n", "\n", "# Rotate half of them 90 degrees\n", "train_data_1_rot = np.rot90(train_data_1_rot, axes=(1,2))\n", "train_data_7_rot = np.rot90(train_data_7_rot, axes=(1,2))\n", "\n", "# Original and non-rotated test digits\n", "test_data_1 = test_data[np.where(test_labels == 1)][0:1000]\n", "test_data_7 = test_data[np.where(test_labels == 7)][0:1000]\n", "\n", "test_data_1_rot = np.rot90(test_data_1, axes=(1,2))\n", "test_data_7_rot = np.rot90(test_data_7, axes=(1,2))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ddIPSgEFF1ea" }, "source": [ "cat_labels_1 = np.repeat(np.array([[1, 0]], dtype=np.int32), 6000, axis=0)\n", "cat_labels_7 = np.repeat(np.array([[0, 1]], dtype=np.int32), 6000, axis=0)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ksPS1Nqbj8J0" }, "source": [ "# Generate grid of models to train" ] }, { "cell_type": "code", "metadata": { "id": "VAgoY8CaIGlJ" }, "source": [ "!mkdir -p rotated_v2__1_72\n", "rootdir = 'rotated_v2__1_72/'\n", "batch_size = 200\n", "num_microbatches = 200" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7fA0llg8HvY_" }, "source": [ "model_grid = []\n", "\n", "dataset_sizes = [math.ceil(6000/d/batch_size)*batch_size for d in [1, 1.5, 2, 3, 4, 6, 8, 12, 16, 32]]\n", "aVals = [1/32, 1/16, 1/8, .25, .5, 1, 2, 4, 8, 16, 32]\n", "minority_percents = [0.0, .05, .10, .15, .20, .25, .30, .35, .40, .45, .50]\n", "run_numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 20, 21, 22]\n", "\n", "for run_number in run_numbers:\n", " for dataset_size in dataset_sizes:\n", " for aVal in aVals:\n", " for minority_percent in minority_percents:\n", " model_grid.append({\n", " 'run_number': run_number,\n", " 'dataset_size': dataset_size,\n", " 'aVal': aVal,\n", " 'minority_percent': minority_percent,\n", " 'l2_norm_clip': 1.5*aVal,\n", " 'noise_multiplier': 1.3/aVal,\n", " 'epochs': 4,\n", " })\n", "\n", "for m in model_grid:\n", " m['slug'] = 'grid__runnumber_' + str(m['run_number']) + 'datasetsize_' + str(m['dataset_size']) + '__minority_percent_' + str(m['minority_percent']) + '__l2_norm_clip_' + str(m['l2_norm_clip']) + '__noise_multiplier_' + str(m['noise_multiplier']) + '__epochs_' + str(m['epochs'])" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "K4R3Qds7hjUs" }, "source": [ "dataset_sizes" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "xwvNQEEkkCF5" }, "source": [ "# Train models" ] }, { "cell_type": "code", "metadata": { "id": "jihb5KnuHvqX" }, "source": [ "def calc_model(m):\n", " path = rootdir + m['slug']\n", "\n", " # skip models with existing test_predictions\n", " try:\n", " test_path = path + '___test_data_7_rot.npy'\n", " print(test_path)\n", " with open(test_path, 'r') as fh:\n", " return\n", " except Exception as e:\n", " print('no cache, training')\n", " print(m)\n", "\n", " if batch_size % num_microbatches != 0:\n", " raise ValueError('Batch size should be an integer multiple of the number of microbatches')\n", "\n", " model = tf.keras.Sequential([\n", " tf.keras.layers.Conv2D(16, 8,\n", " strides=2,\n", " padding='same',\n", " activation='relu',\n", " input_shape=(28, 28, 1)),\n", " tf.keras.layers.MaxPool2D(2, 1),\n", " tf.keras.layers.Conv2D(32, 4,\n", " strides=2,\n", " padding='valid',\n", " activation='relu'),\n", " tf.keras.layers.MaxPool2D(2, 1),\n", " tf.keras.layers.Flatten(),\n", " tf.keras.layers.Dense(32, activation='relu'),\n", " tf.keras.layers.Dense(2)\n", " ])\n", "\n", " optimizer = DPKerasSGDOptimizer(\n", " l2_norm_clip=m['l2_norm_clip'],\n", " noise_multiplier=m['noise_multiplier'],\n", " num_microbatches=num_microbatches,\n", " learning_rate=0.25)\n", "\n", " loss = tf.keras.losses.CategoricalCrossentropy(\n", " from_logits=True, reduction=tf.losses.Reduction.NONE)\n", " \n", " model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])\n", "\n", " p = m['minority_percent']\n", " n = m['dataset_size']\n", " n_straight = int(.5*n*(1 - p))\n", " n_rot = int(.5*n*p)\n", "\n", " m_train_data = np.concatenate((\n", " train_data_1[0:n_straight],\n", " train_data_1_rot[0:n_rot],\n", " train_data_7[0:n_straight],\n", " train_data_7_rot[0:n_rot],\n", " ))\n", "\n", " m_train_labels = np.concatenate((\n", " cat_labels_1[0:int(.5*n)],\n", " cat_labels_7[0:int(.5*n)],\n", " ))\n", " model.fit(\n", " m_train_data, \n", " m_train_labels,\n", " epochs=m['epochs'],\n", " batch_size=batch_size)\n", " \n", " model.save(path)\n", "\n", " def run_inference(data, slug, target_index):\n", " predictions = model.predict(data)\n", " percents = tf.compat.v2.nn.softmax(predictions)\n", " percents = percents.eval(session=tf.compat.v1.Session())\n", " with open(path + '___' + slug + '.npy', 'w') as fh:\n", " np.save(fh, percents)\n", "\n", " count = 0 \n", " for d in percents:\n", " if (d[target_index] > .5):\n", " count = count + 1\n", " print(slug, count)\n", "\n", "\n", " run_inference(test_data_1, 'test_data_1', 0)\n", " run_inference(test_data_7, 'test_data_7', 1)\n", "\n", " run_inference(test_data_1_rot, 'test_data_1_rot', 0)\n", " run_inference(test_data_7_rot, 'test_data_7_rot', 1)\n", "\n", " # Fixes OOM errors\n", " del m_train_data \n", " del m_train_labels\n", " del model\n", " \n", " gc.collect()\n", " tf.keras.backend.clear_session()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8xQOLBpjhAWS" }, "source": [ "for m in model_grid:\n", " calc_model(m)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "tV4GiBLLkNo5" }, "source": [ "# Calculate metadata" ] }, { "cell_type": "code", "metadata": { "id": "nWfeqTy3kbTO" }, "source": [ "# Calculate model accuracy\n", "for m in model_grid:\n", " path = rootdir + m['slug']\n", "\n", " def calc_percents(data, slug, target_index):\n", " try:\n", " with gfile.GFile(path + '___' + slug + '.npy', 'r') as fh:\n", " percents = np.load(fh)\n", " except Exception as e:\n", " # print(e)\n", " return print('missing ' + path + '___' + slug + '.npy')\n", "\n", " count = 0 \n", " for d in percents:\n", " if (d[target_index] > .5):\n", " count = count + 1\n", " m['accuracy_' + slug] = count\n", "\n", "\n", " calc_percents(test_data_1, 'test_data_1', 0)\n", " calc_percents(test_data_7, 'test_data_7', 1)\n", "\n", " calc_percents(test_data_1_rot, 'test_data_1_rot', 0)\n", " calc_percents(test_data_7_rot, 'test_data_7_rot', 1)\n", "\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "NEucNySYAaCe" }, "source": [ "from functools import lru_cache\n", "\n", "@lru_cache(maxsize=None)\n", "def computeModelPrivacy(dataset_size, noise_multiplier, epochs):\n", " eps, delta = compute_dp_sgd_privacy.compute_dp_sgd_privacy(\n", " n=dataset_size,\n", " batch_size=batch_size,\n", " noise_multiplier=noise_multiplier,\n", " epochs=epochs,\n", " delta=1e-5)\n", " \n", " return eps" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "l2LK-eoVTrS_" }, "source": [ "# Calculate epsilon for each model\n", "for m in model_grid:\n", " m['epsilon'] = computeModelPrivacy(m['dataset_size'], m['noise_multiplier'], m['epochs'])" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "G95W8RXsAaJR" }, "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame(model_grid).drop(['slug'], axis=1).to_csv(rootdir + 'model_grid_train_accuracy.csv', index=False)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "%download_file rotated_v2__1_72/model_grid_train_accuracy.csv" ], "metadata": { "id": "tpysHyxbzSnu" }, "execution_count": null, "outputs": [] } ] }