{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "MNIST DP - Rotated",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Zu3JJna0joDF"
      },
      "source": [
        "# Load packages"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hQDkXr219_Be"
      },
      "source": [
        "import tensorflow as tf\n",
        "\n",
        "tf.compat.v1.disable_v2_behavior()\n",
        "tf.get_logger().setLevel('ERROR')\n",
        "\n",
        "import json\n",
        "import math\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import gc"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kXGpiaEA-AOT"
      },
      "source": [
        "!pip install tensorflow_privacy\n",
        "\n",
        "from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy\n",
        "from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ljBUcuXujsun"
      },
      "source": [
        "# Load MNIST digits and rotate"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hhVshX5g-CIc"
      },
      "source": [
        "train, test = tf.keras.datasets.mnist.load_data()\n",
        "train_data, train_labels = train\n",
        "test_data, test_labels = test\n",
        "\n",
        "train_data = np.array(train_data, dtype=np.float32) / 255\n",
        "test_data = np.array(test_data, dtype=np.float32) / 255\n",
        "\n",
        "train_data = train_data.reshape(train_data.shape[0], 28, 28, 1)\n",
        "test_data = test_data.reshape(test_data.shape[0], 28, 28, 1)\n",
        "\n",
        "train_labels = np.array(train_labels, dtype=np.int32)\n",
        "test_labels = np.array(test_labels, dtype=np.int32)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "N4amBtp2F1cF"
      },
      "source": [
        "# Grab the 6000 train examples for each digit\n",
        "train_data_1, train_data_1_rot = np.split(train_data[np.where(train_labels == 1)][0:6000], 2)\n",
        "train_data_7, train_data_7_rot = np.split(train_data[np.where(train_labels == 7)][0:6000], 2)\n",
        "\n",
        "# Rotate half of them 90 degrees\n",
        "train_data_1_rot = np.rot90(train_data_1_rot, axes=(1,2))\n",
        "train_data_7_rot = np.rot90(train_data_7_rot, axes=(1,2))\n",
        "\n",
        "# Original and non-rotated test digits\n",
        "test_data_1 = test_data[np.where(test_labels == 1)][0:1000]\n",
        "test_data_7 = test_data[np.where(test_labels == 7)][0:1000]\n",
        "\n",
        "test_data_1_rot = np.rot90(test_data_1, axes=(1,2))\n",
        "test_data_7_rot = np.rot90(test_data_7, axes=(1,2))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ddIPSgEFF1ea"
      },
      "source": [
        "cat_labels_1 = np.repeat(np.array([[1, 0]], dtype=np.int32), 6000, axis=0)\n",
        "cat_labels_7 = np.repeat(np.array([[0, 1]], dtype=np.int32), 6000, axis=0)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ksPS1Nqbj8J0"
      },
      "source": [
        "# Generate grid of models to train"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VAgoY8CaIGlJ"
      },
      "source": [
        "!mkdir -p rotated_v2__1_72\n",
        "rootdir = 'rotated_v2__1_72/'\n",
        "batch_size = 200\n",
        "num_microbatches = 200"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7fA0llg8HvY_"
      },
      "source": [
        "model_grid = []\n",
        "\n",
        "dataset_sizes = [math.ceil(6000/d/batch_size)*batch_size for d in [1, 1.5, 2, 3, 4, 6, 8, 12, 16, 32]]\n",
        "aVals = [1/32, 1/16, 1/8, .25, .5, 1, 2, 4, 8, 16, 32]\n",
        "minority_percents = [0.0, .05, .10, .15, .20, .25, .30, .35, .40, .45, .50]\n",
        "run_numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 20, 21, 22]\n",
        "\n",
        "for run_number in run_numbers:\n",
        "  for dataset_size in dataset_sizes:\n",
        "    for aVal in aVals:\n",
        "      for minority_percent in minority_percents:\n",
        "        model_grid.append({\n",
        "          'run_number': run_number,\n",
        "          'dataset_size': dataset_size,\n",
        "          'aVal': aVal,\n",
        "          'minority_percent': minority_percent,\n",
        "          'l2_norm_clip': 1.5*aVal,\n",
        "          'noise_multiplier': 1.3/aVal,\n",
        "          'epochs': 4,\n",
        "        })\n",
        "\n",
        "for m in model_grid:\n",
        "  m['slug'] = 'grid__runnumber_' + str(m['run_number']) + 'datasetsize_' + str(m['dataset_size']) + '__minority_percent_' + str(m['minority_percent']) + '__l2_norm_clip_' + str(m['l2_norm_clip']) + '__noise_multiplier_' + str(m['noise_multiplier']) + '__epochs_' + str(m['epochs'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K4R3Qds7hjUs"
      },
      "source": [
        "dataset_sizes"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xwvNQEEkkCF5"
      },
      "source": [
        "# Train models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jihb5KnuHvqX"
      },
      "source": [
        "def calc_model(m):\n",
        "  path = rootdir + m['slug']\n",
        "\n",
        "  # skip models with existing test_predictions\n",
        "  try:\n",
        "    test_path = path + '___test_data_7_rot.npy'\n",
        "    print(test_path)\n",
        "    with open(test_path, 'r') as fh:\n",
        "      return\n",
        "  except Exception as e:\n",
        "    print('no cache, training')\n",
        "    print(m)\n",
        "\n",
        "  if batch_size % num_microbatches != 0:\n",
        "    raise ValueError('Batch size should be an integer multiple of the number of microbatches')\n",
        "\n",
        "  model = tf.keras.Sequential([\n",
        "    tf.keras.layers.Conv2D(16, 8,\n",
        "                           strides=2,\n",
        "                           padding='same',\n",
        "                           activation='relu',\n",
        "                           input_shape=(28, 28, 1)),\n",
        "    tf.keras.layers.MaxPool2D(2, 1),\n",
        "    tf.keras.layers.Conv2D(32, 4,\n",
        "                           strides=2,\n",
        "                           padding='valid',\n",
        "                           activation='relu'),\n",
        "    tf.keras.layers.MaxPool2D(2, 1),\n",
        "    tf.keras.layers.Flatten(),\n",
        "    tf.keras.layers.Dense(32, activation='relu'),\n",
        "    tf.keras.layers.Dense(2)\n",
        "  ])\n",
        "\n",
        "  optimizer = DPKerasSGDOptimizer(\n",
        "    l2_norm_clip=m['l2_norm_clip'],\n",
        "    noise_multiplier=m['noise_multiplier'],\n",
        "    num_microbatches=num_microbatches,\n",
        "    learning_rate=0.25)\n",
        "\n",
        "  loss = tf.keras.losses.CategoricalCrossentropy(\n",
        "    from_logits=True, reduction=tf.losses.Reduction.NONE)\n",
        "  \n",
        "  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])\n",
        "\n",
        "  p = m['minority_percent']\n",
        "  n = m['dataset_size']\n",
        "  n_straight = int(.5*n*(1 - p))\n",
        "  n_rot = int(.5*n*p)\n",
        "\n",
        "  m_train_data = np.concatenate((\n",
        "    train_data_1[0:n_straight],\n",
        "    train_data_1_rot[0:n_rot],\n",
        "    train_data_7[0:n_straight],\n",
        "    train_data_7_rot[0:n_rot],\n",
        "  ))\n",
        "\n",
        "  m_train_labels = np.concatenate((\n",
        "    cat_labels_1[0:int(.5*n)],\n",
        "    cat_labels_7[0:int(.5*n)],\n",
        "  ))\n",
        "  model.fit(\n",
        "    m_train_data, \n",
        "    m_train_labels,\n",
        "    epochs=m['epochs'],\n",
        "    batch_size=batch_size)\n",
        "  \n",
        "  model.save(path)\n",
        "\n",
        "  def run_inference(data, slug, target_index):\n",
        "    predictions = model.predict(data)\n",
        "    percents = tf.compat.v2.nn.softmax(predictions)\n",
        "    percents = percents.eval(session=tf.compat.v1.Session())\n",
        "    with open(path + '___' + slug + '.npy', 'w') as fh:\n",
        "      np.save(fh, percents)\n",
        "\n",
        "    count = 0 \n",
        "    for d in percents:\n",
        "      if (d[target_index] > .5):\n",
        "        count = count + 1\n",
        "    print(slug, count)\n",
        "\n",
        "\n",
        "  run_inference(test_data_1, 'test_data_1', 0)\n",
        "  run_inference(test_data_7, 'test_data_7', 1)\n",
        "\n",
        "  run_inference(test_data_1_rot, 'test_data_1_rot', 0)\n",
        "  run_inference(test_data_7_rot, 'test_data_7_rot', 1)\n",
        "\n",
        "  # Fixes OOM errors\n",
        "  del m_train_data  \n",
        "  del m_train_labels\n",
        "  del model\n",
        "  \n",
        "  gc.collect()\n",
        "  tf.keras.backend.clear_session()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8xQOLBpjhAWS"
      },
      "source": [
        "for m in model_grid:\n",
        "  calc_model(m)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tV4GiBLLkNo5"
      },
      "source": [
        "# Calculate metadata"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nWfeqTy3kbTO"
      },
      "source": [
        "# Calculate model accuracy\n",
        "for m in model_grid:\n",
        "  path = rootdir + m['slug']\n",
        "\n",
        "  def calc_percents(data, slug, target_index):\n",
        "    try:\n",
        "      with gfile.GFile(path + '___' + slug + '.npy', 'r') as fh:\n",
        "        percents = np.load(fh)\n",
        "    except Exception as e:\n",
        "      # print(e)\n",
        "      return print('missing ' + path + '___' + slug + '.npy')\n",
        "\n",
        "    count = 0 \n",
        "    for d in percents:\n",
        "      if (d[target_index] > .5):\n",
        "        count = count + 1\n",
        "    m['accuracy_' + slug] = count\n",
        "\n",
        "\n",
        "  calc_percents(test_data_1, 'test_data_1', 0)\n",
        "  calc_percents(test_data_7, 'test_data_7', 1)\n",
        "\n",
        "  calc_percents(test_data_1_rot, 'test_data_1_rot', 0)\n",
        "  calc_percents(test_data_7_rot, 'test_data_7_rot', 1)\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NEucNySYAaCe"
      },
      "source": [
        "from functools import lru_cache\n",
        "\n",
        "@lru_cache(maxsize=None)\n",
        "def computeModelPrivacy(dataset_size, noise_multiplier, epochs):\n",
        "  eps, delta = compute_dp_sgd_privacy.compute_dp_sgd_privacy(\n",
        "    n=dataset_size,\n",
        "    batch_size=batch_size,\n",
        "    noise_multiplier=noise_multiplier,\n",
        "    epochs=epochs,\n",
        "    delta=1e-5)\n",
        "  \n",
        "  return eps"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "l2LK-eoVTrS_"
      },
      "source": [
        "# Calculate epsilon for each model\n",
        "for m in model_grid:\n",
        "  m['epsilon'] = computeModelPrivacy(m['dataset_size'], m['noise_multiplier'], m['epochs'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G95W8RXsAaJR"
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "df = pd.DataFrame(model_grid).drop(['slug'], axis=1).to_csv(rootdir + 'model_grid_train_accuracy.csv', index=False)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%download_file rotated_v2__1_72/model_grid_train_accuracy.csv"
      ],
      "metadata": {
        "id": "tpysHyxbzSnu"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}