{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": { "name": "Generate MNIST UMAP", "provenance": [], "collapsed_sections": [] },
  "kernelspec": { "name": "python3", "display_name": "Python 3" },
  "language_info": { "name": "python" }
 },
 "cells": [
  {
   "cell_type": "code",
   "metadata": { "id": "woLPAhWA-_Cw" },
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "import tensorflow as tf\n",
    "# Turn off TF2 behaviour (v1 compatibility mode) and log errors only.\n",
    "tf.compat.v1.disable_v2_behavior()\n",
    "tf.get_logger().setLevel('ERROR')\n",
    "\n",
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install -q umap-learn\n",
    "import umap"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Download data\n",
    "\n",
    "MNIST privacy rankings from [Distribution Density, Tails, and Outliers in Machine Learning: Metrics and Applications](https://arxiv.org/abs/1910.13427).\n"
   ],
   "metadata": { "id": "y3QUm2GIadN8" }
  },
  {
   "cell_type": "code",
   "metadata": { "id": "r3Psd6pUgW4n" },
   "source": [
    "%%capture\n",
    "!curl -L https://github.com/tensorflow/privacy/releases/download/0.2.3/order.tgz -o order.tgz\n",
    "!tar zxvf order.tgz"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "02RTczc-jBgY" },
   "source": [
    "# Orderings for the train and test splits, read from data/\n",
    "# (presumably unpacked from order.tgz by the previous cell).\n",
    "mnist_priv_train = np.load('data/order_mnist_priv_train.npy')\n",
    "mnist_priv_test = np.load('data/order_mnist_priv_test.npy')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "EyGXBXKMjRzb" },
   "source": [
    "mnist_priv_train.shape"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "uAhuQBSmmu5m" },
   "source": [
    "# Raw MNIST arrays; x_train_orig is a second name for the same array, not a copy.\n",
    "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
    "x_train_orig = x_train"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "Ir56NuiFm-8J" },
   "source": [
    "x_train.shape"
   ],
   "execution_count": null,
   "outputs": []
  },
  { "cell_type": "markdown", "metadata": { "id": "Vvu69K3pfMTq" }, "source": [ "# 
The top and bottom \"3\" digits" ] },
  {
   "cell_type": "code",
   "metadata": { "id": "Ha4uIKzJbXYJ" },
   "source": [
    "# One row per training example, built column-wise rather than by appending\n",
    "# a dict per row in a Python loop.\n",
    "#   priv_order: argsort(mnist_priv_train)[i]\n",
    "#   y:          label of example i\n",
    "#   i:          position of the example in x_train / y_train\n",
    "# NOTE(review): priv_order is a per-example rank only if mnist_priv_train is a\n",
    "# permutation of the training indices - confirm against the data release.\n",
    "df = pd.DataFrame({\n",
    "    'priv_order': np.argsort(mnist_priv_train),\n",
    "    'y': y_train,\n",
    "    'i': np.arange(len(y_train)),\n",
    "})"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "2mPM4pMFeFBf" },
   "source": [
    "# The ten 3s with the smallest priv_order.\n",
    "top3df = df[df['y'] == 3].sort_values(['priv_order'], ascending=True).head(10)\n",
    "\n",
    "f, axarr = plt.subplots(1, 10)\n",
    "for ax, idx in zip(axarr, top3df['i'].to_list()):\n",
    "    ax.imshow(x_train[idx])"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "VASbYUVAcQcb" },
   "source": [
    "# The ten 3s with the largest priv_order.\n",
    "bot3df = df[df['y'] == 3].sort_values(['priv_order'], ascending=False).head(10)\n",
    "\n",
    "f, axarr = plt.subplots(1, 10)\n",
    "for ax, idx in zip(axarr, bot3df['i'].to_list()):\n",
    "    ax.imshow(x_train[idx])"
   ],
   "execution_count": null,
   "outputs": []
  },
  { "cell_type": "markdown", "metadata": { "id": "sV6HrHUiQRUZ" }, "source": [ "# UMAP\n", "\n", "Embeddings of each MNIST training digit, projected with UMAP. 
\n" ] },
  {
   "cell_type": "code",
   "source": [
    "# Load MNIST again under the names used for model training.\n",
    "train, test = tf.keras.datasets.mnist.load_data()\n",
    "train_data, train_labels = train\n",
    "test_data, test_labels = test\n",
    "\n",
    "# float32 pixels divided by 255.\n",
    "train_data = np.array(train_data, dtype=np.float32) / 255\n",
    "test_data = np.array(test_data, dtype=np.float32) / 255\n",
    "\n",
    "# Add a trailing channel axis: (n, 28, 28, 1), matching the model's input_shape.\n",
    "train_data = train_data.reshape(train_data.shape[0], 28, 28, 1)\n",
    "test_data = test_data.reshape(test_data.shape[0], 28, 28, 1)\n",
    "\n",
    "train_labels = np.array(train_labels, dtype=np.int32)\n",
    "test_labels = np.array(test_labels, dtype=np.int32)\n",
    "\n",
    "# One-hot encode the labels over 10 classes.\n",
    "train_labels = tf.keras.utils.to_categorical(train_labels, num_classes=10)\n",
    "test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=10)"
   ],
   "metadata": { "id": "xz4uRcNfZHax" },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "g8ZpJq600F60" },
   "source": [
    "# Classifier whose 'embedding_784' Dense layer is looked up by name below\n",
    "# to extract the per-image embeddings.\n",
    "model_784 = tf.keras.Sequential([\n",
    "    tf.keras.layers.Conv2D(16, 8,\n",
    "                           strides=2,\n",
    "                           padding='same',\n",
    "                           activation='relu',\n",
    "                           input_shape=(28, 28, 1)),\n",
    "    tf.keras.layers.MaxPool2D(2, 1),\n",
    "    tf.keras.layers.Conv2D(32, 4,\n",
    "                           strides=2,\n",
    "                           padding='valid',\n",
    "                           activation='relu'),\n",
    "    tf.keras.layers.MaxPool2D(2, 1),\n",
    "    tf.keras.layers.Flatten(),\n",
    "    tf.keras.layers.Dense(28*28, activation='relu', name='embedding_784'),\n",
    "    tf.keras.layers.Dense(32, activation='relu', name='embedding_32'),\n",
    "    tf.keras.layers.Dense(10, name='logit')\n",
    "])\n",
    "\n",
    "model_784.summary()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "QSdxK18VQdr6" },
   "source": [
    "# The last Dense layer has no activation, so the loss is computed from logits.\n",
    "loss = tf.keras.losses.CategoricalCrossentropy(\n",
    "    from_logits=True, reduction=tf.losses.Reduction.NONE)\n",
    "\n",
    "model_784.compile(loss=loss, optimizer=\"adam\", metrics=[\"accuracy\"])\n",
    "\n",
    "model_784.fit(\n",
    "    train_data,\n",
    "    train_labels,\n",
    "    validation_data=(test_data, test_labels),\n",
    "    epochs=10,\n",
    "    batch_size=250,\n",
    "    verbose=2,\n",
    ")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "cupnPf1MQz1E" },
   "source": [
    "embedding_layer_model_784 = tf.keras.Model(\n",
    "    inputs=model_784.input,\n",
    "    outputs=model_784.get_layer('embedding_784').output)\n",
    "\n",
    "# predict() evaluates the sub-model with the weights that fit() trained.\n",
    "# Opening a new Session and running global_variables_initializer() in it\n",
    "# would re-initialize every variable there, so the embeddings would come\n",
    "# from untrained weights.\n",
    "train_embeddings_784 = embedding_layer_model_784.predict(train_data)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "w0YOzJStQ2Uq" },
   "source": [
    "def umapDigit(digit=0, embeddings=None, digit_type='', slug='784_'):\n",
    "    \"\"\"Project one digit's embeddings to 2-D with UMAP, plot and save them.\n",
    "\n",
    "    Args:\n",
    "      digit: label value (df['y']) selecting which rows of df to project.\n",
    "      embeddings: array indexed along axis 0 by df['i']; when None, the\n",
    "        current global train_embeddings_784 is used.\n",
    "      digit_type: string placed in the output file name before the digit.\n",
    "      slug: string placed in the output file name before digit_type.\n",
    "\n",
    "    Side effects:\n",
    "      Shows a scatter plot coloured by df['priv_order'] and writes the UMAP\n",
    "      coordinates to umap-digits/umap_train_<slug><digit_type><digit>.npy,\n",
    "      creating the umap-digits/ directory if needed.\n",
    "    \"\"\"\n",
    "    import os\n",
    "\n",
    "    if embeddings is None:\n",
    "        # Looked up at call time so that re-running the embedding cell takes\n",
    "        # effect without redefining this function.\n",
    "        embeddings = train_embeddings_784\n",
    "\n",
    "    dfN = df[df['y'] == digit]\n",
    "    embeddingsN = embeddings.take(dfN['i'].to_list(), axis=0)\n",
    "\n",
    "    reducer = umap.UMAP(random_state=42, min_dist=.05, n_neighbors=8)\n",
    "    umap_xy = reducer.fit_transform(embeddingsN)\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(6, 6))\n",
    "    ax.scatter(umap_xy[:, 0], umap_xy[:, 1], c=dfN['priv_order'], cmap=\"Spectral\", s=3)\n",
    "    ax.set(xticks=[], yticks=[])\n",
    "    ax.set_title(\"MNIST \" + str(digit) + \" - UMAP\", fontsize=18)\n",
    "\n",
    "    plt.show()\n",
    "\n",
    "    rootdir = 'umap-digits/'\n",
    "    os.makedirs(rootdir, exist_ok=True)\n",
    "    outpath = rootdir + 'umap_train_' + slug + digit_type + str(digit) + '.npy'\n",
    "    # np.save writes binary data, so hand it the path instead of a file\n",
    "    # object opened in text mode ('w'), which fails on Python 3.\n",
    "    np.save(outpath, umap_xy)\n"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "3-3T7e_vQ2XD" },
   "source": [
    "# One UMAP projection (plot + .npy file) per digit class.\n",
    "for digit in range(10):\n",
    "    umapDigit(digit)"
   ],
   "execution_count": null,
   "outputs": []
  }
 ]
}