In [None]:
import tensorflow as tf

# Switch TensorFlow to its v1 behavior; calc_model below evaluates tensors
# through a tf.compat.v1.Session.
tf.compat.v1.disable_v2_behavior()
# Only show TensorFlow log messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')

import json
import math
import numpy as np
import matplotlib.pyplot as plt

In [None]:
!pip install tensorflow_privacy

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer

In [None]:
# Load MNIST as (images, labels) pairs for the train and test splits.
train, test = tf.keras.datasets.mnist.load_data()
(train_data, train_labels), (test_data, test_labels) = train, test

# Convert pixels to float32 and divide by 255, then add a trailing channel
# axis so every image has shape (28, 28, 1).
train_data = (train_data.astype(np.float32) / 255).reshape(-1, 28, 28, 1)
test_data = (test_data.astype(np.float32) / 255).reshape(-1, 28, 28, 1)

# One-hot encode the integer digit labels into 10 classes.
train_labels = tf.keras.utils.to_categorical(train_labels.astype(np.int32), num_classes=10)
test_labels = tf.keras.utils.to_categorical(test_labels.astype(np.int32), num_classes=10)

# Sanity check: after rescaling, both splits span exactly [0, 1].
for _images in (train_data, test_data):
    assert _images.min() == 0.
    assert _images.max() == 1.

In [None]:
!mkdir -p model-grid
# Directory prefix (with trailing slash) for every file this notebook writes:
# per-model predictions and metadata, accuracy summaries, CSVs and raw pixels.
rootdir = 'model-grid/'
# Batch size used to round the training-subset sizes and for the privacy
# accounting.
batch_size = 250

In [None]:
# Grid of training configurations: one entry per (training-set size, aVal).
model_grid = []

# Training-set sizes: 60000 divided by powers of two, each rounded up to a
# whole number of batches.
dataset_sizes = [batch_size * math.ceil(60000 / d / batch_size) for d in [1, 2, 4, 8, 16, 32, 64]]
aVals = [1/32, 1/16, 1/8, .25, .5, 1, 2, 4, 8, 16, 32]

for dataset_size in dataset_sizes:
    for aVal in aVals:
        # aVal scales the clipping norm up and the noise multiplier down by
        # the same factor.
        m = dict(
            dataset_size=dataset_size,
            aVal=aVal,
            l2_norm_clip=1.5*aVal,
            noise_multiplier=1.3/aVal,
            epochs=4,
        )
        # The slug is the filename stem shared by every file saved for this
        # configuration.
        m['slug'] = f"grid__datasetsize_{m['dataset_size']}__l2_norm_clip_{m['l2_norm_clip']}__noise_multiplier_{m['noise_multiplier']}__epochs_{m['epochs']}"
        model_grid.append(m)

In [None]:
def _softmax_probabilities(logits):
    """Turn a NumPy array of logits into a NumPy array of softmax probabilities.

    The notebook disables TF v2 behavior, so the softmax op is evaluated in a
    session; the context manager closes that session once the value is read.
    """
    probabilities = tf.compat.v2.nn.softmax(logits)
    with tf.compat.v1.Session() as session:
        return session.run(probabilities)


def calc_model(m):
    """Train one DP-SGD model from the grid and save its softmax predictions.

    Args:
      m: A model_grid entry. Reads 'slug', 'dataset_size', 'l2_norm_clip',
        'noise_multiplier' and 'epochs'.

    Side effects:
      Writes `<rootdir><slug>___train_predictions.npy` (predictions for all of
      train_data) and `<rootdir><slug>___test_predictions.npy`. If the test
      predictions file can already be opened, nothing is trained or written.

    Raises:
      ValueError: If the batch size is not a multiple of the number of
        microbatches.

    Uses the notebook-level `batch_size`, the same value used for the
    dataset sizes and the privacy accounting.
    """
    path = rootdir + m['slug']

    ## skip models with existing test_predictions
    test_path = path + '___test_predictions.npy'
    print(test_path)
    try:
        with open(test_path, 'rb'):
            return
    except OSError:
        print('no cache, training')
        print(m)

    learning_rate = 0.25
    # One microbatch per example, so gradients are clipped per example.
    num_microbatches = batch_size

    if batch_size % num_microbatches != 0:
        raise ValueError('Batch size should be an integer multiple of the number of microbatches')

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, 8,
                               strides=2,
                               padding='same',
                               activation='relu',
                               input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPool2D(2, 1),
        tf.keras.layers.Conv2D(32, 4,
                               strides=2,
                               padding='valid',
                               activation='relu'),
        tf.keras.layers.MaxPool2D(2, 1),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(10)
    ])

    optimizer = DPKerasSGDOptimizer(
        l2_norm_clip=m['l2_norm_clip'],
        noise_multiplier=m['noise_multiplier'],
        num_microbatches=num_microbatches,
        learning_rate=learning_rate)

    # Reduction.NONE keeps one loss value per example, which the DP optimizer
    # needs in order to form microbatches.
    loss = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True, reduction=tf.losses.Reduction.NONE)

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Train on the first `dataset_size` examples only.
    model.fit(train_data[0: m['dataset_size']], train_labels[0: m['dataset_size']],
              epochs=m['epochs'],
              validation_data=(test_data, test_labels),
              batch_size=batch_size)

    # np.save writes bytes, so it is given the path instead of a text-mode
    # file handle. Both paths already end in '.npy', so no suffix is appended.
    np.save(path + '___train_predictions.npy',
            _softmax_probabilities(model.predict(train_data)))

    # The test predictions are the cache marker checked above, so they are
    # written last.
    np.save(test_path, _softmax_probabilities(model.predict(test_data)))

In [None]:
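# Uncomment to train every model in the grid; calc_model skips any model whose
# test predictions are already saved.
# for m in model_grid:
#     calc_model(m)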

In [None]:
# Compute the privacy budget (epsilon at delta = 1e-5) of every configuration
# and save each configuration as JSON under the same slug as its predictions.
for m in model_grid:
    # The second return value is the optimal RDP order, not delta; it is unused.
    eps, _opt_order = compute_dp_sgd_privacy.compute_dp_sgd_privacy(
        n=m['dataset_size'],
        batch_size=batch_size,
        noise_multiplier=m['noise_multiplier'],
        epochs=m['epochs'],
        delta=1e-5)

    m['epsilon'] = eps
    with open(rootdir + m['slug'] + '___metadata.json', 'w') as fh:
        json.dump(m, fh)

# Save overall test and train accuracy stats

In [None]:
# True digit of every test image; test_labels is one-hot, so argmax recovers it.
labels = np.argmax(test_labels, axis=1)

for m in model_grid:
    # .npy is a binary format, so load by path rather than through a text-mode
    # file handle. Despite the name, these are the saved softmax outputs.
    test_logits = np.load(rootdir + m['slug'] + '___test_predictions.npy')

    test_predictions = np.argmax(test_logits, axis=1)
    # Non-zero wherever the predicted digit differs from the true digit.
    test_prediction_diff = test_predictions - labels

    # Overall accuracy, then accuracy restricted to each true digit.
    m['accuracy'] = 1 - np.count_nonzero(test_prediction_diff)/len(labels)
    for i in range(10):
        indices = np.where(labels == i)
        accuracy = 1 - np.count_nonzero(test_prediction_diff[indices])/indices[0].size
        m['accuracy_' + str(i)] = accuracy

with open(rootdir + 'model_grid_test_accuracy.json', 'w') as fh:
    json.dump(model_grid, fh)

In [None]:
# True digit of every training image; train_labels is one-hot, so argmax
# recovers it.
labels = np.argmax(train_labels, axis=1)

for m in model_grid:
    # .npy is a binary format, so load by path rather than through a text-mode
    # file handle. Despite the name, these are the saved softmax outputs, and
    # they cover all of train_data, not only the subset the model trained on.
    train_logits = np.load(rootdir + m['slug'] + '___train_predictions.npy')

    train_predictions = np.argmax(train_logits, axis=1)
    # Non-zero wherever the predicted digit differs from the true digit.
    train_prediction_diff = train_predictions - labels

    # These assignments replace any 'accuracy*' values already stored on the
    # model_grid entries.
    m['accuracy'] = 1 - np.count_nonzero(train_prediction_diff)/len(labels)
    for i in range(10):
        indices = np.where(labels == i)
        accuracy = 1 - np.count_nonzero(train_prediction_diff[indices])/indices[0].size
        m['accuracy_' + str(i)] = accuracy

with open(rootdir + 'model_grid_train_accuracy.json', 'w') as fh:
    json.dump(model_grid, fh)

# Save each model's confidence in the true label

In [None]:
import pandas as pd

%load_ext google.colab.data_table

In [None]:
def saveTestLables(dataset_size):
    """Save, for every test image, each model's confidence in the true label.

    Builds one row per test image with its index 'i' and true 'label', plus
    one 'aVal_<aVal>' column per model in model_grid trained with
    `dataset_size` examples. Each of those columns holds that model's saved
    prediction value for the true label. The table is written to
    `<rootdir>grid_<dataset_size>trainpoints_test_labels.csv`.

    Args:
      dataset_size: Training-set size selecting which models to include.
    """
    tidyTest = []
    for i, label in enumerate(np.argmax(test_labels, axis=1)):
        tidyTest.append({
            'i': i,
            'label': label,
        })

    for m in model_grid:
        if m['dataset_size'] != dataset_size:
            continue

        # .npy is a binary format, so load by path rather than through a
        # text-mode file handle.
        test_logits = np.load(rootdir + m['slug'] + '___test_predictions.npy')

        for i, rv in enumerate(tidyTest):
            rv['aVal_' + str(m['aVal'])] = test_logits[i][rv['label']]

    df = pd.DataFrame(tidyTest)

    with open(rootdir + 'grid_' + str(dataset_size) + 'trainpoints_test_labels.csv', 'w') as outfile:
        df.to_csv(outfile)

In [None]:
# Write one true-label confidence CSV per training-set size in the grid.
for dataset_size in dataset_sizes:
    saveTestLables(dataset_size)

# Save raw pixels by digit

In [None]:
# Unpack the MNIST training split exactly as load_data() returned it, i.e.
# before the rescaling, reshaping and one-hot encoding applied earlier.
train_data_orig, train_labels_orig = train

In [None]:
# True digit of every training image, recovered from the one-hot labels.
labels = np.argmax(train_labels, axis=1)

# Preview the first eight raw training images whose label is 2, side by side.
f, axarr = plt.subplots(1, 8)
twos = train_data_orig[labels == 2]
for i, ax in enumerate(axarr):
    ax.imshow(twos[i])

In [None]:
# Write the raw training images of each digit to their own .npy file.
for i in range(10):
    # np.save writes bytes, so it is given the path instead of a text-mode
    # file handle. The mask uses train_labels_orig, unpacked together with
    # train_data_orig, rather than the `labels` name that other cells rebind.
    np.save(rootdir + 'mnist_train_raw_' + str(i) + '.npy',
            train_data_orig[train_labels_orig == i])