Initial commit

Files changed:
- LICENSE          +21   -0
- README.md        +42   -0
- cnn.py           +219  -0
- cnn_ode.py       +256  -0
- jax_cnn_ode.py   +0    -0
- main.py          +23   -0
- mlp.py           +103  -0
- ode.py           +256  -0
- opts.py          +0    -0
- train.py         +130  -0
- train_cnf.py     +274  -0
- train_ode.py     +271  -0
- train_resnet.py  +195  -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Seung-woo Eric Seo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,42 @@
# Neural ODE with Flax

This is the result of the project ["Reproduce Neural ODE and SDE"][projectlink] from the [HuggingFace Flax/JAX community week][comweeklink].

<code>main.py</code> trains a ResNet or an OdeNet on the MNIST dataset.

[projectlink]: https://discuss.huggingface.co/t/reproduce-neural-ode-and-neural-sde/7590

[comweeklink]: https://github.com/huggingface/transformers/tree/master/examples/research_projects/jax-projects#projects

## Dependency

### JAX and Flax

For JAX installation, please follow the instructions [here][jaxinstalllink],

or simply run
```bash
pip install jax jaxlib
```

For Flax installation,
```bash
pip install flax
```

[jaxinstalllink]: https://github.com/google/jax#installation


TensorFlow Datasets will download the MNIST dataset into the environment automatically.

## How to run training

For (small) ResNet training,
```bash
python main.py --model=resnet --lr=1e-4 --n_epoch=20 --batch_size=64
```

For Neural ODE training,
```bash
python main.py --model=odenet --lr=1e-4 --n_epoch=20 --batch_size=64
```
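
To verify the JAX install above before training, a quick sanity check (standard JAX calls, nothing repo-specific):

```python
# Quick check that jax/jaxlib are installed and a backend is visible.
import jax

print(jax.devices())                      # lists available CPU/GPU/TPU devices
print(jax.grad(lambda x: x ** 2)(3.0))    # autodiff smoke test -> 6.0
```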
cnn.py
ADDED
@@ -0,0 +1,219 @@
import jax
from typing import Any, Callable, Sequence, Optional
from jax import lax, random, vmap, numpy as jnp
from jax.experimental.ode import odeint
import flax
from flax.training import train_state
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax import serialization
import optax
import tensorflow_datasets as tfds
import numpy as np


# Define model
class CNN(nn.Module):
    """A simple CNN model."""

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        x = nn.Conv(features=32, kernel_size=(3, 3))(x)
        x = nn.relu(x)
        x = nn.avg_pool(x, window_shape=(2, 2), strides=(2, 2))

        x = nn.Conv(features=64, kernel_size=(3, 3))(x)
        x = nn.relu(x)
        x = nn.avg_pool(x, window_shape=(2, 2), strides=(2, 2))
        x = x.reshape((x.shape[0], -1))  # flatten

        x = nn.Dense(features=256)(x)
        x = nn.relu(x)
        x = nn.Dense(features=10)(x)
        x = nn.log_softmax(x)
        return x


# Define Residual Block
class ResBlock(nn.Module):
    """Single ResBlock w/o downsample"""

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        f_x = nn.relu(nn.GroupNorm(64)(x))
        f_x = nn.Conv(features=64, kernel_size=(3, 3))(f_x)
        f_x = nn.relu(nn.GroupNorm(64)(f_x))
        f_x = nn.Conv(features=64, kernel_size=(3, 3))(f_x)
        x = f_x + x
        return x


class ResDownBlock(nn.Module):
    """Single ResBlock w/ downsample"""

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        f_x = nn.relu(nn.GroupNorm(64)(x))
        x = nn.Conv(features=64, kernel_size=(1, 1), strides=(2, 2))(x)
        f_x = nn.Conv(features=64, kernel_size=(3, 3), strides=(2, 2))(f_x)
        f_x = nn.relu(nn.GroupNorm(64)(f_x))
        f_x = nn.Conv(features=64, kernel_size=(3, 3))(f_x)
        x = f_x + x
        return x


# Define Model for MNIST example in Neural ODE
class SmallResNet(nn.Module):
    res_down1: Callable = ResDownBlock()
    res_down2: Callable = ResDownBlock()
    resblock1: Callable = ResBlock()
    resblock2: Callable = ResBlock()
    resblock3: Callable = ResBlock()
    resblock4: Callable = ResBlock()
    resblock5: Callable = ResBlock()
    resblock6: Callable = ResBlock()

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        x = nn.Conv(features=64, kernel_size=(3, 3))(x)
        x = self.res_down1(x)
        x = self.res_down2(x)

        x = self.resblock1(x)
        x = self.resblock2(x)
        x = self.resblock3(x)
        x = self.resblock4(x)
        x = self.resblock5(x)
        x = self.resblock6(x)

        x = nn.GroupNorm(64)(x)
        x = nn.relu(x)
        x = nn.avg_pool(x, (1, 1))

        x = x.reshape((x.shape[0], -1))  # flatten

        x = nn.Dense(features=10)(x)
        x = nn.log_softmax(x)

        return x


# Define loss
def cross_entropy_loss(*, logits, labels):
    one_hot_labels = jax.nn.one_hot(labels, num_classes=10)
    return -jnp.mean(jnp.sum(one_hot_labels * logits, axis=-1))


# Metric computation
def compute_metrics(*, logits, labels):
    loss = cross_entropy_loss(logits=logits, labels=labels)
    accuracy = jnp.mean(jnp.argmax(logits, -1) == labels)
    metrics = {
        'loss': loss,
        'accuracy': accuracy,
    }
    return metrics


def get_datasets():
    """Load MNIST train and test datasets into memory."""
    ds_builder = tfds.builder('mnist')
    ds_builder.download_and_prepare()
    train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1))
    test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1))
    train_ds['image'] = jnp.float32(train_ds['image']) / 255.
    test_ds['image'] = jnp.float32(test_ds['image']) / 255.
    return train_ds, test_ds


def create_train_state(rng, learning_rate):
    """Creates initial 'TrainState'."""
    cnn = SmallResNet()
    params = cnn.init(rng, jnp.ones([1, 28, 28, 1]))['params']
    tx = optax.adam(learning_rate)
    return train_state.TrainState.create(
        apply_fn=cnn.apply, params=params, tx=tx
    )


# Training step
@jax.jit
def train_step(state, batch):
    """Train for a single step."""
    def loss_fn(params):
        logits = SmallResNet().apply({'params': params}, batch['image'])
        loss = cross_entropy_loss(logits=logits, labels=batch['label'])
        return loss, logits
    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (_, logits), grads = grad_fn(state.params)
    state = state.apply_gradients(grads=grads)
    metrics = compute_metrics(logits=logits, labels=batch['label'])
    return state, metrics


# Evaluation step
@jax.jit
def eval_step(params, batch):
    logits = SmallResNet().apply({'params': params}, batch['image'])
    return compute_metrics(logits=logits, labels=batch['label'])


# Train function
def train_epoch(state, train_ds, batch_size, epoch, rng):
    """Train for a single epoch"""
    train_ds_size = len(train_ds['image'])
    steps_per_epoch = train_ds_size // batch_size

    perms = jax.random.permutation(rng, len(train_ds['image']))
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    batch_metrics = []
    for perm in perms:
        batch = {k: v[perm, ...] for k, v in train_ds.items()}
        state, metrics = train_step(state, batch)
        batch_metrics.append(metrics)

    # compute mean of metrics across each batch in epoch.
    batch_metrics_np = jax.device_get(batch_metrics)
    epoch_metrics_np = {
        k: np.mean([metrics[k] for metrics in batch_metrics_np])
        for k in batch_metrics_np[0]
    }
    print('train epoch: %d, loss: %.4f, accuracy: %.2f' % (
        epoch, epoch_metrics_np['loss'], epoch_metrics_np['accuracy'] * 100
    ))

    return state


# Eval function
def eval_model(params, test_ds):
    metrics = eval_step(params, test_ds)
    metrics = jax.device_get(metrics)
    summary = jax.tree_map(lambda x: x.item(), metrics)
    return summary['loss'], summary['accuracy']


if __name__ == '__main__':
    train_ds, test_ds = get_datasets()
    rng = jax.random.PRNGKey(0)
    rng, init_rng = jax.random.split(rng)

    learning_rate = 0.0001

    state = create_train_state(init_rng, learning_rate)
    del init_rng  # Must not be used anymore.

    num_epochs = 40
    batch_size = 128

    for epoch in range(1, num_epochs + 1):
        rng, input_rng = jax.random.split(rng)
        state = train_epoch(state, train_ds, batch_size, epoch, input_rng)
        test_loss, test_accuracy = eval_model(state.params, test_ds)
        print(' test epoch: %d, loss: %.2f, accuracy: %.2f' % (
            epoch, test_loss, test_accuracy * 100
        ))
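
A minimal sketch (not part of this commit; assumes cnn.py is importable from the repo root) of how SmallResNet is initialized and applied, mirroring create_train_state and eval_step above:

```python
# Sketch: initialize SmallResNet and run one dummy MNIST-shaped batch.
import jax
from jax import numpy as jnp
from cnn import SmallResNet

model = SmallResNet()
rng = jax.random.PRNGKey(0)
x = jnp.ones([1, 28, 28, 1])                  # NHWC, as MNIST is loaded by get_datasets
params = model.init(rng, x)['params']         # same init call as create_train_state
logits = model.apply({'params': params}, x)   # log-probabilities, shape (1, 10)
print(logits.shape)
```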
cnn_ode.py
ADDED
@@ -0,0 +1,256 @@
from functools import partial
import jax
from typing import Any, Callable, Sequence, Optional, NewType
from jax import lax, random, vmap, numpy as jnp
from jax.experimental.ode import odeint
import flax
from flax.training import train_state
from flax import traverse_util
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax import serialization
import optax
import tensorflow_datasets as tfds
import numpy as np
from tqdm import tqdm
import os


# TODO Add system argument for dim_out, ksize, tol, learning_rate, num_epoch and batch_size

# Define Residual Block
class ResDownBlock(nn.Module):
    """Single ResBlock w/ downsample"""
    dim_out: Any = 64

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(x))
        x = nn.Conv(features=self.dim_out, kernel_size=(1, 1), strides=(2, 2))(x)
        f_x = nn.Conv(features=self.dim_out, kernel_size=(3, 3), strides=(2, 2))(f_x)
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(f_x))
        f_x = nn.Conv(features=self.dim_out, kernel_size=(3, 3))(f_x)
        x = f_x + x
        return x


class ConcatConv2D(nn.Module):
    """Concat dynamics to hidden layer"""
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, x, t):
        tt = jnp.ones_like(x[..., :1]) * t
        ttx = jnp.concatenate([tt, x], -1)
        return nn.Conv(features=self.dim_out, kernel_size=self.ksize)(ttx)


# Define Model for MNIST example in Neural ODE
class ODEfunc(nn.Module):
    """ODE function which replaces the ResNet blocks."""
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, inputs, t):
        # TODO Count number of function evaluations
        # nfe_counter = NFEcounter()
        # nfe_counter()

        x = inputs
        out = nn.GroupNorm(self.dim_out)(x)
        out = nn.relu(out)
        out = ConcatConv2D(self.dim_out, self.ksize)(out, t)
        out = nn.GroupNorm(self.dim_out)(out)
        out = nn.relu(out)
        out = ConcatConv2D(self.dim_out, self.ksize)(out, t)
        out = nn.GroupNorm(self.dim_out)(out)

        return out


class NFEcounter(nn.Module):

    @nn.compact
    def __call__(self):
        is_initialized = self.has_variable('nfe', 'nfe')
        nfe = self.variable('nfe', 'nfe', jnp.array, [0])
        if is_initialized:
            nfe.value += 1


class ODEBlock(nn.Module):
    """ODE block which contains odeint"""
    tol = 1.

    @nn.compact
    def __call__(self, x, params):
        ode_func = ODEfunc()
        init_state, final_state = odeint(partial(ode_func.apply, {'params': params}),
                                         x, jnp.array([0., 1.]),
                                         rtol=self.tol, atol=self.tol)
        return final_state


class ODEBlockVmap(nn.Module):
    """Apply vmap to ODEBlock"""

    @nn.compact
    def __call__(self, x, params):
        vmap_odeblock = nn.vmap(ODEBlock,
                                variable_axes={'params': 0, 'nfe': None},
                                split_rngs={'params': True, 'nfe': False},
                                in_axes=(0, None))
        return vmap_odeblock(name='odeblock')(x, params)


class FullODENet(nn.Module):
    """Full ODE net which contains two downsampling layers, ODE block and linear classifier."""
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        x = nn.Conv(features=self.dim_out, kernel_size=(self.ksize, self.ksize))(x)
        x = ResDownBlock()(x)
        x = ResDownBlock()(x)

        ode_func = ODEfunc()
        init_fn = lambda rng, x: ode_func.init(random.split(rng)[-1], x, 0.)['params']
        ode_func_params = self.param('ode_func', init_fn, jnp.ones_like(x[0]))
        x = ODEBlockVmap()(x, ode_func_params)

        x = nn.GroupNorm(self.dim_out)(x)
        x = nn.relu(x)
        x = nn.avg_pool(x, (1, 1))

        x = x.reshape((x.shape[0], -1))  # flatten

        x = nn.Dense(features=10)(x)
        x = nn.log_softmax(x)

        return x


# Define loss
@jax.jit
def cross_entropy_loss(logits, labels):
    one_hot_labels = jax.nn.one_hot(labels, num_classes=10)
    return -jnp.mean(jnp.sum(one_hot_labels * logits, axis=-1))


# Metric computation
@jax.jit
def compute_metrics(logits, labels):
    loss = cross_entropy_loss(logits=logits, labels=labels)
    accuracy = jnp.mean(jnp.argmax(logits, -1) == labels)
    metrics = {
        'loss': loss,
        'accuracy': accuracy,
    }
    return metrics


def get_datasets():
    """Load MNIST train and test datasets into memory."""
    ds_builder = tfds.builder('mnist')
    ds_builder.download_and_prepare()
    train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1))
    test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1))
    train_ds['image'] = jnp.float32(train_ds['image']) / 255.
    test_ds['image'] = jnp.float32(test_ds['image']) / 255.
    return train_ds, test_ds


def create_train_state(rng, learning_rate):
    """Creates initial 'TrainState'."""
    cnn = FullODENet()
    params = cnn.init(rng, jnp.ones([1, 28, 28, 1]))['params']
    tx = optax.adam(learning_rate)
    return train_state.TrainState.create(
        apply_fn=cnn.apply, params=params, tx=tx
    )


# Training step
@jax.jit
def train_step(state, batch):
    """Train for a single step."""
    def loss_fn(params):
        logits = FullODENet().apply({'params': params}, batch['image'])
        loss = cross_entropy_loss(logits=logits, labels=batch['label'])
        return loss, logits
    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (_, logits), grads = grad_fn(state.params)
    state = state.apply_gradients(grads=grads)
    metrics = compute_metrics(logits=logits, labels=batch['label'])
    return state, metrics


# Evaluation step
@jax.jit
def eval_step(params, batch):
    logits = FullODENet().apply({'params': params}, batch['image'])
    return compute_metrics(logits=logits, labels=batch['label'])


# Train function
def train_epoch(state, train_ds, batch_size, epoch, rng):
    """Train for a single epoch"""
    train_ds_size = len(train_ds['image'])
    steps_per_epoch = train_ds_size // batch_size

    perms = jax.random.permutation(rng, len(train_ds['image']))
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    batch_metrics = []
    for perm in tqdm(perms):
        batch = {k: v[perm, ...] for k, v in train_ds.items()}
        state, metrics = train_step(state, batch)
        batch_metrics.append(metrics)

    # compute mean of metrics across each batch in epoch.
    batch_metrics_np = jax.device_get(batch_metrics)
    epoch_metrics_np = {
        k: np.mean([metrics[k] for metrics in batch_metrics_np])
        for k in batch_metrics_np[0]
    }
    print('train epoch: %d, loss: %.4f, accuracy: %.2f' % (
        epoch, epoch_metrics_np['loss'], epoch_metrics_np['accuracy'] * 100
    ))

    return state


# Eval function
def eval_model(params, test_ds):
    metrics = eval_step(params, test_ds)
    metrics = jax.device_get(metrics)
    summary = jax.tree_map(lambda x: x.item(), metrics)
    return summary['loss'], summary['accuracy']


if __name__ == '__main__':
    train_ds, test_ds = get_datasets()
    rng = jax.random.PRNGKey(0)
    rng, init_rng = jax.random.split(rng)

    # Build learning rate decay as in the Neural ODE paper
    learning_rate = 0.0001

    state = create_train_state(init_rng, learning_rate)
    del init_rng  # Must not be used anymore.

    num_epochs = 20
    batch_size = 128

    for epoch in tqdm(range(1, num_epochs + 1)):
        rng, input_rng = jax.random.split(rng)
        state = train_epoch(state, train_ds, batch_size, epoch, input_rng)
        test_loss, test_accuracy = eval_model(state.params, test_ds)
        print(' test epoch: %d, loss: %.2f, accuracy: %.2f' % (
            epoch, test_loss, test_accuracy * 100
        ))
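
ODEBlock above delegates to jax.experimental.ode.odeint and keeps only the state at t=1. A tiny standalone sketch of the same call pattern on an illustrative scalar ODE (exponential decay, not the repo's dynamics):

```python
# Standalone sketch of the odeint pattern ODEBlock uses: integrate a
# dynamics function over t in [0, 1] and keep only the final state.
from jax import numpy as jnp
from jax.experimental.ode import odeint

def dynamics(y, t):
    return -y                      # dy/dt = -y, so y(t) = y0 * exp(-t)

y0 = jnp.array([1.0])
# odeint returns one state per requested time; unpack as in ODEBlock.
init_state, final_state = odeint(dynamics, y0, jnp.array([0., 1.]))
print(final_state)                 # ~exp(-1) = 0.3679
```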
jax_cnn_ode.py
ADDED
File without changes
main.py
ADDED
@@ -0,0 +1,23 @@
import argparse
import train_ode
import train_resnet


def main(args):
    if args.model == 'odenet':
        train_ode.train_and_evaluate(args.lr, args.n_epoch, args.batch_size, args.tol)
    else:
        train_resnet.train_and_evaluate(args.lr, args.n_epoch, args.batch_size)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='main.py')
    parser.add_argument("--model", type=str, choices=['odenet', 'resnet'], default="odenet", help="Type of model")
    parser.add_argument("--tol", type=float, default=1e-1,
                        help="Error tolerance for the ODE solver. This only works with odenet")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--n_epoch", type=int, default=10, help="Total number of epochs")
    parser.add_argument("--batch_size", type=int, default=32, help="Number of images in a batch")

    args = parser.parse_args()
    main(args)
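
For reference, the programmatic equivalent of the odenet CLI path (a sketch; assumes the repo modules are on the Python path):

```python
# Sketch: the same call main.py makes for --model=odenet, without argparse.
import train_ode

# Positional args match the dispatch above: lr, n_epoch, batch_size, tol.
train_ode.train_and_evaluate(1e-4, 20, 64, 1e-1)
```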
mlp.py
ADDED
@@ -0,0 +1,103 @@
import jax
from typing import Any, Callable, Sequence, Optional
from jax import lax, random, numpy as jnp
import flax
from flax.training import train_state
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax import serialization
import optax


class ExplicitMLP(nn.Module):
    features: Sequence[int]

    def setup(self):
        self.layers = [nn.Dense(feat) for feat in self.features]

    def __call__(self, inputs):
        x = inputs
        for i, lyr in enumerate(self.layers):
            x = lyr(x)
            if i != len(self.layers) - 1:
                x = nn.relu(x)
        return x


class SimpleMLP(nn.Module):
    features: Sequence[int]

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        for i, feat in enumerate(self.features):
            x = nn.Dense(feat)(x)
            if i != len(self.features) - 1:  # no activation after the last layer
                x = nn.relu(x)
        return x


if __name__ == '__main__':
    key1, key2 = random.split(random.PRNGKey(0), 2)

    # Set problem dimensions
    nsamples = 20
    xdim = 10
    ydim = 5

    # Generate true W and b
    W = random.normal(key1, (xdim, ydim))
    b = random.normal(key2, (ydim,))
    true_params = freeze({'params': {'bias': b, 'kernel': W}})

    # Generate samples with additional noise
    ksample, knoise = random.split(key1)
    x_samples = random.normal(ksample, (nsamples, xdim))
    y_samples = jnp.dot(x_samples, W) + b
    y_samples += 0.1 * random.normal(knoise, (nsamples, ydim))  # Adding noise
    print('x shape:', x_samples.shape, '; y shape:', y_samples.shape)

    key_init, subkey = random.split(ksample, 2)
    model = ExplicitMLP(features=[5])
    params = model.init(subkey, x_samples)

    def make_mse_func(x_batched, y_batched):
        def mse(params):
            # Define the squared loss for a single pair (x, y)
            def squared_error(x, y):
                pred = model.apply(params, x)
                return jnp.inner(y - pred, y - pred) / 2.0

            # We vectorize the previous to compute the average of the loss on all samples.
            return jnp.mean(jax.vmap(squared_error)(x_batched, y_batched), axis=0)

        return jax.jit(mse)  # And finally we jit the result.

    # Get the sampled loss
    loss = make_mse_func(x_samples, y_samples)

    lr = 0.3
    tx = optax.sgd(learning_rate=lr)
    opt_state = tx.init(params)
    loss_grad_fn = jax.value_and_grad(loss)

    for i in range(101):
        loss_val, grads = loss_grad_fn(params)
        updates, opt_state = tx.update(grads, opt_state)
        params = optax.apply_updates(params, updates)

        if i % 10 == 0:
            print('Loss step {}: '.format(i), loss_val)

    # Serializing the result
    bytes_output = serialization.to_bytes(params)
    dict_output = serialization.to_state_dict(params)
    print('Dict output')
    print(dict_output)
    print('Bytes output')
    print(bytes_output)

    # Restore the parameters from the saved bytes
    saved_params = serialization.from_bytes(params, bytes_output)
    print(loss(saved_params))
    print(loss(params))
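
The demo above only exercises ExplicitMLP; SimpleMLP builds the same stack inline with @nn.compact. A minimal sketch of using it (assumes mlp.py is importable):

```python
# Sketch: SimpleMLP from mlp.py, the @nn.compact twin of ExplicitMLP.
from jax import random, numpy as jnp
from mlp import SimpleMLP

model = SimpleMLP(features=[16, 5])
variables = model.init(random.PRNGKey(0), jnp.ones((4, 10)))
out = model.apply(variables, jnp.ones((4, 10)))
print(out.shape)  # (4, 5)
```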
ode.py
ADDED
@@ -0,0 +1,256 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""JAX-based Dormand-Prince ODE integration with adaptive stepsize.

Integrate systems of ordinary differential equations (ODEs) using the JAX
autograd/diff library and the Dormand-Prince method for adaptive integration
stepsize calculation. Provides improved integration accuracy over fixed
stepsize integration methods.

For details of the mixed 4th/5th order Runge-Kutta integration method, see
https://doi.org/10.1090/S0025-5718-1986-0815836-3

Adjoint algorithm based on Appendix C of https://arxiv.org/pdf/1806.07366.pdf
"""


from functools import partial
import operator as op

import jax
import jax.numpy as jnp
from jax import core
from jax import custom_derivatives
from jax import lax
from jax._src.util import safe_map, safe_zip
from jax.flatten_util import ravel_pytree
from jax.tree_util import tree_map
from jax import linear_util as lu

map = safe_map
zip = safe_zip


def ravel_first_arg(f, unravel):
    return ravel_first_arg_(lu.wrap_init(f), unravel).call_wrapped

@lu.transformation
def ravel_first_arg_(unravel, y_flat, *args):
    y = unravel(y_flat)
    ans = yield (y,) + args, {}
    ans_flat, _ = ravel_pytree(ans)
    yield ans_flat

def interp_fit_dopri(y0, y1, k, dt):
    # Fit a polynomial to the results of a Runge-Kutta step.
    dps_c_mid = jnp.array([
        6025192743 / 30085553152 / 2, 0, 51252292925 / 65400821598 / 2,
        -2691868925 / 45128329728 / 2, 187940372067 / 1594534317056 / 2,
        -1776094331 / 19743644256 / 2, 11237099 / 235043384 / 2])
    y_mid = y0 + dt * jnp.dot(dps_c_mid, k)
    return jnp.asarray(fit_4th_order_polynomial(y0, y1, y_mid, k[0], k[-1], dt))

def fit_4th_order_polynomial(y0, y1, y_mid, dy0, dy1, dt):
    a = -2.*dt*dy0 + 2.*dt*dy1 - 8.*y0 - 8.*y1 + 16.*y_mid
    b = 5.*dt*dy0 - 3.*dt*dy1 + 18.*y0 + 14.*y1 - 32.*y_mid
    c = -4.*dt*dy0 + dt*dy1 - 11.*y0 - 5.*y1 + 16.*y_mid
    d = dt * dy0
    e = y0
    return a, b, c, d, e

def initial_step_size(fun, t0, y0, order, rtol, atol, f0):
    # Algorithm from:
    # E. Hairer, S. P. Norsett, G. Wanner,
    # Solving Ordinary Differential Equations I: Nonstiff Problems, Sec. II.4.
    scale = atol + jnp.abs(y0) * rtol
    d0 = jnp.linalg.norm(y0 / scale)
    d1 = jnp.linalg.norm(f0 / scale)

    h0 = jnp.where((d0 < 1e-5) | (d1 < 1e-5), 1e-6, 0.01 * d0 / d1)

    y1 = y0 + h0 * f0

    f1 = fun(y1, t0 + h0)
    d2 = jnp.linalg.norm((f1 - f0) / scale) / h0

    h1 = jnp.where((d1 <= 1e-15) & (d2 <= 1e-15),
                   jnp.maximum(1e-6, h0 * 1e-3),
                   (0.01 / jnp.max(d1 + d2)) ** (1. / (order + 1.)))

    return jnp.minimum(100. * h0, h1)

def runge_kutta_step(func, y0, f0, t0, dt):
    # Dopri5 Butcher tableaux
    alpha = jnp.array([1 / 5, 3 / 10, 4 / 5, 8 / 9, 1., 1., 0])
    beta = jnp.array([
        [1 / 5, 0, 0, 0, 0, 0, 0],
        [3 / 40, 9 / 40, 0, 0, 0, 0, 0],
        [44 / 45, -56 / 15, 32 / 9, 0, 0, 0, 0],
        [19372 / 6561, -25360 / 2187, 64448 / 6561, -212 / 729, 0, 0, 0],
        [9017 / 3168, -355 / 33, 46732 / 5247, 49 / 176, -5103 / 18656, 0, 0],
        [35 / 384, 0, 500 / 1113, 125 / 192, -2187 / 6784, 11 / 84, 0]
    ])
    c_sol = jnp.array([35 / 384, 0, 500 / 1113, 125 / 192, -2187 / 6784, 11 / 84, 0])
    c_error = jnp.array([35 / 384 - 1951 / 21600, 0, 500 / 1113 - 22642 / 50085,
                         125 / 192 - 451 / 720, -2187 / 6784 - -12231 / 42400,
                         11 / 84 - 649 / 6300, -1. / 60.])

    def body_fun(i, k):
        ti = t0 + dt * alpha[i-1]
        yi = y0 + dt * jnp.dot(beta[i-1, :], k)
        ft = func(yi, ti)
        return k.at[i, :].set(ft)

    k = jnp.zeros((7, f0.shape[0]), f0.dtype).at[0, :].set(f0)
    k = lax.fori_loop(1, 7, body_fun, k)

    y1 = dt * jnp.dot(c_sol, k) + y0
    y1_error = dt * jnp.dot(c_error, k)
    f1 = k[-1]
    return y1, f1, y1_error, k

def abs2(x):
    if jnp.iscomplexobj(x):
        return x.real ** 2 + x.imag ** 2
    else:
        return x ** 2

def error_ratio(error_estimate, rtol, atol, y0, y1):
    err_tol = atol + rtol * jnp.maximum(jnp.abs(y0), jnp.abs(y1))
    err_ratio = error_estimate / err_tol
    return jnp.mean(abs2(err_ratio))

def optimal_step_size(last_step, mean_error_ratio, safety=0.9, ifactor=10.0,
                      dfactor=0.2, order=5.0):
    """Compute optimal Runge-Kutta stepsize."""
    mean_error_ratio = jnp.max(mean_error_ratio)
    dfactor = jnp.where(mean_error_ratio < 1, 1.0, dfactor)

    err_ratio = jnp.sqrt(mean_error_ratio)
    factor = jnp.maximum(1.0 / ifactor,
                         jnp.minimum(err_ratio**(1.0 / order) / safety, 1.0 / dfactor))
    return jnp.where(mean_error_ratio == 0, last_step * ifactor, last_step / factor)

def odeint(func, y0, t, *args, rtol=1.4e-8, atol=1.4e-8, mxstep=jnp.inf):
    """Adaptive stepsize (Dormand-Prince) Runge-Kutta odeint implementation.

    Args:
      func: function to evaluate the time derivative of the solution `y` at time
        `t` as `func(y, t, *args)`, producing the same shape/structure as `y0`.
      y0: array or pytree of arrays representing the initial value for the state.
      t: array of float times for evaluation, like `jnp.linspace(0., 10., 101)`,
        in which the values must be strictly increasing.
      *args: tuple of additional arguments for `func`, which must be arrays,
        scalars, or (nested) standard Python containers (tuples, lists, dicts,
        namedtuples, i.e. pytrees) of those types.
      rtol: float, relative local error tolerance for solver (optional).
      atol: float, absolute local error tolerance for solver (optional).
      mxstep: int, maximum number of steps to take for each timepoint (optional).

    Returns:
      Values of the solution `y` (i.e. integrated system values) at each time
      point in `t`, represented as an array (or pytree of arrays) with the same
      shape/structure as `y0` except with a new leading axis of length `len(t)`.
    """
    def _check_arg(arg):
        if not isinstance(arg, core.Tracer) and not core.valid_jaxtype(arg):
            msg = ("The contents of odeint *args must be arrays or scalars, but got "
                   "\n{}.")
            raise TypeError(msg.format(arg))

    converted, consts = custom_derivatives.closure_convert(func, y0, t[0], *args)
    return _odeint_wrapper(converted, rtol, atol, mxstep, y0, t, *args, *consts)

@partial(jax.jit, static_argnums=(0, 1, 2, 3))
def _odeint_wrapper(func, rtol, atol, mxstep, y0, ts, *args):
    y0, unravel = ravel_pytree(y0)
    func = ravel_first_arg(func, unravel)
    out = _odeint(func, rtol, atol, mxstep, y0, ts, *args)
    return jax.vmap(unravel)(out)

@partial(jax.custom_vjp, nondiff_argnums=(0, 1, 2, 3))
def _odeint(func, rtol, atol, mxstep, y0, ts, *args):
    func_ = lambda y, t: func(y, t, *args)

    def scan_fun(carry, target_t):

        def cond_fun(state):
            i, _, _, t, dt, _, _ = state
            return (t < target_t) & (i < mxstep) & (dt > 0)

        def body_fun(state):
            i, y, f, t, dt, last_t, interp_coeff = state
            next_y, next_f, next_y_error, k = runge_kutta_step(func_, y, f, t, dt)
            next_t = t + dt
            error_ratios = error_ratio(next_y_error, rtol, atol, y, next_y)
            new_interp_coeff = interp_fit_dopri(y, next_y, k, dt)
            dt = optimal_step_size(dt, error_ratios)

            new = [i + 1, next_y, next_f, next_t, dt, t, new_interp_coeff]
            old = [i + 1, y, f, t, dt, last_t, interp_coeff]
            return map(partial(jnp.where, jnp.all(error_ratios <= 1.)), new, old)

        _, *carry = lax.while_loop(cond_fun, body_fun, [0] + carry)
        _, _, t, _, last_t, interp_coeff = carry
        relative_output_time = (target_t - last_t) / (t - last_t)
        y_target = jnp.polyval(interp_coeff, relative_output_time)
        return carry, y_target

    # ODEfunc with NFE counter will give an auxiliary output for nfe.
    # The code below is modified to skip that output.
    f0 = func_(y0, ts[0])
    dt = initial_step_size(func_, ts[0], y0, 4, rtol, atol, f0)
    interp_coeff = jnp.array([y0] * 5)
    init_carry = [y0, f0, ts[0], dt, ts[0], interp_coeff]
    _, ys = lax.scan(scan_fun, init_carry, ts[1:])
    return jnp.concatenate((y0[None], ys))

def _odeint_fwd(func, rtol, atol, mxstep, y0, ts, *args):
    ys = _odeint(func, rtol, atol, mxstep, y0, ts, *args)
    return ys, (ys, ts, args)

def _odeint_rev(func, rtol, atol, mxstep, res, g):
    ys, ts, args = res

    def aug_dynamics(augmented_state, t, *args):
        """Original system augmented with vjp_y, vjp_t and vjp_args."""
        y, y_bar, *_ = augmented_state
        # `t` here is negative time, so we need to negate again to get back to
        # normal time. See the `odeint` invocation in `scan_fun` below.
        y_dot, vjpfun = jax.vjp(func, y, -t, *args)
        return (-y_dot, *vjpfun(y_bar))

    y_bar = g[-1]
    ts_bar = []
    t0_bar = 0.

    def scan_fun(carry, i):
        y_bar, t0_bar, args_bar = carry
        # Compute effect of moving measurement time
        # `t_bar` should not be complex as it represents time
        t_bar = jnp.dot(func(ys[i], ts[i], *args), g[i]).real
        t0_bar = t0_bar - t_bar
        # Run augmented system backwards to previous observation
        _, y_bar, t0_bar, args_bar = odeint(
            aug_dynamics, (ys[i], y_bar, t0_bar, args_bar),
            jnp.array([-ts[i], -ts[i - 1]]),
            *args, rtol=rtol, atol=atol, mxstep=mxstep)
        y_bar, t0_bar, args_bar = tree_map(op.itemgetter(1), (y_bar, t0_bar, args_bar))
        # Add gradient from current output
        y_bar = y_bar + g[i - 1]
        return (y_bar, t0_bar, args_bar), t_bar

    init_carry = (g[-1], 0., tree_map(jnp.zeros_like, args))
    (y_bar, t0_bar, args_bar), rev_ts_bar = lax.scan(
        scan_fun, init_carry, jnp.arange(len(ts) - 1, 0, -1))
    ts_bar = jnp.concatenate([jnp.array([t0_bar]), rev_ts_bar[::-1]])
    return (y_bar, ts_bar, *args_bar)

_odeint.defvjp(_odeint_fwd, _odeint_rev)
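
A short sketch of calling this odeint as its docstring describes, on an illustrative harmonic oscillator, and differentiating through it via the adjoint VJP defined above (assumes ode.py is importable; the dynamics and tolerances here are illustrative choices, not the repo's):

```python
# Sketch: integrate a 2D harmonic oscillator with the adaptive
# Dormand-Prince odeint above, then differentiate through the solve.
import jax
import jax.numpy as jnp
from ode import odeint

def oscillator(y, t, omega):
    # y = (position, velocity); d(pos)/dt = vel, d(vel)/dt = -omega^2 * pos.
    return jnp.array([y[1], -omega ** 2 * y[0]])

ts = jnp.linspace(0., 10., 101)
y0 = jnp.array([1., 0.])
ys = odeint(oscillator, y0, ts, 2.0, rtol=1e-6, atol=1e-6)  # shape (101, 2)

# Gradient of the final position w.r.t. the initial state, computed with
# the adjoint method implemented in _odeint_rev.
grad_fn = jax.grad(lambda y0: odeint(oscillator, y0, ts, 2.0)[-1, 0])
print(ys.shape, grad_fn(y0))
```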
opts.py
ADDED
File without changes
train.py
ADDED
@@ -0,0 +1,130 @@
from functools import partial
import jax
from typing import Any, Callable, Sequence, Optional, NewType
from jax import lax, random, vmap, numpy as jnp
from jax.experimental.ode import odeint
import flax
from flax.training import train_state
from flax import traverse_util
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax import serialization
import optax
import tensorflow_datasets as tfds
import numpy as np
from tqdm import tqdm
import os


# Define loss
@jax.jit
def cross_entropy_loss(logits, labels):
    one_hot_labels = jax.nn.one_hot(labels, num_classes=10)
    return -jnp.mean(jnp.sum(one_hot_labels * logits, axis=-1))


# Metric computation
@jax.jit
def compute_metrics(logits, labels):
    loss = cross_entropy_loss(logits=logits, labels=labels)
    accuracy = jnp.mean(jnp.argmax(logits, -1) == labels)
    metrics = {
        'loss': loss,
        'accuracy': accuracy,
    }
    return metrics


def get_datasets():
    """Load MNIST train and test datasets into memory."""
    ds_builder = tfds.builder('mnist')
    ds_builder.download_and_prepare()
    train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1))
    test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1))
    train_ds['image'] = jnp.float32(train_ds['image']) / 255.
    test_ds['image'] = jnp.float32(test_ds['image']) / 255.
    return train_ds, test_ds


def create_train_state(model, rng, learning_rate):
    """Creates initial 'TrainState'."""
    params = model.init(rng, jnp.ones([1, 28, 28, 1]))['params']
    tx = optax.adam(learning_rate)
    return train_state.TrainState.create(
        apply_fn=model.apply, params=params, tx=tx
    )


# Training step
@jax.jit
def train_step(state, batch):
    """Train for a single step."""
    def loss_fn(params):
        # state.apply_fn is the model.apply bound in create_train_state;
        # using it keeps this step model-agnostic.
        logits = state.apply_fn({'params': params}, batch['image'])
        loss = cross_entropy_loss(logits=logits, labels=batch['label'])
        return loss, logits
    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (_, logits), grads = grad_fn(state.params)
    state = state.apply_gradients(grads=grads)
    metrics = compute_metrics(logits=logits, labels=batch['label'])
    return state, metrics


# Evaluation step
@jax.jit
def eval_step(state, batch):
    logits = state.apply_fn({'params': state.params}, batch['image'])
    return compute_metrics(logits=logits, labels=batch['label'])


# Train function
def train_epoch(state, train_ds, batch_size, epoch, rng):
    """Train for a single epoch"""
    train_ds_size = len(train_ds['image'])
    steps_per_epoch = train_ds_size // batch_size

    perms = jax.random.permutation(rng, len(train_ds['image']))
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    batch_metrics = []
    for perm in tqdm(perms):
        batch = {k: v[perm, ...] for k, v in train_ds.items()}
        state, metrics = train_step(state, batch)
        batch_metrics.append(metrics)

    # compute mean of metrics across each batch in epoch.
    batch_metrics_np = jax.device_get(batch_metrics)
    epoch_metrics_np = {
        k: np.mean([metrics[k] for metrics in batch_metrics_np])
        for k in batch_metrics_np[0]
    }
    print('train epoch: %d, loss: %.4f, accuracy: %.2f' % (
        epoch, epoch_metrics_np['loss'], epoch_metrics_np['accuracy'] * 100
    ))

    return state


# Eval function
def eval_model(state, test_ds):
    metrics = eval_step(state, test_ds)
    metrics = jax.device_get(metrics)
    summary = jax.tree_map(lambda x: x.item(), metrics)
    return summary['loss'], summary['accuracy']


def train_and_evaluate(model, learning_rate, n_epoch, batch_size):
    """Generic train/eval loop for a given Flax module."""
    train_ds, test_ds = get_datasets()
    rng = jax.random.PRNGKey(0)
    rng, init_rng = jax.random.split(rng)

    state = create_train_state(model, init_rng, learning_rate)
    del init_rng  # Must not be used anymore.

    for epoch in tqdm(range(1, n_epoch + 1)):
        rng, input_rng = jax.random.split(rng)
        state = train_epoch(state, train_ds, batch_size, epoch, input_rng)
        test_loss, test_accuracy = eval_model(state, test_ds)
        print(' test epoch: %d, loss: %.2f, accuracy: %.2f' % (
            epoch, test_loss, test_accuracy * 100
        ))
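
How this generic loop is meant to compose with a model module, sketched with SmallResNet from cnn.py (hypothetical wiring; train_resnet.py and train_ode.py are the repo's actual drivers):

```python
# Sketch: drive the generic loop in train.py with a concrete model.
from cnn import SmallResNet
import train

train.train_and_evaluate(SmallResNet(), learning_rate=1e-4, n_epoch=2, batch_size=64)
```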
train_cnf.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import os
|
4 |
+
import glob
|
5 |
+
from PIL import Image
|
6 |
+
from functools import partial
|
7 |
+
import jax
|
8 |
+
from typing import Any, Callable, Sequence, Optional, NewType
|
9 |
+
from jax import lax, random, vmap, scipy, numpy as jnp
|
10 |
+
# from jax.experimental.ode import odeint
|
11 |
+
from models.ode import odeint
|
12 |
+
import flax
|
13 |
+
from flax.training import train_state
|
14 |
+
from flax import traverse_util
|
15 |
+
from flax.core import freeze, unfreeze
|
16 |
+
from flax import linen as nn
|
17 |
+
from flax import serialization
|
18 |
+
import optax
|
19 |
+
from sklearn.datasets import make_circles
|
20 |
+
from tqdm import tqdm
|
21 |
+
|
22 |
+
|
23 |
+
# os.environ['TF_FORCE_UNIFIED_MEMORY'] = '1'
|
24 |
+
# os.environ['XLA_PYTHON_CLIENT_ALLOCATOR'] = 'platform'
|
25 |
+
# os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
|
26 |
+
|
27 |
+
|
28 |
+
class HyperNetwork(nn.Module):
|
29 |
+
"""Hyper-network allowing f(z(t), t) to change with time.
|
30 |
+
|
31 |
+
Adapted from the Pytorch implementation at:
|
32 |
+
https://github.com/rtqichen/torchdiffeq/blob/master/examples/cnf.py
|
33 |
+
"""
|
34 |
+
in_out_dim: Any = 2
|
35 |
+
hidden_dim: Any = 32
|
36 |
+
width: Any = 64
|
37 |
+
|
38 |
+
@nn.compact
|
39 |
+
def __call__(self, t):
|
40 |
+
# predict params
|
41 |
+
blocksize = self.width * self.in_out_dim
|
42 |
+
params = lax.expand_dims(t, (0, 1))
|
43 |
+
params = nn.Dense(self.hidden_dim)(params)
|
44 |
+
params = nn.tanh(params)
|
45 |
+
params = nn.Dense(self.hidden_dim)(params)
|
46 |
+
params = nn.tanh(params)
|
47 |
+
params = nn.Dense(3 * blocksize + self.width)(params)
|
48 |
+
|
49 |
+
# restructure
|
50 |
+
params = lax.reshape(params, (3 * blocksize + self.width,))
|
51 |
+
W = lax.reshape(params[:blocksize], (self.width, self.in_out_dim, 1))
|
52 |
+
|
53 |
+
U = lax.reshape(params[blocksize:2 * blocksize], (self.width, 1, self.in_out_dim))
|
54 |
+
|
55 |
+
G = lax.reshape(params[2 * blocksize:3 * blocksize], (self.width, 1, self.in_out_dim))
|
56 |
+
U = U * nn.sigmoid(G)
|
57 |
+
|
58 |
+
B = lax.expand_dims(params[3 * blocksize:], (1, 2))
|
59 |
+
return W, B, U
|
60 |
+
|
61 |
+
|
62 |
+
class CNF(nn.Module):
|
63 |
+
"""Adapted from the Pytorch implementation at:
|
64 |
+
https://github.com/rtqichen/torchdiffeq/blob/master/examples/cnf.py
|
65 |
+
"""
|
66 |
+
in_out_dim: Any = 2
|
67 |
+
hidden_dim: Any = 32
|
68 |
+
width: Any = 64
|
69 |
+
|
70 |
+
@nn.compact
|
71 |
+
def __call__(self, t, states):
|
72 |
+
z, logp_z = states[..., :2], states[..., 2:]
|
73 |
+
W, B, U = HyperNetwork(self.in_out_dim, self.hidden_dim, self.width)(t)
|
74 |
+
|
75 |
+
# TODO Below should be converted using vmap
|
76 |
+
def dzdt(z):
|
77 |
+
Z = lax.expand_dims(z, (0,))
|
78 |
+
Z = jnp.repeat(Z, self.width, 0)
|
79 |
+
h = nn.tanh(jnp.matmul(Z, W) + B)
|
80 |
+
return jnp.matmul(h, U).mean(0)
|
81 |
+
|
82 |
+
dz_dt = dzdt(z)
|
83 |
+
sum_dzdt = lambda z: jnp.sum(dzdt(z), 1)
|
84 |
+
df_dz = jax.jacrev(sum_dzdt)(z)
|
85 |
+
dlogp_z_dt = -1.0 * jnp.trace(df_dz, 0, 1, 2)
|
86 |
+
|
87 |
+
return lax.concatenate((dz_dt, lax.expand_dims(dlogp_z_dt, (1,))), 1)
|
88 |
+
|
89 |
+
|
90 |
+
class Neg_CNF(nn.Module):
|
91 |
+
"""Negative CNF for jax's odeint."""
|
92 |
+
in_out_dim: Any = 2
|
93 |
+
hidden_dim: Any = 32
|
94 |
+
width: Any = 64
|
95 |
+
|
96 |
+
@nn.compact
|
97 |
+
def __call__(self, t, states):
|
98 |
+
outputs = CNF(self.in_out_dim, self.hidden_dim, self.width)(-1.0 * t, states)
|
99 |
+
|
100 |
+
return -1.0 * outputs
|
101 |
+
|
102 |
+
|
103 |
+
def get_batch(num_samples):
|
104 |
+
"""Adapted from the Pytorch implementation at:
|
105 |
+
https://github.com/rtqichen/torchdiffeq/blob/master/examples/cnf.py
|
106 |
+
"""
|
107 |
+
points, _ = make_circles(n_samples=num_samples, noise=0.06, factor=0.5)
|
108 |
+
x = jnp.array(points, dtype=jnp.float32)
|
109 |
+
logp_diff_t1 = jnp.zeros((num_samples, 1), dtype=jnp.float32)
|
110 |
+
|
111 |
+
return lax.concatenate((x, logp_diff_t1), 1)
|
112 |
+
|
113 |
+
|
114 |
+
def create_train_state(rng, learning_rate, in_out_dim, hidden_dim, width):
|
115 |
+
"""Creates initial 'TrainState'."""
|
116 |
+
inputs = get_batch(10)
|
117 |
+
neg_cnf = CNF(in_out_dim, hidden_dim, width)
|
118 |
+
params = neg_cnf.init(rng, jnp.array(10.), inputs)['params']
|
119 |
+
# set_params(params)
|
120 |
+
tx = optax.adam(learning_rate)
|
121 |
+
return train_state.TrainState.create(
|
122 |
+
apply_fn=neg_cnf.apply, params=params, tx=tx
|
123 |
+
)
|
124 |
+
|
125 |
+
|
126 |
+
def set_params(params):
|
127 |
+
# Convert all value of Params to certain constant
|
128 |
+
params = unfreeze(params)
|
129 |
+
# Get flattened-key: value list.
|
130 |
+
flat_params = {'/'.join(k): v for k, v in traverse_util.flatten_dict(params).items()}
|
131 |
+
unflat_params = traverse_util.unflatten_dict({tuple(k.split('/')): 0.2 * jnp.ones_like(v) for k, v in flat_params.items()})
|
132 |
+
new_params = freeze(unflat_params)
|
133 |
+
test_x = jnp.array([[0., 1.], [2., 3.]])
|
134 |
+
test_log_p = jnp.zeros(2, 1)
|
135 |
+
test_inputs = lax.concatenate((test_x, test_log_p), 1)
|
136 |
+
Neg_CNF().apply({'params': new_params}, jnp.array(0.), test_inputs)
|
137 |
+
|
138 |
+
|
139 |
+
# @partial(jax.jit, static_argnums=(2, 3, 4, 5, 6))
|
140 |
+
def train_step(state, batch, in_out_dim, hidden_dim, width, t0, t1):
|
141 |
+
p_z0 = lambda x: scipy.stats.multivariate_normal.logpdf(x,
|
142 |
+
mean=jnp.array([0., 0.]),
|
143 |
+
cov=jnp.array([[0.1, 0.], [0., 0.1]]))
|
144 |
+
def loss_fn(params):
|
145 |
+
func = lambda states, t: Neg_CNF(in_out_dim, hidden_dim, width).apply({'params': params}, t, states)
|
146 |
+
outputs = odeint(
|
147 |
+
func,
|
148 |
+
batch,
|
149 |
+
-1.0 * jnp.array([t1, t0]),
|
150 |
+
atol=1e-5,
|
151 |
+
rtol=1e-5
|
152 |
+
)
|
153 |
+
z_t, logp_diff_t = outputs[..., :2], outputs[..., 2:]
|
154 |
+
z_t0, logp_diff_t0 = z_t[-1], logp_diff_t[-1]
|
155 |
+
logp_x = p_z0(z_t0) - lax.squeeze(logp_diff_t0, dimensions=(1,))
|
156 |
+
loss = -logp_x.mean(0)
|
157 |
+
return loss
|
158 |
+
grad_fn = jax.value_and_grad(loss_fn)
|
159 |
+
loss, grads = grad_fn(state.params)
|
160 |
+
state = state.apply_gradients(grads=grads)
|
161 |
+
|
162 |
+
return state, loss
|
163 |
+
|
164 |
+
|
165 |
+
def train(learning_rate, n_iters, batch_size, in_out_dim, hidden_dim, width, t0, t1, visual):
    """Train the model."""
    rng = jax.random.PRNGKey(0)
    state = create_train_state(rng, learning_rate, in_out_dim, hidden_dim, width)

    for itr in range(1, n_iters + 1):
        batch = get_batch(batch_size)
        state, loss = train_step(state, batch, in_out_dim, hidden_dim, width, t0, t1)
        print("iter: %d, loss: %.2f" % (itr, loss))

    if visual:
        # Convert params of Neg_CNF to CNF: flatten the param dict, strip the
        # Neg_CNF prefix from every key, then unflatten back into a tree.
        neg_params = state.params
        neg_params = unfreeze(neg_params)
        neg_flat_params = {'/'.join(k): v for k, v in traverse_util.flatten_dict(neg_params).items()}
        pos_flat_params = {key[6:]: jnp.array(np.array(neg_flat_params[key])) for key in list(neg_flat_params.keys())}
        pos_unflat_params = traverse_util.unflatten_dict({tuple(k.split('/')): v for k, v in pos_flat_params.items()})
        pos_params = freeze(pos_unflat_params)
        output = viz(neg_params, pos_params, in_out_dim, hidden_dim, width, t0, t1)
        z_t_samples, z_t_density, logp_diff_t, viz_timesteps, target_sample, z_t1 = output
        create_plots(z_t_samples, z_t_density, logp_diff_t, t0, t1, viz_timesteps, target_sample, z_t1)


def solve_dynamics(dynamics_fn, initial_state, t):
    def f(initial_state, t):
        return odeint(dynamics_fn, initial_state, t, atol=1e-5, rtol=1e-5)
    return f(initial_state, t)


def viz(neg_params, pos_params, in_out_dim, hidden_dim, width, t0, t1):
    """Visualize the learned flow. Adapted from the PyTorch implementation."""
    viz_samples = 5000
    viz_timesteps = 2
    target_sample, _ = get_batch(viz_samples)

    if not os.path.exists('results/'):
        os.makedirs('results/')

    z_t0 = jnp.array(np.random.multivariate_normal(mean=np.array([0., 0.]),
                                                   cov=np.array([[0.1, 0.], [0., 0.1]]),
                                                   size=viz_samples))
    logp_diff_t0 = jnp.zeros((viz_samples, 1), dtype=jnp.float32)

    func_pos = lambda states, t: CNF(in_out_dim, hidden_dim, width).apply({'params': pos_params}, t, states)
    z_t_samples, _ = solve_dynamics(func_pos, (z_t0, logp_diff_t0), jnp.linspace(t0, t1, viz_timesteps))

    # Generate the evolution of the density on a grid.
    x = jnp.linspace(-1.5, 1.5, 100)
    y = jnp.linspace(-1.5, 1.5, 100)
    points = np.vstack(jnp.meshgrid(x, y)).reshape([2, -1]).T

    z_t1 = jnp.array(points, dtype=jnp.float32)
    logp_diff_t1 = jnp.zeros((z_t1.shape[0], 1), dtype=jnp.float32)
    func_neg = lambda states, t: Neg_CNF(in_out_dim, hidden_dim, width).apply({'params': neg_params}, -t, states)
    z_t_density, logp_diff_t = solve_dynamics(func_neg, (z_t1, logp_diff_t1), -jnp.linspace(t1, t0, viz_timesteps))

    return z_t_samples, z_t_density, logp_diff_t, viz_timesteps, target_sample, z_t1


def create_plots(z_t_samples, z_t_density, logp_diff_t, t0, t1, viz_timesteps, target_sample, z_t1):
    # Create plots for each timestep.
    for (t, z_sample, z_density, logp_diff) in zip(
            tqdm(np.linspace(t0, t1, viz_timesteps)),
            z_t_samples, z_t_density, logp_diff_t):
        fig = plt.figure(figsize=(12, 4), dpi=200)
        plt.tight_layout()
        plt.axis('off')
        plt.margins(0, 0)
        fig.suptitle(f'{t:.2f}s')

        ax1 = fig.add_subplot(1, 3, 1)
        ax1.set_title('Target')
        ax1.get_xaxis().set_ticks([])
        ax1.get_yaxis().set_ticks([])
        ax2 = fig.add_subplot(1, 3, 2)
        ax2.set_title('Samples')
        ax2.get_xaxis().set_ticks([])
        ax2.get_yaxis().set_ticks([])
        ax3 = fig.add_subplot(1, 3, 3)
        ax3.set_title('Log Probability')
        ax3.get_xaxis().set_ticks([])
        ax3.get_yaxis().set_ticks([])

        ax1.hist2d(*jnp.transpose(target_sample), bins=300, density=True,
                   range=[[-1.5, 1.5], [-1.5, 1.5]])

        ax2.hist2d(*jnp.transpose(z_sample), bins=300, density=True,
                   range=[[-1.5, 1.5], [-1.5, 1.5]])

        p_z0 = lambda x: scipy.stats.multivariate_normal.logpdf(x,
                                                                mean=jnp.array([0., 0.]),
                                                                cov=jnp.array([[0.1, 0.], [0., 0.1]]))
        # lax.reshape expects a shape tuple, hence (n,) rather than a bare int.
        logp = p_z0(z_density) - lax.reshape(logp_diff, (z_density.shape[0],))
        ax3.tricontourf(*jnp.transpose(z_t1),
                        jnp.exp(logp), 200)

        plt.savefig(os.path.join('results/', f"cnf-viz-{int(t * 1000):05d}.jpg"),
                    pad_inches=0.2, bbox_inches='tight')
        plt.close()

    img, *imgs = [Image.open(f) for f in sorted(glob.glob(os.path.join('results/', "cnf-viz-*.jpg")))]
    img.save(fp=os.path.join('results/', "cnf-viz.gif"), format='GIF', append_images=imgs,
             save_all=True, duration=250, loop=0)

    print('Saved visualization animation at {}'.format(os.path.join('results/', "cnf-viz.gif")))


if __name__ == '__main__':
    train(0.001, 100, 512, 2, 32, 64, 0., 10., True)
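The visualization branch above converts the trained `Neg_CNF` parameters into a tree the forward-time `CNF` can consume by flattening the parameter dict, stripping a fixed-length module prefix from each key (`key[6:]`), and unflattening. A minimal sketch of the same key-renaming trick on a toy parameter tree (the `'wrap'` module name and the shapes are made up for illustration):

```python
import jax.numpy as jnp
from flax import traverse_util
from flax.core import freeze

# Toy nested params: everything lives under a hypothetical 'wrap' submodule.
neg_params = {'wrap': {'Dense_0': {'kernel': jnp.ones((2, 2)), 'bias': jnp.zeros(2)}}}

# Flatten to {'wrap/Dense_0/kernel': ..., 'wrap/Dense_0/bias': ...}.
flat = {'/'.join(k): v for k, v in traverse_util.flatten_dict(neg_params).items()}

# Strip the 'wrap/' prefix from every key (analogous to key[6:] above).
renamed = {k[len('wrap/'):]: v for k, v in flat.items()}

# Unflatten back into a nested tree and freeze it for Module.apply.
pos_params = freeze(traverse_util.unflatten_dict(
    {tuple(k.split('/')): v for k, v in renamed.items()}))
print(list(pos_params.keys()))  # ['Dense_0']
```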
train_ode.py
ADDED
@@ -0,0 +1,271 @@
from functools import partial
import jax
from typing import Any, Callable, Sequence, Optional, NewType
from jax import lax, random, vmap, numpy as jnp
from jax.experimental.ode import odeint
from jax.experimental import host_callback
import flax
from flax.training import train_state
from flax import traverse_util
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax import serialization
import optax
import tensorflow_datasets as tfds
import numpy as np
from tqdm import tqdm
import os


# Define residual block
class ResDownBlock(nn.Module):
    """Single ResBlock w/ downsample."""
    dim_out: Any = 64

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(x))
        x = nn.Conv(features=self.dim_out, kernel_size=(1, 1), strides=(2, 2))(x)
        f_x = nn.Conv(features=self.dim_out, kernel_size=(3, 3), strides=(2, 2))(f_x)
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(f_x))
        f_x = nn.Conv(features=self.dim_out, kernel_size=(3, 3))(f_x)
        x = f_x + x
        return x


class ConcatConv2D(nn.Module):
    """Concatenate the time variable to the hidden layer before a convolution."""
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, inputs, t):
        x = inputs
        tt = jnp.ones_like(x[..., :1]) * t
        ttx = jnp.concatenate([tt, x], -1)
        return nn.Conv(features=self.dim_out, kernel_size=(self.ksize, self.ksize))(ttx)


# Define the Neural ODE for the MNIST example.
class ODEfunc(nn.Module):
    """ODE function which replaces the ResNet blocks."""
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, inputs, t):
        # TODO: count the number of function evaluations.
        host_callback.call(nfecounter.count, 1)

        x = inputs
        out = nn.GroupNorm(self.dim_out)(x)
        out = nn.relu(out)
        out = ConcatConv2D(self.dim_out, self.ksize)(out, t)
        out = nn.GroupNorm(self.dim_out)(out)
        out = nn.relu(out)
        out = ConcatConv2D(self.dim_out, self.ksize)(out, t)
        out = nn.GroupNorm(self.dim_out)(out)

        return out


class NFEcounter:
    def __init__(self, init_nfe):
        self.nfe = init_nfe

    def count(self, increase):
        self.nfe += increase

    def set(self, target):
        self.nfe = target

# Define the global NFE counter.
nfecounter = NFEcounter(0)


class ODEBlock(nn.Module):
    """ODE block which contains odeint."""
    tol: Any = 1.

    @nn.compact
    def __call__(self, inputs, params):
        ode_func = ODEfunc()
        ode_func_apply = lambda x, t: ode_func.apply(variables={'params': params}, inputs=x, t=t)
        init_state, final_state = odeint(ode_func_apply,
                                         inputs, jnp.array([0., 1.]),
                                         rtol=self.tol, atol=self.tol)
        return final_state


class ODEBlockVmap(nn.Module):
    """Apply vmap to ODEBlock."""
    tol: Any = 1.

    @nn.compact
    def __call__(self, inputs, params):
        x = inputs
        vmap_odeblock = nn.vmap(ODEBlock,
                                variable_axes={'params': 0},
                                split_rngs={'params': True},
                                in_axes=(0, None))

        return vmap_odeblock(tol=self.tol, name='odeblock')(x, params)

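# A note on the nn.vmap wrapper above: in_axes=(0, None) maps the ODE block
# over the leading (batch) axis of the inputs while broadcasting the single
# shared set of ODE-function parameters to every example, so each image is
# integrated through the same dynamics.
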
class FullODENet(nn.Module):
    """Full ODE net: two downsampling layers, an ODE block, and a linear classifier."""
    dim_out: Any = 64
    ksize: Any = 3
    tol: Any = 1.

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        x = nn.Conv(features=self.dim_out, kernel_size=(self.ksize, self.ksize))(x)
        x = ResDownBlock()(x)
        x = ResDownBlock()(x)

        ode_func = ODEfunc()
        init_fn = lambda rng, x: ode_func.init(random.split(rng)[-1], x, 0.)['params']
        ode_func_params = self.param('ode_func', init_fn, jnp.ones_like(x[0]))
        x = ODEBlockVmap(tol=self.tol)(x, ode_func_params)

        x = nn.GroupNorm(self.dim_out)(x)
        x = nn.relu(x)
        x = nn.avg_pool(x, (1, 1))

        x = x.reshape((x.shape[0], -1))  # flatten

        x = nn.Dense(features=10)(x)
        x = nn.log_softmax(x)

        return x


# Define loss
@jax.jit
def cross_entropy_loss(logits, labels):
    one_hot_labels = jax.nn.one_hot(labels, num_classes=10)
    return -jnp.mean(jnp.sum(one_hot_labels * logits, axis=-1))


# Metric computation
@jax.jit
def compute_metrics(logits, labels, nfe_forward, nfe_backward):
    loss = cross_entropy_loss(logits=logits, labels=labels)
    accuracy = jnp.mean(jnp.argmax(logits, -1) == labels)
    metrics = {
        'loss': loss,
        'accuracy': accuracy,
        'nfe_forward': nfe_forward,
        'nfe_backward': nfe_backward
    }
    return metrics


def get_datasets():
    """Load MNIST train and test datasets into memory."""
    ds_builder = tfds.builder('mnist')
    ds_builder.download_and_prepare()
    train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1))
    test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1))
    train_ds['image'] = jnp.float32(train_ds['image']) / 255.
    test_ds['image'] = jnp.float32(test_ds['image']) / 255.
    return train_ds, test_ds


def create_train_state(rng, learning_rate, tol):
    """Creates the initial 'TrainState'."""
    odenet = FullODENet(tol=tol)
    params = odenet.init(rng, jnp.ones([1, 28, 28, 1]))['params']
    tx = optax.adam(learning_rate)
    return train_state.TrainState.create(
        apply_fn=odenet.apply, params=params, tx=tx
    )


# Training step
@partial(jax.jit, static_argnums=(2,))
def train_step(state, batch, tol):
    """Train for a single step."""
    def loss_fn(params):
        logits = FullODENet(tol=tol).apply({'params': params}, batch['image'])
        loss = cross_entropy_loss(logits=logits, labels=batch['label'])
        return loss, logits
    grad_fn = jax.grad(loss_fn, has_aux=True)
    host_callback.call(nfecounter.set, 0)
    (_, logits) = loss_fn(state.params)
    nfe_forward = nfecounter.nfe
    host_callback.call(nfecounter.set, 0)
    grads, _ = grad_fn(state.params)
    nfe_backward = nfecounter.nfe
    state = state.apply_gradients(grads=grads)
    metrics = compute_metrics(logits=logits, labels=batch['label'],
                              nfe_forward=nfe_forward, nfe_backward=nfe_backward)
    return state, metrics


# Evaluation step
@partial(jax.jit, static_argnums=(2,))
def eval_step(params, batch, tol):
    logits = FullODENet(tol=tol).apply({'params': params}, batch['image'])
    return compute_metrics(logits=logits, labels=batch['label'], nfe_forward=0, nfe_backward=0)


# Train function
def train_epoch(state, train_ds, batch_size, epoch, rng, tol):
    """Train for a single epoch."""
    train_ds_size = len(train_ds['image'])
    steps_per_epoch = train_ds_size // batch_size

    perms = jax.random.permutation(rng, len(train_ds['image']))
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    batch_metrics = []
    for perm in tqdm(perms):
        batch = {k: v[perm, ...] for k, v in train_ds.items()}
        state, metrics = train_step(state, batch, tol)
        batch_metrics.append(metrics)

    # Compute the mean of each metric across the batches in the epoch.
    batch_metrics_np = jax.device_get(batch_metrics)
    epoch_metrics_np = {
        k: np.mean([metrics[k] for metrics in batch_metrics_np])
        for k in batch_metrics_np[0]
    }
    print('train epoch: %d, loss: %.4f, accuracy: %.2f, nfe_forward: %d, nfe_backward: %d' % (
        epoch, epoch_metrics_np['loss'], epoch_metrics_np['accuracy'] * 100,
        epoch_metrics_np['nfe_forward'], epoch_metrics_np['nfe_backward']
    ))

    return state


# Eval function
def eval_model(params, test_ds, tol):
    metrics = eval_step(params, test_ds, tol)
    metrics = jax.device_get(metrics)
    summary = jax.tree_map(lambda x: x.item(), metrics)
    return summary['loss'], summary['accuracy']


def train_and_evaluate(learning_rate, n_epoch, batch_size, tol):
    train_ds, test_ds = get_datasets()
    rng = jax.random.PRNGKey(0)
    rng, init_rng = jax.random.split(rng)

    state = create_train_state(init_rng, learning_rate, tol)
    del init_rng  # Must not be used anymore.

    for epoch in tqdm(range(1, n_epoch + 1)):
        rng, input_rng = jax.random.split(rng)
        state = train_epoch(state, train_ds, batch_size, epoch, input_rng, tol)
        test_loss, test_accuracy = eval_model(state.params, test_ds, tol)
        print(' test epoch: %d, loss: %.2f, accuracy: %.2f' % (
            epoch, test_loss, test_accuracy * 100
        ))


if __name__ == '__main__':
    train_and_evaluate(0.0001, 5, 128, 1.)
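The `nfecounter` trick above uses `host_callback` to bump a Python-side counter each time the traced dynamics function is evaluated by the adaptive solver, which is how the NFE (number of function evaluations) metrics are collected. A minimal, self-contained sketch of the same pattern, mirroring the module's `host_callback.call` usage (the `Counter` class and `decay` dynamics here are illustrative, not part of the repo):

```python
import jax.numpy as jnp
from jax.experimental import host_callback
from jax.experimental.ode import odeint

class Counter:
    """Python-side state, reachable from inside traced code via host_callback."""
    def __init__(self):
        self.n = 0
    def count(self, k):
        self.n += k

counter = Counter()

def decay(y, t):
    host_callback.call(counter.count, 1)  # runs on the host per solver evaluation
    return -y

counter.n = 0
odeint(decay, jnp.ones(2), jnp.array([0., 1.]), rtol=1e-3, atol=1e-3)
print('NFE at rtol=1e-3:', counter.n)

counter.n = 0
odeint(decay, jnp.ones(2), jnp.array([0., 1.]), rtol=1e-8, atol=1e-8)
print('NFE at rtol=1e-8:', counter.n)  # tighter tolerance => more evaluations
```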
train_resnet.py
ADDED
@@ -0,0 +1,195 @@
from functools import partial
import jax
from typing import Any, Callable, Sequence, Optional, NewType
from jax import lax, random, vmap, numpy as jnp
from jax.experimental.ode import odeint
import flax
from flax.training import train_state
from flax import traverse_util
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax import serialization
import optax
import tensorflow_datasets as tfds
import numpy as np
from tqdm import tqdm
import os


# Define residual blocks
class ResDownBlock(nn.Module):
    """Single ResBlock w/ downsample."""
    dim_out: Any = 64

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(x))
        x = nn.Conv(features=self.dim_out, kernel_size=(1, 1), strides=(2, 2))(x)
        f_x = nn.Conv(features=self.dim_out, kernel_size=(3, 3), strides=(2, 2))(f_x)
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(f_x))
        f_x = nn.Conv(features=self.dim_out, kernel_size=(3, 3))(f_x)
        x = f_x + x
        return x


class ResBlock(nn.Module):
    """Single ResBlock w/o downsample."""
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(x))
        f_x = nn.Conv(features=self.dim_out, kernel_size=(self.ksize, self.ksize))(f_x)
        f_x = nn.relu(nn.GroupNorm(self.dim_out)(f_x))
        f_x = nn.Conv(features=self.dim_out, kernel_size=(self.ksize, self.ksize))(f_x)
        x = f_x + x
        return x


# Define a small ResNet for the MNIST example
class SmallResNet(nn.Module):
    dim_out: Any = 64
    ksize: Any = 3

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        x = nn.Conv(features=self.dim_out, kernel_size=(self.ksize, self.ksize))(x)
        x = ResDownBlock()(x)
        x = ResDownBlock()(x)

        x = ResBlock()(x)
        x = ResBlock()(x)
        x = ResBlock()(x)
        x = ResBlock()(x)
        x = ResBlock()(x)
        x = ResBlock()(x)

        x = nn.GroupNorm(self.dim_out)(x)
        x = nn.relu(x)
        x = nn.avg_pool(x, (1, 1))

        x = x.reshape((x.shape[0], -1))  # flatten

        x = nn.Dense(features=10)(x)
        x = nn.log_softmax(x)

        return x

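# Shape trace for a (N, 28, 28, 1) MNIST batch (assuming Flax's default SAME
# padding): the two stride-2 ResDownBlocks take 28x28 -> 14x14 -> 7x7 with 64
# channels, so the flatten yields (N, 7 * 7 * 64) = (N, 3136) before the
# 10-way Dense classifier.
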
# Define loss
@jax.jit
def cross_entropy_loss(logits, labels):
    one_hot_labels = jax.nn.one_hot(labels, num_classes=10)
    return -jnp.mean(jnp.sum(one_hot_labels * logits, axis=-1))


# Metric computation
@jax.jit
def compute_metrics(logits, labels):
    loss = cross_entropy_loss(logits=logits, labels=labels)
    accuracy = jnp.mean(jnp.argmax(logits, -1) == labels)
    metrics = {
        'loss': loss,
        'accuracy': accuracy,
    }
    return metrics


def get_datasets():
    """Load MNIST train and test datasets into memory."""
    ds_builder = tfds.builder('mnist')
    ds_builder.download_and_prepare()
    train_ds = tfds.as_numpy(ds_builder.as_dataset(split='train', batch_size=-1))
    test_ds = tfds.as_numpy(ds_builder.as_dataset(split='test', batch_size=-1))
    train_ds['image'] = jnp.float32(train_ds['image']) / 255.
    test_ds['image'] = jnp.float32(test_ds['image']) / 255.
    return train_ds, test_ds


def create_train_state(rng, learning_rate):
    """Creates the initial 'TrainState'."""
    resnet = SmallResNet()
    params = resnet.init(rng, jnp.ones([1, 28, 28, 1]))['params']
    tx = optax.adam(learning_rate)
    return train_state.TrainState.create(
        apply_fn=resnet.apply, params=params, tx=tx
    )


# Training step
@jax.jit
def train_step(state, batch):
    """Train for a single step."""
    def loss_fn(params):
        logits = SmallResNet().apply({'params': params}, batch['image'])
        loss = cross_entropy_loss(logits=logits, labels=batch['label'])
        return loss, logits
    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (_, logits), grads = grad_fn(state.params)
    state = state.apply_gradients(grads=grads)
    metrics = compute_metrics(logits=logits, labels=batch['label'])
    return state, metrics


# Evaluation step
@jax.jit
def eval_step(params, batch):
    logits = SmallResNet().apply({'params': params}, batch['image'])
    return compute_metrics(logits=logits, labels=batch['label'])


# Train function
def train_epoch(state, train_ds, batch_size, epoch, rng):
    """Train for a single epoch."""
    train_ds_size = len(train_ds['image'])
    steps_per_epoch = train_ds_size // batch_size

    perms = jax.random.permutation(rng, len(train_ds['image']))
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    batch_metrics = []
    for perm in tqdm(perms):
        batch = {k: v[perm, ...] for k, v in train_ds.items()}
        state, metrics = train_step(state, batch)
        batch_metrics.append(metrics)

    # Compute the mean of each metric across the batches in the epoch.
    batch_metrics_np = jax.device_get(batch_metrics)
    epoch_metrics_np = {
        k: np.mean([metrics[k] for metrics in batch_metrics_np])
        for k in batch_metrics_np[0]
    }
    print('train epoch: %d, loss: %.4f, accuracy: %.2f' % (
        epoch, epoch_metrics_np['loss'], epoch_metrics_np['accuracy'] * 100
    ))

    return state


# Eval function
def eval_model(params, test_ds):
    metrics = eval_step(params, test_ds)
    metrics = jax.device_get(metrics)
    summary = jax.tree_map(lambda x: x.item(), metrics)
    return summary['loss'], summary['accuracy']


def train_and_evaluate(learning_rate, n_epoch, batch_size):
    train_ds, test_ds = get_datasets()
    rng = jax.random.PRNGKey(0)
    rng, init_rng = jax.random.split(rng)

    state = create_train_state(init_rng, learning_rate)
    del init_rng  # Must not be used anymore.

    for epoch in tqdm(range(1, n_epoch + 1)):
        rng, input_rng = jax.random.split(rng)
        state = train_epoch(state, train_ds, batch_size, epoch, input_rng)
        test_loss, test_accuracy = eval_model(state.params, test_ds)
        print(' test epoch: %d, loss: %.2f, accuracy: %.2f' % (
            epoch, test_loss, test_accuracy * 100
        ))
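Note that `flax.serialization` is imported above but never used. If one wanted to persist `state.params` at the end of training, a minimal sketch could look like the following (the helper names and file path are hypothetical, not part of the repo):

```python
from flax import serialization

# Hypothetical helpers for persisting and restoring trained params.
def save_params(params, path='resnet_params.msgpack'):
    with open(path, 'wb') as f:
        f.write(serialization.to_bytes(params))

def load_params(template_params, path='resnet_params.msgpack'):
    # from_bytes needs a template with the same tree structure as the saved params.
    with open(path, 'rb') as f:
        return serialization.from_bytes(template_params, f.read())
```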