File size: 35,896 Bytes

b396e7b

/**
 * Soft Actor Critic Agent https://arxiv.org/abs/1812.05905
 * without value network.
 */
const AgentSac = (() => {
    /**
     * Compares `tensor.shape` with an expected shape and reports a mismatch via `console.assert`.
     * Nothing is thrown: on a mismatch the message is only written to the console.
     *
     * @param {{shape: number[]}} tensor - object exposing a `shape` array (e.g. a tf.Tensor)
     * @param {number[]} shape - expected shape
     * @param {string} [msg = ''] - prefix of the reported message
     */
    const assertShape = (tensor, shape, msg = '') => {
        // Shapes are compared through their JSON form, i.e. element by element and by length
        const actual = JSON.stringify(tensor.shape)
        const expected = JSON.stringify(shape)
        const report = `${msg} shape ${tensor.shape} is not ${shape}`

        console.assert(actual === expected, report)
    }

    // const VERSION = 1 // +100 for bump tower
    // const VERSION = 2 // balls
    // const VERSION = 3 // tests
    // const VERSION = 4 // tests
    // const VERSION = 5 // exp #1
    // const VERSION = 6 // exp #2
    // const VERSION = 7 // exp #3
    // const VERSION = 8 // exp #4
    // const VERSION = 9 // exp #
    // const VERSION = 10 // exp # good, doesn't touch
    // const VERSION = 11 // exp #
    // const VERSION = 12 // exp # 25x25
    // const VERSION = 13 // exp # 25x25 single CNN
    // const VERSION = 15 // 15.1 stable RB 10^5
    // const VERSION = 16 // reward from RL2, rb 10^6, gr/red balls, bad
    // const VERSION = 18 // reward from RL2, CNN from SAC paper, works!
    // const VERSION = 19 // moving balls, super!
    // const VERSION = 20 // moving balls, discret impulse, bad
    // const VERSION = 21 // independant look
    // const VERSION = 22 // dqn arch, bad
    // const VERSION = 23 // dqn trunc, works! fast learn
    // const VERSION = 24 // dqn trunc 3 layers, super and fast
    // const VERSION = 25 // dqn trunc 3 layers 2x512, poor
    // const VERSION = 26 // rl2 cnn arc, bad too many weights
    // const VERSION = 27 // sac cnn 16x6x3->16x4x2->8x3x1->2x256 and 2 clr frames, 2h, kiss, Excellent!
    // const VERSION = 28 // same but 1 frame, works
    // const VERSION = 29 // 1fr w/o accel, poor
    // const VERSION = 30 // 2fr wide img, poor
    // const VERSION = 31 // 2 small imgs, small cnn out, poor
    // const VERSION = 32 // 2fr binacular
    // const VERSION = 33 // 4fr binacular, Good, but poor after reload on wider cage
    // const VERSION = 34 // 4fr binacular, smaller fov=2, angle 0.7, poor
    // const VERSION = 35 // 4fr binacular with dist, poor
    // const VERSION = 36 // 4fr binacular with dist, works but reload not
    // const VERSION = 37 // BCNN achiasma, good -> reload poor
    // const VERSION = 38 // BCNN achiasma, smaller cnn
    // const VERSION = 39 // 1fr BCNN achiasma, smaller cnn, works super fast, 30min
    // const VERSION = 40 // 2fr BCNN achiasma, 2l smaller cnn, poor
    // const VERSION = 41 // 2fr BCNN achiasma, 2l smaller cnn, some perfm after 30min
    // const VERSION = 41 // 1fr BCNN achiasma, 2l smaller cnn, super kiss, reload poor
    // const VERSION = 42 // 2fr BCNN achiasma, 2l smaller cnn, reload poor
    // const VERSION = 43 // 1fr BCNN achiasma, 3l, fov 0.8, 1h good, reload not bad
    // const VERSION = 44 // 2fr BCNN achiasma, 3l, fov 0.8, slow 1h, reload not bad, a bit better than 1fr, degrade
    // const VERSION = 45 // 1fr BCNN achiasma, 2l, fov 0.8, poor
    // const VERSION = 46 // 2fr BCNN achiasma, 2l, fov 0.8, fast 30 min but poor on reload
    // const VERSION = 47 // 1fr BCNN chiasma, 2l, fov 0.7, poor
    // const VERSION = 48 // 2fr BCNN chiasma, 2l, fov 0.7 poor
    // const VERSION = 49 // 1fr BCNN chiasma stacked, 3l, poor
    // const VERSION = 50 // 2fr 2nets monocular, 1h good, reload poor
    // const VERSION = 51 // 1fr 1nets monocular, stuck
    // const VERSION = 52 // 2fr 2nets monocular, poor
    // const VERSION = 53 // 2fr 2nets monocular, 
    // const VERSION = 54 // 2fr binocular
    // const VERSION = 55 // 2fr binocular
    // const VERSION = 56 // 2fr binocular
    // const VERSION = 57 // 1fr binocular, sphere vimeo super
    // const VERSION = 58 // 2fr binocular, sphere
    // const VERSION = 59 // 1fr binocular, sphere
    // const VERSION = 61 // 2fr binocular, sphere, 2lay BASELINE!!! cage 55, mass 2, ball mass 1
    // const VERSION = 62
    //const VERSION = 63 // 1fr 30min! cage 60
    // const VERSION = 64 // 2fr nores
    // const VERSION = 66 // 1fr 30min slightly slower
    // const VERSION = 67 // 2fr 30min as prev
    // const VERSION = 65 // 1fr l/r diff, 30min +400
    // const VERSION = 68 // 1fr l/r diff, 30min -100 good
    // const VERSION = 69 // 1fr l/r diff, 30min -190 good
    // const VERSION = 70 // 1fr l/r diff, 30min -420
    // const VERSION = 71 // 1fr l/r diff, 30min -480
    // const VERSION = 72 // 1fr no diff, 30min 
    // const VERSION = 73 // 1fr no diff, 30min -400 cage 50
    // const VERSION = 74 // 1fr diff, 30min 2.6k!
    // const VERSION = 75 // 1fr diff, 30min -300
    // const VERSION = 76 // 1fr diff, 20min +300!
    // const VERSION = 77 // 1fr diff, 20min +3.5k!
    // const VERSION = 78 // 1fr diff, 30min -90
    // const VERSION = 79 // 1fr NO diff, 25min +158
    // const VERSION = 80 // 1fr NO diff, 30min -200
    // const VERSION = 81 // 1fr NO diff, 20min +1200
    // const VERSION = 82 // 1fr NO diff, 30min
    // const VERSION = 83 // 1fr NO diff, priority 30min -400
    // Checkpoint version. It is appended to every storage key built by `_getChKey`,
    // so checkpoints saved under another version are not found when loading.
    const VERSION = 84 // 1fr diff, 30min

    // Bounds used to clip the actor's predicted log-std in `sampleAction` before it is exponentiated.
    const LOG_STD_MIN = -20
    const LOG_STD_MAX = 2
    // Added to the std in `_gaussianLikelihood` so the division never hits zero.
    const EPSILON = 1e-8
    // Base names of the models; `init` prepends the optional prefix and uses them as model names.
    const NAME = {
        ACTOR: 'actor',
        Q1: 'q1',
        Q2: 'q2',        
        Q1_TARGET: 'q1-target',
        Q2_TARGET: 'q2-target',
        ALPHA: 'alpha'
    }

    return class AgentSac {
        constructor({
            batchSize = 1, 
            frameShape = [25, 25, 3], 
            nFrames = 1, // Number of stacked frames per state
            nActions = 3, // 3 - impuls, 3 - RGB color
            nTelemetry = 10, // 3 - linear valocity, 3 - acceleration, 3 - collision point, 1 - lidar (tanh of distance)
            gamma = 0.99, // Discount factor (γ)
            tau = 5e-3, // Target smoothing coefficient (τ)
            trainable = true, // Whether the actor is trainable
            verbose = false,
            forced = false, // force to create fresh models (not from checkpoint)
            prefix = '', // for tests,
            sighted = true,
            rewardScale = 10
        } = {}) {
            this._batchSize = batchSize
            this._frameShape = frameShape 
            this._nFrames = nFrames
            this._nActions = nActions
            this._nTelemetry = nTelemetry
            this._gamma = gamma
            this._tau = tau
            this._trainable = trainable
            this._verbose = verbose
            this._inited = false
            this._prefix = (prefix === '' ? '' : prefix + '-')
            this._forced = forced
            this._sighted = sighted
            this._rewardScale = rewardScale
            
            this._frameStackShape = [...this._frameShape.slice(0, 2), this._frameShape[2] * this._nFrames]

            // https://github.com/rail-berkeley/softlearning/blob/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/algorithms/sac.py#L37
            this._targetEntropy = -nActions
        }

        /**
         * Initialization.
         */
        async init() {
            if (this._inited) throw Error('щ（ﾟДﾟщ）')

            this._frameInputL = tf.input({batchShape : [null, ...this._frameStackShape]})
            this._frameInputR = tf.input({batchShape : [null, ...this._frameStackShape]})

            this._telemetryInput = tf.input({batchShape : [null, this._nTelemetry]})
            
            this.actor = await this._getActor(this._prefix + NAME.ACTOR, this.trainable)
            
            if (!this._trainable)
                return
            
            this.actorOptimizer = tf.train.adam()

            this._actionInput = tf.input({batchShape : [null, this._nActions]})

            this.q1 = await this._getCritic(this._prefix + NAME.Q1)
            this.q1Optimizer = tf.train.adam()

            this.q2 = await this._getCritic(this._prefix + NAME.Q2)
            this.q2Optimizer = tf.train.adam()

            this.q1Targ = await this._getCritic(this._prefix + NAME.Q1_TARGET, true) // true for batch norm
            this.q2Targ = await this._getCritic(this._prefix + NAME.Q2_TARGET, true)

            this._logAlpha = await this._getLogAlpha(this._prefix + NAME.ALPHA)
            this.alphaOptimizer = tf.train.adam()

            this.updateTargets(1)

            // console.log('weights actorr', this.actor.getWeights().map(w => w.arraySync()))
            // console.log('weights q1q1q1', this.q1.getWeights().map(w => w.arraySync()))
            // console.log('weights q2Targ', this.q2Targ.getWeights().map(w => w.arraySync()))

            this._inited = true
        }

        /**
         * Trains networks on a batch from the replay buffer.
         * 
         * @param {{ state, action, reward, nextState }} - trnsitions in batch
         * @returns {void} nothing
         */
        train({ state, action, reward, nextState }) {
            if (!this._trainable)
                throw new Error('Actor is not trainable')

            return tf.tidy(() => {
                assertShape(state[0], [this._batchSize, this._nTelemetry], 'telemetry')
                assertShape(state[1], [this._batchSize, ...this._frameStackShape], 'frames')
                assertShape(action, [this._batchSize, this._nActions], 'action')
                assertShape(reward, [this._batchSize, 1], 'reward')
                assertShape(nextState[0], [this._batchSize, this._nTelemetry], 'nextState telemetry')
                assertShape(nextState[1], [this._batchSize, ...this._frameStackShape], 'nextState frames')

                this._trainCritics({ state, action, reward, nextState })
                this._trainActor(state)
                this._trainAlpha(state)
                
                this.updateTargets()
            })
        }

        /**
         * Train Q-networks.
         * 
         * @param {{ state, action, reward, nextState }} transition - transition
         */
        _trainCritics({ state, action, reward, nextState }) {
            const getQLossFunction = (() => {
                const [nextFreshAction, logPi] = this.sampleAction(nextState, true)

                const q1TargValue = this.q1Targ.predict(
                    this._sighted ? [...nextState, nextFreshAction] : [nextState[0], nextFreshAction], 
                    {batchSize: this._batchSize})
                const q2TargValue = this.q2Targ.predict(
                    this._sighted ? [...nextState, nextFreshAction] : [nextState[0], nextFreshAction], 
                    {batchSize: this._batchSize})
                
                const qTargValue = tf.minimum(q1TargValue, q2TargValue)
    
                // y = r + γ*(1 - d)*(min(Q1Targ(s', a'), Q2Targ(s', a')) - α*log(π(s'))
                const alpha = this._getAlpha()
                const target = reward.mul(tf.scalar(this._rewardScale)).add(
                    tf.scalar(this._gamma).mul(
                        qTargValue.sub(alpha.mul(logPi))
                    )
                )
                            
                assertShape(nextFreshAction, [this._batchSize, this._nActions], 'nextFreshAction')
                assertShape(logPi, [this._batchSize, 1], 'logPi')
                assertShape(qTargValue, [this._batchSize, 1], 'qTargValue')
                assertShape(target, [this._batchSize, 1], 'target')
    
                return (q) => () => {
                    const qValue = q.predict(
                        this._sighted ? [...state, action] : [state[0], action],
                        {batchSize: this._batchSize})
                    
                    // const loss = tf.scalar(0.5).mul(tf.losses.meanSquaredError(qValue, target))
                    const loss = tf.scalar(0.5).mul(tf.mean(qValue.sub(target).square()))
                    
                    assertShape(qValue, [this._batchSize, 1], 'qValue')

                    return loss
                }
            })()
    
            for (const [q, optimizer] of [
                [this.q1, this.q1Optimizer],
                [this.q2, this.q2Optimizer]
            ]) {
                const qLossFunction = getQLossFunction(q)
    
                const { value, grads } = tf.variableGrads(qLossFunction, q.getWeights(true)) // true means trainableOnly
                
                optimizer.applyGradients(grads)
                
                if (this._verbose) console.log(q.name + ' Loss: ' + value.arraySync())
            }
        }

        /**
         * Train actor networks.
         * 
         * @param {state} state 
         */
        _trainActor(state) {
            // TODO: consider delayed update of policy and targets (if possible)
            const actorLossFunction = () => {
                const [freshAction, logPi] = this.sampleAction(state, true)
                
                const q1Value = this.q1.predict(
                    this._sighted ? [...state, freshAction] : [state[0], freshAction],
                    {batchSize: this._batchSize})
                const q2Value = this.q2.predict(
                    this._sighted ? [...state, freshAction] : [state[0], freshAction], 
                    {batchSize: this._batchSize})
                
                const criticValue = tf.minimum(q1Value, q2Value)

                const alpha = this._getAlpha()
                const loss = alpha.mul(logPi).sub(criticValue)

                assertShape(freshAction, [this._batchSize, this._nActions], 'freshAction')
                assertShape(logPi, [this._batchSize, 1], 'logPi')
                assertShape(q1Value, [this._batchSize, 1], 'q1Value')
                assertShape(criticValue, [this._batchSize, 1], 'criticValue')
                assertShape(loss, [this._batchSize, 1], 'alpha loss')

                return tf.mean(loss)
            }
            
            const { value, grads } = tf.variableGrads(actorLossFunction, this.actor.getWeights(true)) // true means trainableOnly
            
            this.actorOptimizer.applyGradients(grads)

            if (this._verbose) console.log('Actor Loss: ' + value.arraySync())
        }

        _trainAlpha(state) {
            const alphaLossFunction = () => {
                const [, logPi] = this.sampleAction(state, true)

                const alpha = this._getAlpha()
                const loss = tf.scalar(-1).mul(
                    alpha.mul( // TODO: not sure whether this should be alpha or logAlpha
                        logPi.add(tf.scalar(this._targetEntropy))
                    )
                )

                assertShape(loss, [this._batchSize, 1], 'alpha loss')

                return tf.mean(loss)
            }
            
            const { value, grads } = tf.variableGrads(alphaLossFunction, [this._logAlpha]) // true means trainableOnly
            
            this.alphaOptimizer.applyGradients(grads)
            
            if (this._verbose) console.log('Alpha Loss: ' + value.arraySync(), tf.exp(this._logAlpha).arraySync())
        }

        /**
         * Soft update target Q-networks.
         * 
         * @param {number} [tau = this._tau] - smoothing constant τ for exponentially moving average: `wTarg <- wTarg*(1-tau) + w*tau`
         */
        updateTargets(tau = this._tau) {
            tau = tf.scalar(tau)

            const
                q1W = this.q1.getWeights(),
                q2W = this.q2.getWeights(),
                q1WTarg = this.q1Targ.getWeights(),
                q2WTarg = this.q2Targ.getWeights(),
                len = q1W.length

            // console.log('updateTargets q1W', q1W.map(w=>w.arraySync()))
            // console.log('updateTargets q1WTarg', q1WTarg.map(w=>w.arraySync()))

            const calc = (w, wTarg) => wTarg.mul(tf.scalar(1).sub(tau)).add(w.mul(tau))
            
            const w1 = [], w2 = []
            for (let i = 0; i < len; i++) {
                w1.push(calc(q1W[i], q1WTarg[i]))
                w2.push(calc(q2W[i], q2WTarg[i]))
            }
            
            this.q1Targ.setWeights(w1)
            this.q2Targ.setWeights(w2)


        }

        /**
         * Returns actions sampled from normal distribution using means and stds predicted by the actor.
         * 
         * @param {Tensor[]} state - state
         * @param {Tensor} [withLogProbs = false] - whether return log probabilities
         * @returns {Tensor || Tensor[]} action and log policy
         */
        sampleAction(state, withLogProbs = false) { // timer ~3ms
            return tf.tidy(() => {
                let [ mu, logStd ] = this.actor.predict(this._sighted ? state : state[0], {batchSize: this._batchSize})

                // https://github.com/rail-berkeley/rlkit/blob/c81509d982b4d52a6239e7bfe7d2540e3d3cd986/rlkit/torch/sac/policies/gaussian_policy.py#L106
                logStd = tf.clipByValue(logStd, LOG_STD_MIN, LOG_STD_MAX) 
                
                const std = tf.exp(logStd)

                // sample normal N(mu = 0, std = 1)
                const normal = tf.randomNormal(mu.shape, 0, 1.0)
        
                // reparameterization trick: z = mu + std * epsilon
                let pi = mu.add(std.mul(normal))

                let logPi = this._gaussianLikelihood(pi, mu, logStd)

                ;({ pi, logPi } = this._applySquashing(pi, mu, logPi))

                if (!withLogProbs)
                    return pi
        
                return [pi, logPi]
            })
        }

        /**
         * Calculates log probability of normal distribution https://en.wikipedia.org/wiki/Log_probability.
         * Converted to js from https://github.com/tensorflow/probability/blob/f3777158691787d3658b5e80883fe1a933d48989/tensorflow_probability/python/distributions/normal.py#L183
         * 
         * @param {Tensor} x - sample from normal distribution with mean `mu` and std `std`
         * @param {Tensor} mu - mean
         * @param {Tensor} std - standart deviation
         * @returns {Tensor} log probability
         */
        _logProb(x, mu, std)  {
            const logUnnormalized = tf.scalar(-0.5).mul(
                tf.squaredDifference(x.div(std), mu.div(std))
            )
            const logNormalization = tf.scalar(0.5 * Math.log(2 * Math.PI)).add(tf.log(std))
        
            return logUnnormalized.sub(logNormalization)
        }

        /**
         * Gaussian likelihood.
         * Translated from https://github.com/openai/spinningup/blob/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/sac/core.py#L24
         * 
         * @param {Tensor} x - sample from normal distribution with mean `mu` and std `exp(logStd)`
         * @param {Tensor} mu - mean
         * @param {Tensor} logStd - log of standart deviation
         * @returns {Tensor} log probability
         */
        _gaussianLikelihood(x, mu, logStd) {
            // pre_sum = -0.5 * (
            //     ((x-mu)/(tf.exp(log_std)+EPS))**2 
            //     + 2*log_std 
            //     + np.log(2*np.pi)
            // )

            const preSum = tf.scalar(-0.5).mul(
                x.sub(mu).div(
                    tf.exp(logStd).add(tf.scalar(EPSILON))
                ).square()
                .add(tf.scalar(2).mul(logStd))
                .add(tf.scalar(Math.log(2 * Math.PI)))
            )

            return tf.sum(preSum, 1, true)
        }

        /**
         * Adjustment to log probability when squashing action with tanh
         * Enforcing Action Bounds formula derivation https://stats.stackexchange.com/questions/239588/derivation-of-change-of-variables-of-a-probability-density-function
         * Translated from https://github.com/openai/spinningup/blob/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/sac/core.py#L48
         * 
         * @param {*} pi - policy sample
         * @param {*} mu - mean
         * @param {*} logPi - log probability
         * @returns {{ pi, mu, logPi }} squashed and adjasted input
         */
        _applySquashing(pi, mu, logPi) {
            // logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1)

            const adj = tf.scalar(2).mul(
                tf.scalar(Math.log(2))
                .sub(pi)
                .sub(tf.softplus(
                    tf.scalar(-2).mul(pi)
                ))
            )

            logPi = logPi.sub(tf.sum(adj, 1, true))
            mu = tf.tanh(mu)
            pi = tf.tanh(pi)

            return { pi, mu, logPi }
        }

        /**
         * Builds actor network model.
         * 
         * @param {string} [name = 'actor'] - name of the model
         * @param {string} trainable - whether a critic is trainable
         * @returns {tf.LayersModel} model
         */
        async _getActor(name = 'actor', trainable = true) {
            const checkpoint = await this._loadCheckpoint(name)
            if (checkpoint) return checkpoint

            let outputs = this._telemetryInput
            // outputs = tf.layers.dense({units: 128, activation: 'relu'}).apply(outputs)

            if (this._sighted) {
                let convOutputL = this._getConvEncoder(this._frameInputL)
                let convOutputR = this._getConvEncoder(this._frameInputR)
                // let convOutput = tf.layers.concatenate().apply([convOutputL, convOutputR])
                // convOutput = tf.layers.dense({units: 10, activation: 'relu'}).apply(convOutput)

                outputs = tf.layers.concatenate().apply([convOutputL, convOutputR, outputs])
            }

            outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)
            outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)

            const mu     = tf.layers.dense({units: this._nActions}).apply(outputs)
            const logStd = tf.layers.dense({units: this._nActions}).apply(outputs)

            const model = tf.model({inputs: this._sighted ? [this._telemetryInput, this._frameInputL, this._frameInputR] : [this._telemetryInput], outputs: [mu, logStd], name})
            model.trainable = trainable

            if (this._verbose) {
                console.log('==========================')
                console.log('==========================')
                console.log('Actor ' + name + ': ')

                model.summary()
            }

            return model
        }

        /**
         * Builds a critic network model.
         * 
         * @param {string} [name = 'critic'] - name of the model
         * @param {string} trainable - whether a critic is trainable
         * @returns {tf.LayersModel} model
         */
        async _getCritic(name = 'critic', trainable = true) {
            const checkpoint = await this._loadCheckpoint(name)
            if (checkpoint) return checkpoint

            let outputs = tf.layers.concatenate().apply([this._telemetryInput, this._actionInput])
            // outputs = tf.layers.dense({units: 128, activation: 'relu'}).apply(outputs)

            if (this._sighted) {
                let convOutputL = this._getConvEncoder(this._frameInputL)
                let convOutputR = this._getConvEncoder(this._frameInputR)
                // let convOutput = tf.layers.concatenate().apply([convOutputL, convOutputR])
                // convOutput = tf.layers.dense({units: 10, activation: 'relu'}).apply(convOutput)

                outputs = tf.layers.concatenate().apply([convOutputL, convOutputR, outputs])
            }

            outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)
            outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)

            outputs = tf.layers.dense({units: 1}).apply(outputs)

            const model = tf.model({
                inputs: this._sighted 
                    ? [this._telemetryInput, this._frameInputL, this._frameInputR, this._actionInput] 
                    : [this._telemetryInput, this._actionInput],
                outputs, name
            })

            model.trainable = trainable

            if (this._verbose) {
                console.log('==========================')
                console.log('==========================')
                console.log('CRITIC ' + name + ': ')
        
                model.summary()
            }

            return model
        }

        // _encoder = null
        // _getConvEncoder(inputs) {
        //     if (!this._encoder)
        //         this._encoder = this.__getConvEncoder(inputs)
            
        //     return this._encoder
        // }

        /**
         * Builds convolutional part of a network.
         * 
         * @param {Tensor} inputs - input for the conv layers
         * @returns outputs
         */
         _getConvEncoder(inputs) {
            const kernelSize = 3
            const padding = 'valid'
            const poolSize = 3
            const strides = 1
            // const depthwiseInitializer = 'heNormal'
            // const pointwiseInitializer = 'heNormal'
            const kernelInitializer = 'glorotNormal'
            const biasInitializer = 'glorotNormal'

            let outputs = inputs
            
            // 32x8x4 -> 64x4x2 -> 64x3x1 -> 64x4x1
            outputs = tf.layers.conv2d({
                filters: 16,
                kernelSize: 5,
                strides: 2,
                padding,
                kernelInitializer,
                biasInitializer,
                activation: 'relu',
                trainable: true
            }).apply(outputs)
            outputs = tf.layers.maxPooling2d({poolSize:2}).apply(outputs)
            // 
            // outputs = tf.layers.layerNormalization().apply(outputs)

            outputs = tf.layers.conv2d({
                filters: 16,
                kernelSize: 3,
                strides: 1,
                padding,
                kernelInitializer,
                biasInitializer,
                activation: 'relu',
                trainable: true
            }).apply(outputs)
            outputs = tf.layers.maxPooling2d({poolSize:2}).apply(outputs)

            // outputs = tf.layers.layerNormalization().apply(outputs)
            
            // outputs = tf.layers.conv2d({
            //     filters: 12,
            //     kernelSize: 3,
            //     strides: 1,
            //     padding,
            //     kernelInitializer,
            //     biasInitializer,
            //     activation: 'relu',
            //     trainable: true
            // }).apply(outputs)

            // outputs = tf.layers.conv2d({
            //     filters: 10,
            //     kernelSize: 2,
            //     strides: 1,
            //     padding,
            //     kernelInitializer,
            //     biasInitializer,
            //     activation: 'relu',
            //     trainable: true
            // }).apply(outputs)

            // outputs = tf.layers.conv2d({
            //     filters: 64,
            //     kernelSize: 4,
            //     strides: 1,
            //     padding,
            //     kernelInitializer,
            //     biasInitializer,
            //     activation: 'relu'
            // }).apply(outputs)

            // outputs = tf.layers.batchNormalization().apply(outputs)

            // outputs = tf.layers.layerNormalization().apply(outputs)

            outputs = tf.layers.flatten().apply(outputs)

            // convOutputs = tf.layers.dense({units: 96, activation: 'relu'}).apply(convOutputs)

            return outputs
        }

        /**
         * Returns clipped alpha.
         * 
         * @returns {Tensor} entropy
         */
        _getAlpha() {
            // return tf.maximum(tf.exp(this._logAlpha), tf.scalar(this._minAlpha))
            return tf.exp(this._logAlpha)
        }

        /**
         * Builds a log of entropy scale (α) for training.
         * 
         * @param {string} name 
         * @returns {tf.Variable} trainable variable for log entropy
         */
        async _getLogAlpha(name = 'alpha') {
            let logAlpha = 0.0

            const checkpoint = await this._loadCheckpoint(name)
            if (checkpoint) {
                logAlpha = checkpoint.getWeights()[0].arraySync()[0][0]

                if (this._verbose)
                    console.log('Checkpoint alpha: ', logAlpha)
                    
                this._logAlphaPlaceholder = checkpoint
            } else {
                const model = tf.sequential({ name });
                model.add(tf.layers.dense({ units: 1, inputShape: [1], useBias: false }))
                model.setWeights([tf.tensor([logAlpha], [1, 1])])

                this._logAlphaPlaceholder = model
            }

            return tf.variable(tf.scalar(logAlpha), true) // true -> trainable
        }

        /**
         * Saves all agent's models to the storage.
         */
        async checkpoint() {
            if (!this._trainable) throw new Error('(╭ರ_ ⊙ )')

            this._logAlphaPlaceholder.setWeights([tf.tensor([this._logAlpha.arraySync()], [1, 1])])

            await Promise.all([
                this._saveCheckpoint(this.actor),
                this._saveCheckpoint(this.q1),
                this._saveCheckpoint(this.q2),
                this._saveCheckpoint(this.q1Targ),
                this._saveCheckpoint(this.q2Targ),
                this._saveCheckpoint(this._logAlphaPlaceholder)
            ])

            if (this._verbose) 
                console.log('Checkpoint succesfully saved')
        }

        /**
         * Saves a model to the storage.
         * 
         * @param {tf.LayersModel} model 
         */
        async _saveCheckpoint(model) {
            const key = this._getChKey(model.name)
            const saveResults = await model.save(key)

            if (this._verbose) 
                console.log('Checkpoint saveResults', model.name, saveResults)
        }

        /**
         * Loads saved checkpoint from the storage.
         * 
         * @param {string} name model name
         * @returns {tf.LayersModel} model
         */
        async _loadCheckpoint(name) {
// return
            if (this._forced) {
                console.log('Forced to not load from the checkpoint ' + name)
                return
            }

            const key = this._getChKey(name)
            const modelsInfo = await tf.io.listModels()

            if (key in modelsInfo) {
                const model = await tf.loadLayersModel(key)

                if (this._verbose) 
                    console.log('Loaded checkpoint for ' + name)

                return model
            }
            
            if (this._verbose) 
                console.log('Checkpoint not found for ' + name)
        }
        
        /**
         * Builds the key for the model weights in LocalStorage.
         * 
         * @param {tf.LayersModel} name model name
         * @returns {string} key
         */
        _getChKey(name) {
            return 'indexeddb://' + name + '-' + VERSION
        }
    }
})()

/* TESTS */
/**
 * In-file smoke tests. They are DISABLED by the early `return` below.
 *
 * NOTE(review): the checkpoint test feeds a single telemetry tensor to the networks and uses
 * a [1, 10] action, while the constructor defaults are `sighted = true` and `nActions = 3`.
 * The fixtures look stale — bring them in line with the agent before removing the `return`.
 */
;(async () => {
    return 

    // Gaussian log-likelihood, μ = 0, σ = 1, x = 0
    // https://www.wolframalpha.com/input/?i2d=true&i=y%5C%2840%29x%5C%2844%29+%CE%BC%5C%2844%29+%CF%83%5C%2841%29+%3D+ln%5C%2840%29Divide%5B1%2CSqrt%5B2*%CF%80*Power%5B%CF%83%2C2%5D%5D%5D*Exp%5B-Divide%5B1%2C2%5D*%5C%2840%29Divide%5BPower%5B%5C%2840%29x-%CE%BC%5C%2841%29%2C2%5D%2CPower%5B%CF%83%2C2%5D%5D%5C%2841%29%5D%5C%2841%29
    ;(() => {
        const agent = new AgentSac()

        const 
            mu = tf.tensor([0], [1, 1]),     // mu = 0
            logStd = tf.tensor([0], [1, 1]), // logStd = 0
            std = tf.exp(logStd),            // std = 1
            normal = tf.tensor([0], [1, 1]), // N = 0
            pi = mu.add(std.mul(normal))     // x = 0
    
        const log = agent._gaussianLikelihood(pi, mu, logStd)

        console.assert(log.arraySync()[0][0].toFixed(5) === '-0.91894', 
            'test Gaussian Likelihood for μ=0, σ=1, x=0')
    })()

    // Gaussian log-likelihood, μ = 1, σ = e, x = μ
    ;(() => {
        const agent = new AgentSac()

        const 
            mu = tf.tensor([1], [1, 1]),     // mu = 1
            logStd = tf.tensor([1], [1, 1]), // logStd = 1
            std = tf.exp(logStd),            // std = e
            normal = tf.tensor([0], [1, 1]), // N = 0
            pi = mu.add(std.mul(normal))     // x = 1
    
        const log = agent._gaussianLikelihood(pi, mu, logStd)

        // Fixed message: x equals mu here, i.e. 1 (it used to say x=0)
        console.assert(log.arraySync()[0][0].toFixed(5) === '-1.91894',
            'test Gaussian Likelihood for μ=1, σ=e, x=1')
    })()

    // tanh squashing: the corrected log probability must equal logPi - log(1 - tanh(x)^2)
    ;(() => {
        const agent = new AgentSac()

        const 
            mu = tf.tensor([1], [1, 1]),       // mu = 1
            logStd = tf.tensor([1], [1, 1]),   // logStd = 1
            std = tf.exp(logStd),              // std = e
            normal = tf.tensor([0.1], [1, 1]), // N = 0.1
            pi = mu.add(std.mul(normal))       // x = 1.27182818
    
        const logPi = agent._gaussianLikelihood(pi, mu, logStd)
        const { pi: piSquashed, logPi: logPiSquashed } = agent._applySquashing(pi, mu, logPi)

        const logProbBounded = logPi.sub(
          tf.log(
            tf.scalar(1)
              .sub(tf.tanh(pi).pow(tf.scalar(2)))
              // .add(EPSILON)
          )
        ).sum(1, true)
        
        // Fixed messages: the fixture uses μ=1 and x=1.27182818 (they used to say -1 and -1.27182818)
        console.assert(logPi.arraySync()[0][0].toFixed(5) === '-1.92394',
            'test Gaussian Likelihood for μ=1, σ=e, x=1.27182818')

        console.assert(logPiSquashed.arraySync()[0][0].toFixed(5) === logProbBounded.arraySync()[0][0].toFixed(5),
            'test logPiSquashed for μ=1, σ=e, x=1.27182818')

        console.assert(piSquashed.arraySync()[0][0].toFixed(5) === tf.tanh(pi).arraySync()[0][0].toFixed(5),
            'test piSquashed for μ=1, σ=e, x=1.27182818')
    })()

    // Checkpoint round trip: a freshly built agent is saved and a second agent is restored from it
    await (async () => {
        const state = tf.tensor([
            0.5, 0.3, -0.9,
            0, -0.8, 1,
            -0.3, 0.04, 0.02,
            0.9
        ], [1, 10])

        const action = tf.tensor([
            0.1, -1, -0.4,
            1, -0.8, -0.8, -0.2,
            0.04, 0.02, 0.001
        ], [1, 10])
        
        const fresh = new AgentSac({ prefix: 'test', forced: true })
        await fresh.init()
        await fresh.checkpoint()
        
        const saved = new AgentSac({ prefix: 'test' })
        await saved.init()
        
        let frPred, saPred

        frPred = fresh.actor.predict(state, {batchSize: 1})
        saPred = saved.actor.predict(state, {batchSize: 1})
        console.assert(
            frPred[0].arraySync().length > 0 &&
            frPred[1].arraySync().length > 0 &&
            frPred[0].arraySync().join(';') === saPred[0].arraySync().join(';') &&
            frPred[1].arraySync().join(';') === saPred[1].arraySync().join(';'),
            'Models loaded from the checkpoint should be the same')
        
        // Right after init the target critic is a copy of the online critic
        frPred = fresh.q1.predict([state, action], {batchSize: 1})
        saPred = fresh.q1Targ.predict([state, action], {batchSize: 1})
        console.assert(
            frPred.arraySync()[0][0] !== undefined &&
            frPred.arraySync()[0][0] === saPred.arraySync()[0][0],
            'Q1 and Q1-target should be the same')

        frPred = fresh.q2.predict([state, action], {batchSize: 1})
        saPred = saved.q2.predict([state, action], {batchSize: 1})
        console.assert(
            frPred.arraySync()[0][0] !== undefined &&
            frPred.arraySync()[0][0] === saPred.arraySync()[0][0],
            'Q and Q restored should be the same')

        // Fixed: this assertion used to compare `fresh._logAlpha` with itself
        console.assert(
            fresh._logAlpha.arraySync() !== undefined &&
            fresh._logAlpha.arraySync() === saved._logAlpha.arraySync(),
            'Alpha and alpha restored should be the same')
    })()
})()