{"path": "numpy_ml/__init__.py", "content": ["# noqa\n", "\"\"\"Common ML and ML-adjacent algorithms implemented in NumPy\"\"\"\n", "\n", "from . import utils\n", "from . import preprocessing\n", "\n", "from . import gmm\n", "from . import hmm\n", "from . import lda\n", "from . import linear_models\n", "from . import neural_nets\n", "from . import ngram\n", "from . import nonparametric\n", "from . import rl_models\n", "from . import trees\n", "from . import bandits\n", "from . import factorization\n", "from . import tests"]} {"path": "numpy_ml/gmm/gmm.py", "content": ["\"\"\"A Gaussian mixture model class\"\"\"\n", "import numpy as np\n", "\n", "from numpy_ml.utils.misc import logsumexp, log_gaussian_pdf\n", "\n", "\n", "class GMM(object):\n", " def __init__(self, C=3, seed=None):\n", " \"\"\"\n", " A Gaussian mixture model trained via the expectation maximization\n", " algorithm.\n", "\n", " Parameters\n", " ----------\n", " C : int\n", " The number of clusters / mixture components in the GMM. Default is\n", " 3.\n", " seed : int\n", " Seed for the random number generator. Default is None.\n", "\n", " Attributes\n", " ----------\n", " N : int\n", " The number of examples in the training dataset.\n", " d : int\n", " The dimension of each example in the training dataset.\n", " pi : :py:class:`ndarray ` of shape `(C,)`\n", " The cluster priors.\n", " Q : :py:class:`ndarray ` of shape `(N, C)`\n", " The variational distribution `q(T)`.\n", " mu : :py:class:`ndarray ` of shape `(C, d)`\n", " The cluster means.\n", " sigma : :py:class:`ndarray ` of shape `(C, d, d)`\n", " The cluster covariance matrices.\n", " \"\"\"\n", " self.elbo = None\n", " self.parameters = {}\n", " self.hyperparameters = {\n", " \"C\": C,\n", " \"seed\": seed,\n", " }\n", "\n", " self.is_fit = False\n", "\n", " if seed:\n", " np.random.seed(seed)\n", "\n", " def _initialize_params(self, X):\n", " \"\"\"Randomly initialize the starting GMM parameters.\"\"\"\n", " N, d = X.shape\n", " C = self.hyperparameters[\"C\"]\n", "\n", " rr = np.random.rand(C)\n", "\n", " self.parameters = {\n", " \"pi\": rr / rr.sum(), # cluster priors\n", " \"Q\": np.zeros((N, C)), # variational distribution q(T)\n", " \"mu\": np.random.uniform(-5, 10, C * d).reshape(C, d), # cluster means\n", " \"sigma\": np.array([np.eye(d) for _ in range(C)]), # cluster covariances\n", " }\n", "\n", " self.elbo = None\n", " self.is_fit = False\n", "\n", " def likelihood_lower_bound(self, X):\n", " \"\"\"Compute the LLB under the current GMM parameters.\"\"\"\n", " N = X.shape[0]\n", " P = self.parameters\n", " C = self.hyperparameters[\"C\"]\n", " pi, Q, mu, sigma = P[\"pi\"], P[\"Q\"], P[\"mu\"], P[\"sigma\"]\n", "\n", " eps = np.finfo(float).eps\n", " expec1, expec2 = 0.0, 0.0\n", " for i in range(N):\n", " x_i = X[i]\n", "\n", " for c in range(C):\n", " pi_k = pi[c]\n", " z_nk = Q[i, c]\n", " mu_k = mu[c, :]\n", " sigma_k = sigma[c, :, :]\n", "\n", " log_pi_k = np.log(pi_k + eps)\n", " log_p_x_i = log_gaussian_pdf(x_i, mu_k, sigma_k)\n", " prob = z_nk * (log_p_x_i + log_pi_k)\n", "\n", " expec1 += prob\n", " expec2 += z_nk * np.log(z_nk + eps)\n", "\n", " loss = expec1 - expec2\n", " return loss\n", "\n", " def fit(self, X, max_iter=100, tol=1e-3, verbose=False):\n", " \"\"\"\n", " Fit the parameters of the GMM on some training data.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, d)`\n", " A collection of `N` training data points, each with dimension `d`.\n", " max_iter : int\n", " The maximum number of EM updates to 
perform before terminating\n", " training. Default is 100.\n", " tol : float\n", " The convergence tolerance. Training is terminated if the difference\n", " in VLB between the current and previous iteration is less than\n", " `tol`. Default is 1e-3.\n", " verbose : bool\n", " Whether to print the VLB at each training iteration. Default is\n", " False.\n", "\n", " Returns\n", " -------\n", " success : {0, -1}\n", " Whether training terminated without incident (0) or one of the\n", " mixture components collapsed and training was halted prematurely\n", " (-1).\n", " \"\"\"\n", " prev_vlb = -np.inf\n", " self._initialize_params(X)\n", "\n", " for _iter in range(max_iter):\n", " try:\n", " self._E_step(X)\n", " self._M_step(X)\n", " vlb = self.likelihood_lower_bound(X)\n", "\n", " if verbose:\n", " print(f\"{_iter + 1}. Lower bound: {vlb}\")\n", "\n", " converged = _iter > 0 and np.abs(vlb - prev_vlb) <= tol\n", " if np.isnan(vlb) or converged:\n", " break\n", "\n", " prev_vlb = vlb\n", "\n", " except np.linalg.LinAlgError:\n", " print(\"Singular matrix: components collapsed\")\n", " return -1\n", "\n", " self.elbo = vlb\n", " self.is_fit = True\n", " return 0\n", "\n", " def predict(self, X, soft_labels=True):\n", " \"\"\"\n", " Return the log probability of each data point in `X` under each\n", " mixture components.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(M, d)`\n", " A collection of `M` data points, each with dimension `d`.\n", " soft_labels : bool\n", " If True, return the log probabilities of the M data points in X\n", " under each mixture component. If False, return only the ID of the\n", " most probable mixture. Default is True.\n", "\n", " Returns\n", " -------\n", " y : :py:class:`ndarray ` of shape `(M, C)` or `(M,)`\n", " If `soft_labels` is True, `y` is a 2D array where index (i,j) gives\n", " the log probability of the `i` th data point under the `j` th\n", " mixture component. 
If `soft_labels` is False, `y` is a 1D array\n", " where the `i` th index contains the ID of the most probable mixture\n", " component.\n", " \"\"\"\n", " assert self.is_fit, \"Must call the `.fit` method before making predictions\"\n", "\n", " P = self.parameters\n", " C = self.hyperparameters[\"C\"]\n", " mu, sigma = P[\"mu\"], P[\"sigma\"]\n", "\n", " y = []\n", " for x_i in X:\n", " cprobs = [log_gaussian_pdf(x_i, mu[c, :], sigma[c, :, :]) for c in range(C)]\n", "\n", " if not soft_labels:\n", " y.append(np.argmax(cprobs))\n", " else:\n", " y.append(cprobs)\n", "\n", " return np.array(y)\n", "\n", " def _E_step(self, X):\n", " P = self.parameters\n", " C = self.hyperparameters[\"C\"]\n", " pi, Q, mu, sigma = P[\"pi\"], P[\"Q\"], P[\"mu\"], P[\"sigma\"]\n", "\n", " for i, x_i in enumerate(X):\n", " denom_vals = []\n", " for c in range(C):\n", " pi_c = pi[c]\n", " mu_c = mu[c, :]\n", " sigma_c = sigma[c, :, :]\n", "\n", " log_pi_c = np.log(pi_c)\n", " log_p_x_i = log_gaussian_pdf(x_i, mu_c, sigma_c)\n", "\n", " # log N(X_i | mu_c, Sigma_c) + log pi_c\n", " denom_vals.append(log_p_x_i + log_pi_c)\n", "\n", " # log \\sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c } ]\n", " log_denom = logsumexp(denom_vals)\n", " q_i = np.exp([num - log_denom for num in denom_vals])\n", " np.testing.assert_allclose(np.sum(q_i), 1, err_msg=\"{}\".format(np.sum(q_i)))\n", "\n", " Q[i, :] = q_i\n", "\n", " def _M_step(self, X):\n", " N, d = X.shape\n", " P = self.parameters\n", " C = self.hyperparameters[\"C\"]\n", " pi, Q, mu, sigma = P[\"pi\"], P[\"Q\"], P[\"mu\"], P[\"sigma\"]\n", "\n", " denoms = np.sum(Q, axis=0)\n", "\n", " # update cluster priors\n", " pi = denoms / N\n", "\n", " # update cluster means\n", " nums_mu = [np.dot(Q[:, c], X) for c in range(C)]\n", " for ix, (num, den) in enumerate(zip(nums_mu, denoms)):\n", " mu[ix, :] = num / den if den > 0 else np.zeros_like(num)\n", "\n", " # update cluster covariances\n", " for c in range(C):\n", " mu_c = mu[c, :]\n", " n_c = denoms[c]\n", "\n", " outer = np.zeros((d, d))\n", " for i in range(N):\n", " wic = Q[i, c]\n", " xi = X[i, :]\n", " outer += wic * np.outer(xi - mu_c, xi - mu_c)\n", "\n", " outer = outer / n_c if n_c > 0 else outer\n", " sigma[c, :, :] = outer\n", "\n", " np.testing.assert_allclose(np.sum(pi), 1, err_msg=\"{}\".format(np.sum(pi)))\n"]} {"path": "numpy_ml/gmm/__init__.py", "content": ["from .gmm import *\n"]} {"path": "numpy_ml/nonparametric/kernel_regression.py", "content": ["from ..utils.kernels import KernelInitializer\n", "\n", "\n", "class KernelRegression:\n", " def __init__(self, kernel=None):\n", " \"\"\"\n", " A Nadaraya-Watson kernel regression model.\n", "\n", " Notes\n", " -----\n", " The Nadaraya-Watson regression model is\n", "\n", " .. math::\n", "\n", " f(x) = \\sum_i w_i(x) y_i\n", "\n", " where the sample weighting functions, :math:`w_i`, are simply\n", "\n", " .. math::\n", "\n", " w_i(x) = \\\\frac{k(x, x_i)}{\\sum_j k(x, x_j)}\n", "\n", " with `k` being the kernel function.\n", "\n", " Observe that `k`-nearest neighbors\n", " (:class:`~numpy_ml.nonparametric.KNN`) regression is a special case of\n", " kernel regression where the `k` closest observations have a weight\n", " `1/k`, and all others have weight 0.\n", "\n", " Parameters\n", " ----------\n", " kernel : str, :doc:`Kernel ` object, or dict\n", " The kernel to use. If None, default to\n", " :class:`~numpy_ml.utils.kernels.LinearKernel`. 
Default is None.\n", " \"\"\"\n", " self.parameters = {\"X\": None, \"y\": None}\n", " self.hyperparameters = {\"kernel\": str(kernel)}\n", " self.kernel = KernelInitializer(kernel)()\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Fit the regression model to the data and targets in `X` and `y`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " An array of N examples to generate predictions on\n", " y : :py:class:`ndarray ` of shape `(N, ...)`\n", " Predicted targets for the `N` rows in `X`\n", " \"\"\"\n", " self.parameters = {\"X\": X, \"y\": y}\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Generate predictions for the targets associated with the rows in `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N', M')`\n", " An array of `N'` examples to generate predictions on\n", "\n", " Returns\n", " -------\n", " y : :py:class:`ndarray ` of shape `(N', ...)`\n", " Predicted targets for the `N'` rows in `X`\n", " \"\"\"\n", " K = self.kernel\n", " P = self.parameters\n", " sim = K(P[\"X\"], X)\n", " return (sim * P[\"y\"][:, None]).sum(axis=0) / sim.sum(axis=0)\n"]} {"path": "numpy_ml/nonparametric/gp.py", "content": ["import warnings\n", "import numpy as np\n", "from numpy.linalg import slogdet, inv\n", "\n", "try:\n", " _SCIPY = True\n", " from scipy.stats import norm\n", "except:\n", " _SCIPY = False\n", " warnings.warn(\n", " \"Could not import scipy.stats. Confidence scores \"\n", " \"for GPRegression are restricted to 95% bounds\"\n", " )\n", "\n", "from ..utils.kernels import KernelInitializer\n", "\n", "\n", "class GPRegression:\n", " def __init__(self, kernel=\"RBFKernel\", alpha=1e-10):\n", " \"\"\"\n", " A Gaussian Process (GP) regression model.\n", "\n", " .. math::\n", "\n", " y \\mid X, f &\\sim \\mathcal{N}( [f(x_1), \\ldots, f(x_n)], \\\\alpha I ) \\\\\\\\\n", " f \\mid X &\\sim \\\\text{GP}(0, K)\n", "\n", " for data :math:`D = \\{(x_1, y_1), \\ldots, (x_n, y_n) \\}` and a covariance matrix :math:`K_{ij}\n", " = \\\\text{kernel}(x_i, x_j)` for all :math:`i, j \\in \\{1, \\ldots, n \\}`.\n", "\n", " Parameters\n", " ----------\n", " kernel : str\n", " The kernel to use in fitting the GP prior. Default is 'RBFKernel'.\n", " alpha : float\n", " An isotropic noise term for the diagonal in the GP covariance, `K`.\n", " Larger values correspond to the expectation of greater noise in the\n", " observed data points. 
Default is 1e-10.\n", " \"\"\"\n", " self.kernel = KernelInitializer(kernel)()\n", " self.parameters = {\"GP_mean\": None, \"GP_cov\": None, \"X\": None}\n", " self.hyperparameters = {\"kernel\": str(self.kernel), \"alpha\": alpha}\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Fit the GP prior to the training data.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A training dataset of `N` examples, each with dimensionality `M`.\n", " y : :py:class:`ndarray ` of shape `(N, O)`\n", " A collection of real-valued training targets for the\n", " examples in `X`, each with dimension `O`.\n", " \"\"\"\n", " mu = np.zeros(X.shape[0])\n", " K = self.kernel(X, X)\n", "\n", " self.parameters[\"X\"] = X\n", " self.parameters[\"y\"] = y\n", " self.parameters[\"GP_cov\"] = K\n", " self.parameters[\"GP_mean\"] = mu\n", "\n", " def predict(self, X, conf_interval=0.95, return_cov=False):\n", " \"\"\"\n", " Return the MAP estimate for :math:`y^*`, corresponding the mean/mode of\n", " the posterior predictive distribution, :math:`p(y^* \\mid x^*, X, y)`.\n", "\n", " Notes\n", " -----\n", " Under the GP regression model, the posterior predictive distribution is\n", "\n", " .. math::\n", "\n", " y^* \\mid x^*, X, y \\sim \\mathcal{N}(\\mu^*, \\\\text{cov}^*)\n", "\n", " where\n", "\n", " .. math::\n", "\n", " \\mu^* &= K^* (K + \\\\alpha I)^{-1} y \\\\\\\\\n", " \\\\text{cov}^* &= K^{**} - K^{*'} (K + \\\\alpha I)^{-1} K^*\n", "\n", " and\n", "\n", " .. math::\n", "\n", " K &= \\\\text{kernel}(X, X) \\\\\\\\\n", " K^* &= \\\\text{kernel}(X, X^*) \\\\\\\\\n", " K^{**} &= \\\\text{kernel}(X^*, X^*)\n", "\n", " NB. This implementation uses the inefficient but general purpose\n", " `np.linalg.inv` routine to invert :math:`(K + \\\\alpha I)`. A more\n", " efficient way is to rely on the fact that `K` (and hence also :math:`K\n", " + \\\\alpha I`) is symmetric positive (semi-)definite and take the inner\n", " product of the inverse of its (lower) Cholesky decompositions:\n", "\n", " .. math::\n", "\n", " Q^{-1} = \\\\text{cholesky}(Q)^{-1 \\\\top} \\\\text{cholesky}(Q)^{-1}\n", "\n", " For more details on a production-grade implementation, see Algorithm\n", " 2.1 in Rasmussen & Williams (2006).\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape (N, M)\n", " The collection of datapoints to generate predictions on\n", " conf_interval : float in (0, 1)\n", " The percentage confidence bound to return for each prediction. If\n", " the scipy package is not available, this value is always set to\n", " 0.95. Default is 0.95.\n", " return_cov : bool\n", " If True, also return the covariance (`cov*`) of the posterior\n", " predictive distribution for the points in `X`. Default is False.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(N, O)`\n", " The predicted values for each point in `X`, each with\n", " dimensionality `O`.\n", " conf : :py:class:`ndarray ` of shape `(N, O)`\n", " The % conf_interval confidence bound for each `y_pred`. The conf %\n", " confidence interval for the `i`'th prediction is ``[y[i] - conf[i],\n", " y[i] + conf[i]]``.\n", " cov : :py:class:`ndarray ` of shape `(N, N)`\n", " The covariance (`cov*`) of the posterior predictive distribution for\n", " `X`. 
Only returned if `return_cov` is True.\n", " \"\"\"\n", " if conf_interval != 0.95 and not _SCIPY:\n", " fstr = \"Cannot compute {}% confidence score without scipy.stats\"\n", " warnings.warn(fstr.format(conf_interval))\n", "\n", " X_star = X\n", " X = self.parameters[\"X\"]\n", " y = self.parameters[\"y\"]\n", " K = self.parameters[\"GP_cov\"]\n", " alpha = self.hyperparameters[\"alpha\"]\n", "\n", " K_star = self.kernel(X_star, X)\n", " K_star_star = self.kernel(X_star, X_star)\n", "\n", " sig = np.eye(K.shape[0]) * alpha\n", " K_y_inv = inv(K + sig)\n", "\n", " pp_mean = K_star @ K_y_inv @ y\n", " pp_cov = K_star_star - K_star @ K_y_inv @ K_star.T\n", "\n", " # if we can't use scipy, ignore the passed value for `conf_interval`\n", " # and return the 95% confidence bound.\n", " # (norm.ppf == inverse CDF for standard normal)\n", " # use the two-sided quantile so that conf_interval=0.95 gives ~1.96\n", " percentile = 1.96 if not _SCIPY else norm.ppf(1 - (1 - conf_interval) / 2)\n", " conf = percentile * np.sqrt(np.diag(pp_cov))\n", " return (pp_mean, conf) if not return_cov else (pp_mean, conf, pp_cov)\n", "\n", " def marginal_log_likelihood(self, kernel_params=None):\n", " \"\"\"\n", " Compute the log of the marginal likelihood (i.e., the log model\n", " evidence), :math:`p(y \mid X, \\text{kernel_params})`.\n", "\n", " Notes\n", " -----\n", " Under the GP regression model, the marginal likelihood is normally\n", " distributed:\n", "\n", " .. math::\n", "\n", " y | X, \\theta \sim \mathcal{N}(0, K + \\alpha I)\n", "\n", " Hence,\n", "\n", " .. math::\n", "\n", " \log p(y \mid X, \\theta) =\n", " -0.5 \log \det(K + \\alpha I) -\n", " 0.5 y^\\top (K + \\alpha I)^{-1} y - \\frac{n}{2} \log 2 \pi\n", "\n", " where :math:`K = \\text{kernel}(X, X)`, :math:`\\theta` is the set of\n", " kernel parameters, and `n` is the number of dimensions in `K`.\n", "\n", " Parameters\n", " ----------\n", " kernel_params : dict\n", " Parameters for the kernel function. If None, calculate the\n", " marginal likelihood under the kernel parameters defined at model\n", " initialization. 
Default is None.\n", "\n", " Returns\n", " -------\n", " marginal_log_likelihood : float\n", " The log likelihood of the training targets given the kernel\n", " parameterized by `kernel_params` and the training inputs,\n", " marginalized over all functions `f`.\n", " \"\"\"\n", " X = self.parameters[\"X\"]\n", " y = self.parameters[\"y\"]\n", " alpha = self.hyperparameters[\"alpha\"]\n", "\n", " K = self.parameters[\"GP_cov\"]\n", " if kernel_params is not None:\n", " # create a new kernel with parameters `kernel_params` and recalc\n", " # the GP covariance matrix\n", " summary_dict = self.kernel.summary_dict()\n", " summary_dict[\"parameters\"].update(kernel_params)\n", " kernel = KernelInitializer(summary_dict)()\n", " K = kernel(X, X)\n", "\n", " # add isotropic noise to kernel diagonal\n", " K += np.eye(K.shape[0]) * alpha\n", "\n", " Kinv = inv(K)\n", " Klogdet = -0.5 * slogdet(K)[1]\n", " const = K.shape[0] / 2 * np.log(2 * np.pi)\n", "\n", " # handle both uni- and multidimensional target values\n", " if y.ndim == 1:\n", " y = y[:, np.newaxis]\n", "\n", " # sum over each dimension of y\n", " marginal_ll = np.sum([Klogdet - 0.5 * _y.T @ Kinv @ _y - const for _y in y.T])\n", " return marginal_ll\n", "\n", " def sample(self, X, n_samples=1, dist=\"posterior_predictive\"):\n", " \"\"\"\n", " Sample functions from the GP prior or posterior predictive\n", " distribution.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " The collection of datapoints to generate predictions on. Only used if\n", " `dist` = 'posterior_predictive'.\n", " n_samples: int\n", " The number of samples to generate. Default is 1.\n", " dist : {\"posterior_predictive\", \"prior\"}\n", " The distribution to draw samples from. Default is\n", " \"posterior_predictive\".\n", "\n", " Returns\n", " -------\n", " samples : :py:class:`ndarray ` of shape `(n_samples, O, N)`\n", " The generated samples for the points in `X`.\n", " \"\"\"\n", " mvnorm = np.random.multivariate_normal\n", "\n", " if dist == \"prior\":\n", " mu = np.zeros((X.shape[0], 1))\n", " cov = self.kernel(X, X)\n", " elif dist == \"posterior_predictive\":\n", " mu, _, cov = self.predict(X, return_cov=True)\n", " else:\n", " raise ValueError(\"Unrecognized dist: '{}'\".format(dist))\n", "\n", " if mu.ndim == 1:\n", " mu = mu[:, np.newaxis]\n", "\n", " samples = np.array([mvnorm(_mu, cov, size=n_samples) for _mu in mu.T])\n", " return samples.swapaxes(0, 1)\n"]} {"path": "numpy_ml/nonparametric/__init__.py", "content": ["\"\"\"\n", "Popular nonparameteric regression and classification models.\n", "\n", "The nonparametric module contains an assortment of nonparametric models that\n", "don't fit elsewhere in the package. For other nonparametric models, see the\n", "``numpy_ml.trees`` module.\n", "\"\"\"\n", "\n", "from .gp import *\n", "from .knn import *\n", "from .kernel_regression import *\n"]} {"path": "numpy_ml/nonparametric/knn.py", "content": ["\"\"\"A k-Nearest Neighbors (KNN) model for both classiciation and regression.\"\"\"\n", "from collections import Counter\n", "\n", "import numpy as np\n", "\n", "from ..utils.data_structures import BallTree\n", "\n", "\n", "class KNN:\n", " def __init__(\n", " self, k=5, leaf_size=40, classifier=True, metric=None, weights=\"uniform\",\n", " ):\n", " \"\"\"\n", " A `k`-nearest neighbors (kNN) model relying on a ball tree for efficient\n", " computation.\n", "\n", " Parameters\n", " ----------\n", " k : int\n", " The number of neighbors to use during prediction. 
Default is 5.\n", " leaf_size : int\n", " The maximum number of datapoints at each leaf in the ball tree.\n", " Default is 40.\n", " classifier : bool\n", " Whether to treat the values in Y as class labels (classifier =\n", " True) or real-valued targets (classifier = False). Default is True.\n", " metric : :doc:`Distance metric ` or None\n", " The distance metric to use for computing nearest neighbors. If\n", " None, use the :func:`~numpy_ml.utils.distance_metrics.euclidean`\n", " metric by default. Default is None.\n", " weights : {'uniform', 'distance'}\n", " How to weight the predictions from each neighbor. 'uniform'\n", " assigns uniform weights to each neighbor, while 'distance' assigns\n", " weights proportional to the inverse of the distance from the query\n", " point. Default is 'uniform'.\n", " \"\"\"\n", " self._ball_tree = BallTree(leaf_size=leaf_size, metric=metric)\n", " self.hyperparameters = {\n", " \"id\": \"KNN\",\n", " \"k\": k,\n", " \"leaf_size\": leaf_size,\n", " \"classifier\": classifier,\n", " \"metric\": str(metric),\n", " \"weights\": weights,\n", " }\n", "\n", " def fit(self, X, y):\n", " r\"\"\"\n", " Fit the model to the data and targets in `X` and `y`.\n", "\n", " Parameters\n", " ----------\n", " X : numpy array of shape `(N, M)`\n", " An array of `N` training examples.\n", " y : numpy array of shape `(N, *)`\n", " Targets for the `N` rows in `X`.\n", " \"\"\"\n", " if X.ndim != 2:\n", " raise Exception(\"X must be two-dimensional\")\n", " self._ball_tree.fit(X, y)\n", "\n", " def predict(self, X):\n", " r\"\"\"\n", " Generate predictions for the targets associated with the rows in `X`.\n", "\n", " Parameters\n", " ----------\n", " X : numpy array of shape `(N', M')`\n", " An array of `N'` examples to generate predictions on.\n", "\n", " Returns\n", " -------\n", " y : numpy array of shape `(N', *)`\n", " Predicted targets for the `N'` rows in `X`.\n", " \"\"\"\n", " predictions = []\n", " H = self.hyperparameters\n", " for x in X:\n", " pred = None\n", " nearest = self._ball_tree.nearest_neighbors(H[\"k\"], x)\n", " targets = [n.val for n in nearest]\n", "\n", " if H[\"classifier\"]:\n", " if H[\"weights\"] == \"uniform\":\n", " # for consistency with sklearn / scipy.stats.mode, return\n", " # the smallest class ID in the event of a tie\n", " counts = Counter(targets).most_common()\n", " pred, _ = sorted(counts, key=lambda x: (-x[1], x[0]))[0]\n", " elif H[\"weights\"] == \"distance\":\n", " best_score = -np.inf\n", " for label in set(targets):\n", " scores = [1 / n.distance for n in nearest if n.val == label]\n", " # keep the label with the largest summed inverse-distance weight\n", " label_score = np.sum(scores)\n", " if label_score > best_score:\n", " best_score, pred = label_score, label\n", " else:\n", " if H[\"weights\"] == \"uniform\":\n", " pred = np.mean(targets)\n", " elif H[\"weights\"] == \"distance\":\n", " weights = [1 / n.distance for n in nearest]\n", " pred = np.average(targets, weights=weights)\n", " predictions.append(pred)\n", " return np.array(predictions)\n"]} {"path": "numpy_ml/neural_nets/__init__.py", "content": ["\"\"\"A module of basic building blocks for constructing neural networks\"\"\"\n", "from . import utils\n", "from . import losses\n", "from . import activations\n", "from . import schedulers\n", "from . import optimizers\n", "from . import wrappers\n", "from . import layers\n", "from . import initializers\n", "from . import modules\n", "from . 
import models\n"]} {"path": "numpy_ml/neural_nets/activations/activations.py", "content": ["\"\"\"A collection of activation function objects for building neural networks\"\"\"\n", "from math import erf\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "\n", "class ActivationBase(ABC):\n", " def __init__(self, **kwargs):\n", " \"\"\"Initialize the ActivationBase object\"\"\"\n", " super().__init__()\n", "\n", " def __call__(self, z):\n", " \"\"\"Apply the activation function to an input\"\"\"\n", " if z.ndim == 1:\n", " z = z.reshape(1, -1)\n", " return self.fn(z)\n", "\n", " @abstractmethod\n", " def fn(self, z):\n", " \"\"\"Apply the activation function to an input\"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def grad(self, x, **kwargs):\n", " \"\"\"Compute the gradient of the activation function wrt the input\"\"\"\n", " raise NotImplementedError\n", "\n", "\n", "class Sigmoid(ActivationBase):\n", " def __init__(self):\n", " \"\"\"A logistic sigmoid activation function.\"\"\"\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Sigmoid\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the logistic sigmoid, :math:`\\sigma`, on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\sigma(x_i) = \\frac{1}{1 + e^{-x_i}}\n", " \"\"\"\n", " return 1 / (1 + np.exp(-z))\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the logistic sigmoid on the elements of `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\sigma}{\\partial x_i} = \\sigma(x_i) (1 - \\sigma(x_i))\n", " \"\"\"\n", " fn_x = self.fn(x)\n", " return fn_x * (1 - fn_x)\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the logistic sigmoid on the elements of `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\sigma}{\\partial x_i^2} =\n", " \\frac{\\partial \\sigma}{\\partial x_i} (1 - 2 \\sigma(x_i))\n", " \"\"\"\n", " fn_x = self.fn(x)\n", " return fn_x * (1 - fn_x) * (1 - 2 * fn_x)\n", "\n", "\n", "class ReLU(ActivationBase):\n", " \"\"\"\n", " A rectified linear activation function.\n", "\n", " Notes\n", " -----\n", " \"ReLU units can be fragile during training and can \"die\". For example, a\n", " large gradient flowing through a ReLU neuron could cause the weights to\n", " update in such a way that the neuron will never activate on any datapoint\n", " again. If this happens, then the gradient flowing through the unit will\n", " forever be zero from that point on. That is, the ReLU units can\n", " irreversibly die during training since they can get knocked off the data\n", " manifold.\n", "\n", " For example, you may find that as much as 40% of your network can be \"dead\"\n", " (i.e. neurons that never activate across the entire training dataset) if\n", " the learning rate is set too high. With a proper setting of the learning\n", " rate this is less frequently an issue.\" [*]_\n", "\n", " References\n", " ----------\n", " .. [*] Karpathy, A. \"CS231n: Convolutional neural networks for visual recognition.\"\n", " \"\"\"\n", "\n", " def __init__(self):\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"ReLU\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaulate the ReLU function on the elements of input `z`.\n", "\n", " .. 
math::\n", "\n", " \\text{ReLU}(z_i)\n", " &= z_i \\ \\ \\ \\ &&\\text{if }z_i > 0 \\\\\n", " &= 0 \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " return np.clip(z, 0, np.inf)\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaulate the first derivative of the ReLU function on the elements of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{ReLU}}{\\partial x_i}\n", " &= 1 \\ \\ \\ \\ &&\\text{if }x_i > 0 \\\\\n", " &= 0 \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " return (x > 0).astype(int)\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaulate the second derivative of the ReLU function on the elements of\n", " input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{ReLU}}{\\partial x_i^2} = 0\n", " \"\"\"\n", " return np.zeros_like(x)\n", "\n", "\n", "class LeakyReLU(ActivationBase):\n", " \"\"\"\n", " 'Leaky' version of a rectified linear unit (ReLU).\n", "\n", " Notes\n", " -----\n", " Leaky ReLUs [*]_ are designed to address the vanishing gradient problem in\n", " ReLUs by allowing a small non-zero gradient when `x` is negative.\n", "\n", " Parameters\n", " ----------\n", " alpha: float\n", " Activation slope when x < 0. Default is 0.3.\n", "\n", " References\n", " ----------\n", " .. [*] Mass, L. M., Hannun, A. Y, & Ng, A. Y. (2013). \"Rectifier\n", " nonlinearities improve neural network acoustic models.\" *Proceedings of\n", " the 30th International Conference of Machine Learning, 30*.\n", " \"\"\"\n", "\n", " def __init__(self, alpha=0.3):\n", " self.alpha = alpha\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Leaky ReLU(alpha={})\".format(self.alpha)\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the leaky ReLU function on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{LeakyReLU}(z_i)\n", " &= z_i \\ \\ \\ \\ &&\\text{if } z_i > 0 \\\\\n", " &= \\alpha z_i \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " _z = z.copy()\n", " _z[z < 0] = _z[z < 0] * self.alpha\n", " return _z\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the leaky ReLU function on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{LeakyReLU}}{\\partial x_i}\n", " &= 1 \\ \\ \\ \\ &&\\text{if }x_i > 0 \\\\\n", " &= \\alpha \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " out = np.ones_like(x)\n", " out[x < 0] *= self.alpha\n", " return out\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the leaky ReLU function on the\n", " elements of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{LeakyReLU}}{\\partial x_i^2} = 0\n", " \"\"\"\n", " return np.zeros_like(x)\n", "\n", "\n", "class GELU(ActivationBase):\n", " def __init__(self, approximate=True):\n", " r\"\"\"\n", " A Gaussian error linear unit (GELU). [*]_\n", "\n", " Notes\n", " -----\n", " A ReLU alternative. GELU weights inputs by their value, rather than\n", " gates inputs by their sign, as in vanilla ReLUs.\n", "\n", " References\n", " ----------\n", " .. [*] Hendrycks, D., & Gimpel, K. (2016). 
\"Bridging nonlinearities and\n", " stochastic regularizers with Gaussian error linear units.\" *CoRR*.\n", "\n", " Parameters\n", " ----------\n", " approximate : bool\n", " Whether to use a faster but less precise approximation to the Gauss\n", " error function when calculating the unit activation and gradient.\n", " Default is True.\n", " \"\"\"\n", " self.approximate = True\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return f\"GELU(approximate={self.approximate})\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Compute the GELU function on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{GELU}(z_i) = z_i P(Z \\leq z_i) = z_i \\Phi(z_i)\n", " = z_i \\cdot \\frac{1}{2}(1 + \\text{erf}(x/\\sqrt{2}))\n", " \"\"\"\n", " pi, sqrt, tanh = np.pi, np.sqrt, np.tanh\n", "\n", " if self.approximate:\n", " return 0.5 * z * (1 + tanh(sqrt(2 / pi) * (z + 0.044715 * z ** 3)))\n", " return 0.5 * z * (1 + erf(z / sqrt(2)))\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the GELU function on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{GELU}}{\\partial x_i} =\n", " \\frac{1}{2} + \\frac{1}{2}\\left(\\text{erf}(\\frac{x}{\\sqrt{2}}) +\n", " \\frac{x + \\text{erf}'(\\frac{x}{\\sqrt{2}})}{\\sqrt{2}}\\right)\n", "\n", " where :math:`\\text{erf}'(x) = \\frac{2}{\\sqrt{\\pi}} \\cdot \\exp\\{-x^2\\}`.\n", " \"\"\"\n", " pi, exp, sqrt, tanh = np.pi, np.exp, np.sqrt, np.tanh\n", "\n", " s = x / sqrt(2)\n", " erf_prime = lambda x: (2 / sqrt(pi)) * exp(-(x ** 2)) # noqa: E731\n", "\n", " if self.approximate:\n", " approx = tanh(sqrt(2 / pi) * (x + 0.044715 * x ** 3))\n", " dx = 0.5 + 0.5 * approx + ((0.5 * x * erf_prime(s)) / sqrt(2))\n", " else:\n", " dx = 0.5 + 0.5 * erf(s) + ((0.5 * x * erf_prime(s)) / sqrt(2))\n", " return dx\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the GELU function on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{GELU}}{\\partial x_i^2} =\n", " \\frac{1}{2\\sqrt{2}} \\left\\[\n", " \\text{erf}'(\\frac{x}{\\sqrt{2}}) +\n", " \\frac{1}{\\sqrt{2}} \\text{erf}''(\\frac{x}{\\sqrt{2}})\n", " \\right]\n", "\n", " where :math:`\\text{erf}'(x) = \\frac{2}{\\sqrt{\\pi}} \\cdot \\exp\\{-x^2\\}` and\n", " :math:`\\text{erf}''(x) = \\frac{-4x}{\\sqrt{\\pi}} \\cdot \\exp\\{-x^2\\}`.\n", " \"\"\"\n", " pi, exp, sqrt = np.pi, np.exp, np.sqrt\n", " s = x / sqrt(2)\n", "\n", " erf_prime = lambda x: (2 / sqrt(pi)) * exp(-(x ** 2)) # noqa: E731\n", " erf_prime2 = lambda x: -4 * x * exp(-(x ** 2)) / sqrt(pi) # noqa: E731\n", " ddx = (1 / 2 * sqrt(2)) * (1 + erf_prime(s) + (erf_prime2(s) / sqrt(2)))\n", " return ddx\n", "\n", "\n", "class Tanh(ActivationBase):\n", " def __init__(self):\n", " \"\"\"A hyperbolic tangent activation function.\"\"\"\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Tanh\"\n", "\n", " def fn(self, z):\n", " \"\"\"Compute the tanh function on the elements of input `z`.\"\"\"\n", " return np.tanh(z)\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the tanh function on the elements\n", " of input `x`.\n", "\n", " .. 
math::\n", "\n", " \\frac{\\partial \\tanh}{\\partial x_i} = 1 - \\tanh(x)^2\n", " \"\"\"\n", " return 1 - np.tanh(x) ** 2\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the tanh function on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\tanh}{\\partial x_i^2} =\n", " -2 \\tanh(x) \\left(\\frac{\\partial \\tanh}{\\partial x_i}\\right)\n", " \"\"\"\n", " tanh_x = np.tanh(x)\n", " return -2 * tanh_x * (1 - tanh_x ** 2)\n", "\n", "\n", "class Affine(ActivationBase):\n", " def __init__(self, slope=1, intercept=0):\n", " \"\"\"\n", " An affine activation function.\n", "\n", " Parameters\n", " ----------\n", " slope: float\n", " Activation slope. Default is 1.\n", " intercept: float\n", " Intercept/offset term. Default is 0.\n", " \"\"\"\n", " self.slope = slope\n", " self.intercept = intercept\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Affine(slope={}, intercept={})\".format(self.slope, self.intercept)\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the Affine activation on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{Affine}(z_i) = \\text{slope} \\times z_i + \\text{intercept}\n", " \"\"\"\n", " return self.slope * z + self.intercept\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the Affine activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{Affine}}{\\partial x_i} = \\text{slope}\n", " \"\"\"\n", " return self.slope * np.ones_like(x)\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the Affine activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{Affine}}{\\partial x_i^2} = 0\n", " \"\"\"\n", " return np.zeros_like(x)\n", "\n", "\n", "class Identity(Affine):\n", " def __init__(self):\n", " \"\"\"\n", " Identity activation function.\n", "\n", " Notes\n", " -----\n", " :class:`Identity` is syntactic sugar for :class:`Affine` with\n", " slope = 1 and intercept = 0.\n", " \"\"\"\n", " super().__init__(slope=1, intercept=0)\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Identity\"\n", "\n", "\n", "class ELU(ActivationBase):\n", " def __init__(self, alpha=1.0):\n", " r\"\"\"\n", " An exponential linear unit (ELU).\n", "\n", " Notes\n", " -----\n", " ELUs are intended to address the fact that ReLUs are strictly nonnegative\n", " and thus have an average activation > 0, increasing the chances of internal\n", " covariate shift and slowing down learning. ELU units address this by (1)\n", " allowing negative values when :math:`x < 0`, which (2) are bounded by a value\n", " :math:`-\\alpha`. Similar to :class:`LeakyReLU`, the negative activation\n", " values help to push the average unit activation towards 0. Unlike\n", " :class:`LeakyReLU`, however, the boundedness of the negative activation\n", " allows for greater robustness in the face of large negative values,\n", " allowing the function to avoid conveying the *degree* of \"absence\"\n", " (negative activation) in the input. [*]_\n", "\n", " Parameters\n", " ----------\n", " alpha : float\n", " Slope of negative segment. Default is 1.\n", "\n", " References\n", " ----------\n", " .. [*] Clevert, D. A., Unterthiner, T., Hochreiter, S. (2016). 
\"Fast\n", " and accurate deep network learning by exponential linear units\n", " (ELUs)\". *4th International Conference on Learning\n", " Representations*.\n", " \"\"\"\n", " self.alpha = alpha\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"ELU(alpha={})\".format(self.alpha)\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the ELU activation on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{ELU}(z_i)\n", " &= z_i \\ \\ \\ \\ &&\\text{if }z_i > 0 \\\\\n", " &= \\alpha (e^{z_i} - 1) \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " # z if z > 0 else alpha * (e^z - 1)\n", " return np.where(z > 0, z, self.alpha * (np.exp(z) - 1))\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the ELU activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{ELU}}{\\partial x_i}\n", " &= 1 \\ \\ \\ \\ &&\\text{if } x_i > 0 \\\\\n", " &= \\alpha e^{x_i} \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " # 1 if x > 0 else alpha * e^(z)\n", " return np.where(x > 0, np.ones_like(x), self.alpha * np.exp(x))\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the ELU activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{ELU}}{\\partial x_i^2}\n", " &= 0 \\ \\ \\ \\ &&\\text{if } x_i > 0 \\\\\n", " &= \\alpha e^{x_i} \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " # 0 if x > 0 else alpha * e^(z)\n", " return np.where(x >= 0, np.zeros_like(x), self.alpha * np.exp(x))\n", "\n", "\n", "class Exponential(ActivationBase):\n", " def __init__(self):\n", " \"\"\"An exponential (base e) activation function\"\"\"\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Exponential\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the activation function\n", "\n", " .. math::\n", " \\text{Exponential}(z_i) = e^{z_i}\n", " \"\"\"\n", " return np.exp(z)\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the exponential activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{Exponential}}{\\partial x_i} = e^{x_i}\n", " \"\"\"\n", " return np.exp(x)\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the exponential activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{Exponential}}{\\partial x_i^2} = e^{x_i}\n", " \"\"\"\n", " return np.exp(x)\n", "\n", "\n", "class SELU(ActivationBase):\n", " r\"\"\"\n", " A scaled exponential linear unit (SELU).\n", "\n", " Notes\n", " -----\n", " SELU units, when used in conjunction with proper weight initialization and\n", " regularization techniques, encourage neuron activations to converge to\n", " zero-mean and unit variance without explicit use of e.g., batchnorm.\n", "\n", " For SELU units, the :math:`\\alpha` and :math:`\\text{scale}` values are\n", " constants chosen so that the mean and variance of the inputs are preserved\n", " between consecutive layers. As such the authors propose weights be\n", " initialized using Lecun-Normal initialization: :math:`w_{ij} \\sim\n", " \\mathcal{N}(0, 1 / \\text{fan_in})`, and to use the dropout variant\n", " :math:`\\alpha`-dropout during regularization. 
[*]_\n", "\n", " See the reference for more information (especially the appendix ;-) ).\n", "\n", " References\n", " ----------\n", " .. [*] Klambauer, G., Unterthiner, T., & Hochreiter, S. (2017).\n", " \"Self-normalizing neural networks.\" *Advances in Neural Information\n", " Processing Systems, 30.*\n", " \"\"\"\n", "\n", " def __init__(self):\n", " self.alpha = 1.6732632423543772848170429916717\n", " self.scale = 1.0507009873554804934193349852946\n", " self.elu = ELU(alpha=self.alpha)\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"SELU\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the SELU activation on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{SELU}(z_i) = \\text{scale} \\times \\text{ELU}(z_i, \\alpha)\n", "\n", " which is simply\n", "\n", " .. math::\n", "\n", " \\text{SELU}(z_i)\n", " &= \\text{scale} \\times z_i \\ \\ \\ \\ &&\\text{if }z_i > 0 \\\\\n", " &= \\text{scale} \\times \\alpha (e^{z_i} - 1) \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " return self.scale * self.elu.fn(z)\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the SELU activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{SELU}}{\\partial x_i}\n", " &= \\text{scale} \\ \\ \\ \\ &&\\text{if } x_i > 0 \\\\\n", " &= \\text{scale} \\times \\alpha e^{x_i} \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " return np.where(\n", " x >= 0, np.ones_like(x) * self.scale, np.exp(x) * self.alpha * self.scale,\n", " )\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the SELU activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{SELU}}{\\partial x_i^2}\n", " &= 0 \\ \\ \\ \\ &&\\text{if } x_i > 0 \\\\\n", " &= \\text{scale} \\times \\alpha e^{x_i} \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " return np.where(x > 0, np.zeros_like(x), np.exp(x) * self.alpha * self.scale)\n", "\n", "\n", "class HardSigmoid(ActivationBase):\n", " def __init__(self):\n", " \"\"\"\n", " A \"hard\" sigmoid activation function.\n", "\n", " Notes\n", " -----\n", " The hard sigmoid is a piecewise linear approximation of the logistic\n", " sigmoid that is computationally more efficient to compute.\n", " \"\"\"\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"Hard Sigmoid\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the hard sigmoid activation on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{HardSigmoid}(z_i)\n", " &= 0 \\ \\ \\ \\ &&\\text{if }z_i < -2.5 \\\\\n", " &= 0.2 z_i + 0.5 \\ \\ \\ \\ &&\\text{if }-2.5 \\leq z_i \\leq 2.5 \\\\\n", " &= 1 \\ \\ \\ \\ &&\\text{if }z_i > 2.5\n", " \"\"\"\n", " return np.clip((0.2 * z) + 0.5, 0.0, 1.0)\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the hard sigmoid activation on the elements\n", " of input `x`.\n", "\n", " .. 
math::\n", "\n", " \\frac{\\partial \\text{HardSigmoid}}{\\partial x_i}\n", " &= 0.2 \\ \\ \\ \\ &&\\text{if } -2.5 \\leq x_i \\leq 2.5\\\\\n", " &= 0 \\ \\ \\ \\ &&\\text{otherwise}\n", " \"\"\"\n", " return np.where((x >= -2.5) & (x <= 2.5), 0.2, 0)\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the hard sigmoid activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{HardSigmoid}}{\\partial x_i^2} = 0\n", " \"\"\"\n", " return np.zeros_like(x)\n", "\n", "\n", "class SoftPlus(ActivationBase):\n", " def __init__(self):\n", " \"\"\"\n", " A softplus activation function.\n", "\n", " Notes\n", " -----\n", " In contrast to :class:`ReLU`, the softplus activation is differentiable\n", " everywhere (including 0). It is, however, less computationally efficient to\n", " compute.\n", "\n", " The derivative of the softplus activation is the logistic sigmoid.\n", " \"\"\"\n", " super().__init__()\n", "\n", " def __str__(self):\n", " \"\"\"Return a string representation of the activation function\"\"\"\n", " return \"SoftPlus\"\n", "\n", " def fn(self, z):\n", " r\"\"\"\n", " Evaluate the softplus activation on the elements of input `z`.\n", "\n", " .. math::\n", "\n", " \\text{SoftPlus}(z_i) = \\log(1 + e^{z_i})\n", " \"\"\"\n", " return np.log(np.exp(z) + 1)\n", "\n", " def grad(self, x):\n", " r\"\"\"\n", " Evaluate the first derivative of the softplus activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial \\text{SoftPlus}}{\\partial x_i} = \\frac{e^{x_i}}{1 + e^{x_i}}\n", " \"\"\"\n", " exp_x = np.exp(x)\n", " return exp_x / (exp_x + 1)\n", "\n", " def grad2(self, x):\n", " r\"\"\"\n", " Evaluate the second derivative of the softplus activation on the elements\n", " of input `x`.\n", "\n", " .. math::\n", "\n", " \\frac{\\partial^2 \\text{SoftPlus}}{\\partial x_i^2} =\n", " \\frac{e^{x_i}}{(1 + e^{x_i})^2}\n", " \"\"\"\n", " exp_x = np.exp(x)\n", " return exp_x / ((exp_x + 1) ** 2)\n"]} {"path": "numpy_ml/neural_nets/activations/__init__.py", "content": ["from .activations import *\n"]} {"path": "numpy_ml/neural_nets/losses/__init__.py", "content": ["\"\"\"\n", "Common neural network loss functions.\n", "\n", "This module implements loss objects that can be used during neural network\n", "training.\n", "\"\"\"\n", "\n", "from .losses import *\n"]} {"path": "numpy_ml/neural_nets/losses/losses.py", "content": ["from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "from ...utils.testing import is_binary, is_stochastic\n", "from ..initializers import (\n", " WeightInitializer,\n", " ActivationInitializer,\n", " OptimizerInitializer,\n", ")\n", "\n", "\n", "class ObjectiveBase(ABC):\n", " def __init__(self):\n", " super().__init__()\n", "\n", " @abstractmethod\n", " def loss(self, y_true, y_pred):\n", " pass\n", "\n", " @abstractmethod\n", " def grad(self, y_true, y_pred, **kwargs):\n", " pass\n", "\n", "\n", "class SquaredError(ObjectiveBase):\n", " def __init__(self):\n", " \"\"\"\n", " A squared-error / `L2` loss.\n", "\n", " Notes\n", " -----\n", " For real-valued target **y** and predictions :math:`\\hat{\\mathbf{y}}`, the\n", " squared error is\n", "\n", " .. 
math::\n", " \\mathcal{L}(\\mathbf{y}, \\hat{\\mathbf{y}})\n", " = 0.5 ||\\hat{\\mathbf{y}} - \\mathbf{y}||_2^2\n", " \"\"\"\n", " super().__init__()\n", "\n", " def __call__(self, y, y_pred):\n", " return self.loss(y, y_pred)\n", "\n", " def __str__(self):\n", " return \"SquaredError\"\n", "\n", " @staticmethod\n", " def loss(y, y_pred):\n", " \"\"\"\n", " Compute the squared error between `y` and `y_pred`.\n", "\n", " Parameters\n", " ----------\n", " y : :py:class:`ndarray ` of shape (n, m)\n", " Ground truth values for each of `n` examples\n", " y_pred : :py:class:`ndarray ` of shape (n, m)\n", " Predictions for the `n` examples in the batch.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " The sum of the squared error across dimensions and examples.\n", " \"\"\"\n", " return 0.5 * np.linalg.norm(y_pred - y) ** 2\n", "\n", " @staticmethod\n", " def grad(y, y_pred, z, act_fn):\n", " \"\"\"\n", " Gradient of the squared error loss with respect to the pre-nonlinearity\n", " input, `z`.\n", "\n", " Notes\n", " -----\n", " The current method computes the gradient :math:`\\\\frac{\\partial\n", " \\mathcal{L}}{\\partial \\mathbf{z}}`, where\n", "\n", " .. math::\n", "\n", " \\mathcal{L}(\\mathbf{z})\n", " &= \\\\text{squared_error}(\\mathbf{y}, g(\\mathbf{z})) \\\\\\\\\n", " g(\\mathbf{z})\n", " &= \\\\text{act_fn}(\\mathbf{z})\n", "\n", " The gradient with respect to :math:`\\mathbf{z}` is then\n", "\n", " .. math::\n", "\n", " \\\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{z}}\n", " = (g(\\mathbf{z}) - \\mathbf{y}) \\left(\n", " \\\\frac{\\partial g}{\\partial \\mathbf{z}} \\\\right)\n", "\n", " Parameters\n", " ----------\n", " y : :py:class:`ndarray ` of shape (n, m)\n", " Ground truth values for each of `n` examples.\n", " y_pred : :py:class:`ndarray ` of shape (n, m)\n", " Predictions for the `n` examples in the batch.\n", " act_fn : :doc:`Activation ` object\n", " The activation function for the output layer of the network.\n", "\n", " Returns\n", " -------\n", " grad : :py:class:`ndarray ` of shape (n, m)\n", " The gradient of the squared error loss with respect to `z`.\n", " \"\"\"\n", " return (y_pred - y) * act_fn.grad(z)\n", "\n", "\n", "class CrossEntropy(ObjectiveBase):\n", " def __init__(self):\n", " \"\"\"\n", " A cross-entropy loss.\n", "\n", " Notes\n", " -----\n", " For a one-hot target **y** and predicted class probabilities\n", " :math:`\\hat{\\mathbf{y}}`, the cross entropy is\n", "\n", " .. math::\n", " \\mathcal{L}(\\mathbf{y}, \\hat{\\mathbf{y}})\n", " = \\sum_i y_i \\log \\hat{y}_i\n", " \"\"\"\n", " super().__init__()\n", "\n", " def __call__(self, y, y_pred):\n", " return self.loss(y, y_pred)\n", "\n", " def __str__(self):\n", " return \"CrossEntropy\"\n", "\n", " @staticmethod\n", " def loss(y, y_pred):\n", " \"\"\"\n", " Compute the cross-entropy (log) loss.\n", "\n", " Notes\n", " -----\n", " This method returns the sum (not the average!) 
of the losses for each\n", " sample.\n", "\n", " Parameters\n", " ----------\n", " y : :py:class:`ndarray ` of shape (n, m)\n", " Class labels (one-hot with `m` possible classes) for each of `n`\n", " examples.\n", " y_pred : :py:class:`ndarray ` of shape (n, m)\n", " Probabilities of each of `m` classes for the `n` examples in the\n", " batch.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " The sum of the cross-entropy across classes and examples.\n", " \"\"\"\n", " is_binary(y)\n", " is_stochastic(y_pred)\n", "\n", " # prevent taking the log of 0\n", " eps = np.finfo(float).eps\n", "\n", " # each example is associated with a single class; sum the negative log\n", " # probability of the correct label over all samples in the batch.\n", " # observe that we are taking advantage of the fact that y is one-hot\n", " # encoded\n", " cross_entropy = -np.sum(y * np.log(y_pred + eps))\n", " return cross_entropy\n", "\n", " @staticmethod\n", " def grad(y, y_pred):\n", " \"\"\"\n", " Compute the gradient of the cross entropy loss with regard to the\n", " softmax input, `z`.\n", "\n", " Notes\n", " -----\n", " The gradient for this method goes through both the cross-entropy loss\n", " AND the softmax non-linearity to return :math:`\\\\frac{\\partial\n", " \\mathcal{L}}{\\partial \\mathbf{z}}` (rather than :math:`\\\\frac{\\partial\n", " \\mathcal{L}}{\\partial \\\\text{softmax}(\\mathbf{z})}`).\n", "\n", " In particular, let:\n", "\n", " .. math::\n", "\n", " \\mathcal{L}(\\mathbf{z})\n", " = \\\\text{cross_entropy}(\\\\text{softmax}(\\mathbf{z})).\n", "\n", " The current method computes:\n", "\n", " .. math::\n", "\n", " \\\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{z}}\n", " &= \\\\text{softmax}(\\mathbf{z}) - \\mathbf{y} \\\\\\\\\n", " &= \\hat{\\mathbf{y}} - \\mathbf{y}\n", "\n", " Parameters\n", " ----------\n", " y : :py:class:`ndarray ` of shape `(n, m)`\n", " A one-hot encoding of the true class labels. Each row constitues a\n", " training example, and each column is a different class.\n", " y_pred: :py:class:`ndarray ` of shape `(n, m)`\n", " The network predictions for the probability of each of `m` class\n", " labels on each of `n` examples in a batch.\n", "\n", " Returns\n", " -------\n", " grad : :py:class:`ndarray ` of shape (n, m)\n", " The gradient of the cross-entropy loss with respect to the *input*\n", " to the softmax function.\n", " \"\"\"\n", " is_binary(y)\n", " is_stochastic(y_pred)\n", "\n", " # derivative of xe wrt z is y_pred - y_true, hence we can just\n", " # subtract 1 from the probability of the correct class labels\n", " grad = y_pred - y\n", "\n", " # [optional] scale the gradients by the number of examples in the batch\n", " # n, m = y.shape\n", " # grad /= n\n", " return grad\n", "\n", "\n", "class VAELoss(ObjectiveBase):\n", " def __init__(self):\n", " \"\"\"\n", " The variational lower bound for a variational autoencoder with Bernoulli\n", " units.\n", "\n", " Notes\n", " -----\n", " The VLB to the sum of the binary cross entropy between the true input and\n", " the predicted output (the \"reconstruction loss\") and the KL divergence\n", " between the learned variational distribution :math:`q` and the prior,\n", " :math:`p`, assumed to be a unit Gaussian.\n", "\n", " .. 
math::\n", "\n", " \\\\text{VAELoss} =\n", " \\\\text{cross_entropy}(\\mathbf{y}, \\hat{\\mathbf{y}})\n", " + \\\\mathbb{KL}[q \\ || \\ p]\n", "\n", " where :math:`\\mathbb{KL}[q \\ || \\ p]` is the Kullback-Leibler\n", " divergence between the distributions :math:`q` and :math:`p`.\n", "\n", " References\n", " ----------\n", " .. [1] Kingma, D. P. & Welling, M. (2014). \"Auto-encoding variational Bayes\".\n", " *arXiv preprint arXiv:1312.6114.* https://arxiv.org/pdf/1312.6114.pdf\n", " \"\"\"\n", " super().__init__()\n", "\n", " def __call__(self, y, y_pred, t_mean, t_log_var):\n", " return self.loss(y, y_pred, t_mean, t_log_var)\n", "\n", " def __str__(self):\n", " return \"VAELoss\"\n", "\n", " @staticmethod\n", " def loss(y, y_pred, t_mean, t_log_var):\n", " \"\"\"\n", " Variational lower bound for a Bernoulli VAE.\n", "\n", " Parameters\n", " ----------\n", " y : :py:class:`ndarray ` of shape `(n_ex, N)`\n", " The original images.\n", " y_pred : :py:class:`ndarray ` of shape `(n_ex, N)`\n", " The VAE reconstruction of the images.\n", " t_mean: :py:class:`ndarray ` of shape `(n_ex, T)`\n", " Mean of the variational distribution :math:`q(t \\mid x)`.\n", " t_log_var: :py:class:`ndarray ` of shape `(n_ex, T)`\n", " Log of the variance vector of the variational distribution\n", " :math:`q(t \\mid x)`.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " The VLB, averaged across the batch.\n", " \"\"\"\n", " # prevent nan on log(0)\n", " eps = np.finfo(float).eps\n", " y_pred = np.clip(y_pred, eps, 1 - eps)\n", "\n", " # reconstruction loss: binary cross-entropy\n", " rec_loss = -np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred), axis=1)\n", "\n", " # KL divergence between the variational distribution q and the prior p,\n", " # a unit gaussian\n", " kl_loss = -0.5 * np.sum(1 + t_log_var - t_mean ** 2 - np.exp(t_log_var), axis=1)\n", " loss = np.mean(kl_loss + rec_loss)\n", " return loss\n", "\n", " @staticmethod\n", " def grad(y, y_pred, t_mean, t_log_var):\n", " \"\"\"\n", " Compute the gradient of the VLB with regard to the network parameters.\n", "\n", " Parameters\n", " ----------\n", " y : :py:class:`ndarray ` of shape `(n_ex, N)`\n", " The original images.\n", " y_pred : :py:class:`ndarray ` of shape `(n_ex, N)`\n", " The VAE reconstruction of the images.\n", " t_mean: :py:class:`ndarray ` of shape `(n_ex, T)`\n", " Mean of the variational distribution :math:`q(t | x)`.\n", " t_log_var: :py:class:`ndarray ` of shape `(n_ex, T)`\n", " Log of the variance vector of the variational distribution\n", " :math:`q(t | x)`.\n", "\n", " Returns\n", " -------\n", " dY_pred : :py:class:`ndarray ` of shape `(n_ex, N)`\n", " The gradient of the VLB with regard to `y_pred`.\n", " dLogVar : :py:class:`ndarray ` of shape `(n_ex, T)`\n", " The gradient of the VLB with regard to `t_log_var`.\n", " dMean : :py:class:`ndarray ` of shape `(n_ex, T)`\n", " The gradient of the VLB with regard to `t_mean`.\n", " \"\"\"\n", " N = y.shape[0]\n", " eps = np.finfo(float).eps\n", " y_pred = np.clip(y_pred, eps, 1 - eps)\n", "\n", " dY_pred = -y / (N * y_pred) - (y - 1) / (N - N * y_pred)\n", " dLogVar = (np.exp(t_log_var) - 1) / (2 * N)\n", " dMean = t_mean / N\n", " return dY_pred, dLogVar, dMean\n", "\n", "\n", "class WGAN_GPLoss(ObjectiveBase):\n", " def __init__(self, lambda_=10):\n", " \"\"\"\n", " The loss function for a Wasserstein GAN [*]_ [*]_ with gradient penalty.\n", "\n", " Notes\n", " -----\n", " Assuming an optimal critic, minimizing this quantity wrt. 
the generator\n", " parameters corresponds to minimizing the Wasserstein-1 (earth-mover)\n", " distance between the fake and real data distributions.\n", "\n", " The formula for the WGAN-GP critic loss is\n", "\n", " .. math::\n", "\n", " \\\\text{WGANLoss}\n", " &= \\sum_{x \\in X_{real}} p(x) D(x)\n", " - \\sum_{x' \\in X_{fake}} p(x') D(x') \\\\\\\\\n", " \\\\text{WGANLossGP}\n", " &= \\\\text{WGANLoss} + \\lambda\n", " (||\\\\nabla_{X_{interp}} D(X_{interp})||_2 - 1)^2\n", "\n", " where\n", "\n", " .. math::\n", "\n", " X_{fake} &= \\\\text{Generator}(\\mathbf{z}) \\\\\\\\\n", " X_{interp} &= \\\\alpha X_{real} + (1 - \\\\alpha) X_{fake} \\\\\\\\\n", "\n", " and\n", "\n", " .. math::\n", "\n", " \\mathbf{z} &\\sim \\mathcal{N}(0, \\mathbb{1}) \\\\\\\\\n", " \\\\alpha &\\sim \\\\text{Uniform}(0, 1)\n", "\n", " References\n", " ----------\n", " .. [*] Gulrajani, I., Ahmed, F., Arjovsky, M., Dumoulin, V., &\n", " Courville, A. (2017) \"Improved training of Wasserstein GANs\"\n", " *Advances in Neural Information Processing Systems, 31*: 5769-5779.\n", " .. [*] Goodfellow, I. J, Abadie, P. A., Mirza, M., Xu, B., Farley, D.\n", " W., Ozair, S., Courville, A., & Bengio, Y. (2014) \"Generative\n", " adversarial nets\" *Advances in Neural Information Processing\n", " Systems, 27*: 2672-2680.\n", "\n", " Parameters\n", " ----------\n", " lambda_ : float\n", " The gradient penalty coefficient. Default is 10.\n", " \"\"\"\n", " self.lambda_ = lambda_\n", " super().__init__()\n", "\n", " def __call__(self, Y_fake, module, Y_real=None, gradInterp=None):\n", " \"\"\"\n", " Computes the generator and critic loss using the WGAN-GP value\n", " function.\n", "\n", " Parameters\n", " ----------\n", " Y_fake : :py:class:`ndarray ` of shape `(n_ex,)`\n", " The output of the critic for `X_fake`.\n", " module : {'C', 'G'}\n", " Whether to calculate the loss for the critic ('C') or the generator\n", " ('G'). If calculating loss for the critic, `Y_real` and\n", " `gradInterp` must not be None.\n", " Y_real : :py:class:`ndarray ` of shape `(n_ex,)`, or None\n", " The output of the critic for `X_real`. Default is None.\n", " gradInterp : :py:class:`ndarray ` of shape `(n_ex, n_feats)`, or None\n", " The gradient of the critic output for `X_interp` wrt. `X_interp`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " Depending on the setting for `module`, either the critic or\n", " generator loss, averaged over examples in the minibatch.\n", " \"\"\"\n", " return self.loss(Y_fake, module, Y_real=Y_real, gradInterp=gradInterp)\n", "\n", " def __str__(self):\n", " return \"WGANLossGP(lambda_={})\".format(self.lambda_)\n", "\n", " def loss(self, Y_fake, module, Y_real=None, gradInterp=None):\n", " \"\"\"\n", " Computes the generator and critic loss using the WGAN-GP value\n", " function.\n", "\n", " Parameters\n", " ----------\n", " Y_fake : :py:class:`ndarray ` of shape (n_ex,)\n", " The output of the critic for `X_fake`.\n", " module : {'C', 'G'}\n", " Whether to calculate the loss for the critic ('C') or the generator\n", " ('G'). If calculating loss for the critic, `Y_real` and\n", " `gradInterp` must not be None.\n", " Y_real : :py:class:`ndarray ` of shape `(n_ex,)` or None\n", " The output of the critic for `X_real`. Default is None.\n", " gradInterp : :py:class:`ndarray ` of shape `(n_ex, n_feats)` or None\n", " The gradient of the critic output for `X_interp` wrt. 
`X_interp`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " Depending on the setting for `module`, either the critic or\n", " generator loss, averaged over examples in the minibatch.\n", " \"\"\"\n", " # calc critic loss including gradient penalty\n", " if module == \"C\":\n", " X_interp_norm = np.linalg.norm(gradInterp, axis=1, keepdims=True)\n", " gradient_penalty = (X_interp_norm - 1) ** 2\n", " loss = (\n", " Y_fake.mean() - Y_real.mean() + self.lambda_ * gradient_penalty.mean()\n", " )\n", "\n", " # calc generator loss\n", " elif module == \"G\":\n", " loss = -Y_fake.mean()\n", "\n", " else:\n", " raise ValueError(\"Unrecognized module: {}\".format(module))\n", "\n", " return loss\n", "\n", " def grad(self, Y_fake, module, Y_real=None, gradInterp=None):\n", " \"\"\"\n", " Computes the gradient of the generator or critic loss with regard to\n", " its inputs.\n", "\n", " Parameters\n", " ----------\n", " Y_fake : :py:class:`ndarray ` of shape `(n_ex,)`\n", " The output of the critic for `X_fake`.\n", " module : {'C', 'G'}\n", " Whether to calculate the gradient for the critic loss ('C') or the\n", " generator loss ('G'). If calculating grads for the critic, `Y_real`\n", " and `gradInterp` must not be None.\n", " Y_real : :py:class:`ndarray ` of shape `(n_ex,)` or None\n", " The output of the critic for `X_real`. Default is None.\n", " gradInterp : :py:class:`ndarray ` of shape `(n_ex, n_feats)` or None\n", " The gradient of the critic output on `X_interp` wrt. `X_interp`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " grads : tuple\n", " If `module` == 'C', returns a 3-tuple containing the gradient of\n", " the critic loss with regard to (`Y_fake`, `Y_real`, `gradInterp`).\n", " If `module` == 'G', returns the gradient of the generator with\n", " regard to `Y_fake`.\n", " \"\"\"\n", " eps = np.finfo(float).eps\n", " n_ex_fake = Y_fake.shape[0]\n", "\n", " # calc gradient of the critic loss\n", " if module == \"C\":\n", " n_ex_real = Y_real.shape[0]\n", "\n", " dY_fake = -1 / n_ex_fake * np.ones_like(Y_fake)\n", " dY_real = 1 / n_ex_real * np.ones_like(Y_real)\n", "\n", " # differentiate through gradient penalty\n", " X_interp_norm = np.linalg.norm(gradInterp, axis=1, keepdims=True) + eps\n", "\n", " dGradInterp = (\n", " (2 / n_ex_fake)\n", " * self.lambda_\n", " * (X_interp_norm - 1)\n", " * (gradInterp / X_interp_norm)\n", " )\n", " grad = (dY_fake, dY_real, dGradInterp)\n", "\n", " # calc gradient of the generator loss\n", " elif module == \"G\":\n", " grad = -1 / n_ex_fake * np.ones_like(Y_fake)\n", "\n", " else:\n", " raise ValueError(\"Unrecognized module: {}\".format(module))\n", " return grad\n", "\n", "\n", "class NCELoss(ObjectiveBase):\n", " \"\"\"\n", " \"\"\"\n", "\n", " def __init__(\n", " self,\n", " n_classes,\n", " noise_sampler,\n", " num_negative_samples,\n", " optimizer=None,\n", " init=\"glorot_uniform\",\n", " subtract_log_label_prob=True,\n", " ):\n", " \"\"\"\n", " A noise contrastive estimation (NCE) loss function.\n", "\n", " Notes\n", " -----\n", " Noise contrastive estimation is a candidate sampling method often\n", " used to reduce the computational challenge of training a softmax\n", " layer on problems with a large number of output classes. 
It proceeds by\n", " training a logistic regression model to discriminate between samples\n", " from the true data distribution and samples from an artificial noise\n", " distribution.\n", "\n", " It can be shown that as the ratio of negative samples to data samples\n", " goes to infinity, the gradient of the NCE loss converges to the\n", " original softmax gradient.\n", "\n", " For input data **X**, target labels `targets`, loss parameters **W** and\n", " **b**, and noise samples `noise` sampled from the noise distribution `Q`,\n", " the NCE loss is\n", "\n", " .. math::\n", "\n", " \\\\text{NCE}(X, targets) =\n", " \\\\text{cross_entropy}(\\mathbf{y}_{targets}, \\hat{\\mathbf{y}}_{targets}) +\n", " \\\\text{cross_entropy}(\\mathbf{y}_{noise}, \\hat{\\mathbf{y}}_{noise})\n", "\n", " where\n", "\n", " .. math::\n", "\n", " \\hat{\\mathbf{y}}_{targets}\n", " &= \\sigma(\\mathbf{W}[targets] \\mathbf{X} + \\mathbf{b}[targets] - \\log Q(targets)) \\\\\\\\\n", " \\hat{\\mathbf{y}}_{noise}\n", " &= \\sigma(\\mathbf{W}[noise] \\mathbf{X} + \\mathbf{b}[noise] - \\log Q(noise))\n", "\n", " In the above equations, :math:`\\sigma` is the logistic sigmoid\n", " function, and :math:`Q(x)` corresponds to the probability of the values\n", " in `x` under `Q`.\n", "\n", " References\n", " ----------\n", " .. [1] Gutmann, M. & Hyvarinen, A. (2010). Noise-contrastive\n", " estimation: A new estimation principle for unnormalized statistical\n", " models. *AISTATS, 13*: 297-304.\n", " .. [2] Minh, A. & Teh, Y. W. (2012). A fast and simple algorithm for\n", " training neural probabilistic language models. *ICML, 29*: 1751-1758.\n", "\n", " Parameters\n", " ----------\n", " n_classes : int\n", " The total number of output classes in the model.\n", " noise_sampler : :class:`~numpy_ml.utils.data_structures.DiscreteSampler` instance\n", " The negative sampler. Defines a distribution over all classes in\n", " the dataset.\n", " num_negative_samples : int\n", " The number of negative samples to draw for each target / batch of\n", " targets.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is 'glorot_uniform'.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", " subtract_log_label_prob : bool\n", " Whether to subtract the log of the probability of each label under\n", " the noise distribution from its respective logit. Set to False for\n", " negative sampling, True for NCE. 
Default is True.\n", "\n", " Attributes\n", " ----------\n", " gradients : dict\n", " The accumulated parameter gradients.\n", " parameters: dict\n", " The loss parameter values.\n", " hyperparameters: dict\n", " The loss hyperparameter values.\n", " derived_variables: dict\n", " Useful intermediate values computed during the loss computation.\n", " \"\"\"\n", " super().__init__()\n", "\n", " self.init = init\n", " self.n_in = None\n", " self.trainable = True\n", " self.n_classes = n_classes\n", " self.noise_sampler = noise_sampler\n", " self.num_negative_samples = num_negative_samples\n", " self.act_fn = ActivationInitializer(\"Sigmoid\")()\n", " self.optimizer = OptimizerInitializer(optimizer)()\n", " self.subtract_log_label_prob = subtract_log_label_prob\n", "\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " self.X = []\n", " b = np.zeros((1, self.n_classes))\n", " W = init_weights((self.n_classes, self.n_in))\n", "\n", " self.parameters = {\"W\": W, \"b\": b}\n", "\n", " self.gradients = {\"W\": np.zeros_like(W), \"b\": np.zeros_like(b)}\n", "\n", " self.derived_variables = {\n", " \"y_pred\": [],\n", " \"target\": [],\n", " \"true_w\": [],\n", " \"true_b\": [],\n", " \"sampled_b\": [],\n", " \"sampled_w\": [],\n", " \"out_labels\": [],\n", " \"target_logits\": [],\n", " \"noise_samples\": [],\n", " \"noise_logits\": [],\n", " }\n", "\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " return {\n", " \"id\": \"NCELoss\",\n", " \"n_in\": self.n_in,\n", " \"init\": self.init,\n", " \"n_classes\": self.n_classes,\n", " \"noise_sampler\": self.noise_sampler,\n", " \"num_negative_samples\": self.num_negative_samples,\n", " \"subtract_log_label_prob\": self.subtract_log_label_prob,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def __call__(self, X, target, neg_samples=None, retain_derived=True):\n", " return self.loss(X, target, neg_samples, retain_derived)\n", "\n", " def __str__(self):\n", " keys = [\n", " \"{}={}\".format(k, v)\n", " for k, v in self.hyperparameters.items()\n", " if k not in [\"id\", \"optimizer\"]\n", " ] + [\"optimizer={}\".format(self.optimizer)]\n", " return \"NCELoss({})\".format(\", \".join(keys))\n", "\n", " def freeze(self):\n", " \"\"\"\n", " Freeze the loss parameters at their current values so they can no\n", " longer be updated.\n", " \"\"\"\n", " self.trainable = False\n", "\n", " def unfreeze(self):\n", " \"\"\"Unfreeze the layer parameters so they can be updated.\"\"\"\n", " self.trainable = True\n", "\n", " def flush_gradients(self):\n", " \"\"\"Erase all the layer's derived variables and gradients.\"\"\"\n", " assert self.trainable, \"NCELoss is frozen\"\n", " self.X = []\n", " for k, v in self.derived_variables.items():\n", " self.derived_variables[k] = []\n", "\n", " for k, v in self.gradients.items():\n", " self.gradients[k] = np.zeros_like(v)\n", "\n", " def update(self, cur_loss=None):\n", " \"\"\"\n", " Update the loss parameters using the accrued gradients and optimizer.\n", " Flush all gradients once the update is complete.\n", " \"\"\"\n", " assert self.trainable, \"NCELoss is frozen\"\n", " self.optimizer.step()\n", " for k, v in self.gradients.items():\n", " if k in self.parameters:\n", " self.parameters[k] = self.optimizer(self.parameters[k], v, k, cur_loss)\n", " self.flush_gradients()\n", "\n", 
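"    # NOTE: the block below is an illustrative usage sketch only; it is\n", "    # not executed anywhere in the library. Names such as `vocab_size`,\n", "    # `X_ctx`, and `targets` are hypothetical placeholders, and the\n", "    # `DiscreteSampler` call signature is assumed from\n", "    # `numpy_ml.utils.data_structures`.\n", "    #\n", "    #     from numpy_ml.utils.data_structures import DiscreteSampler\n", "    #\n", "    #     probs = np.ones(vocab_size) / vocab_size   # uniform noise dist.\n", "    #     sampler = DiscreteSampler(probs, log=False, with_replacement=True)\n", "    #     nce = NCELoss(vocab_size, sampler, num_negative_samples=5)\n", "    #     loss, y_pred = nce(X_ctx, targets)  # X_ctx: (n_ex, n_c, n_in)\n", "    #     dLdX = nce.grad()  # backprop; also applies updates to W and b\n", "\n",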
" def loss(self, X, target, neg_samples=None, retain_derived=True):\n", " \"\"\"\n", " Compute the NCE loss for a collection of inputs and associated targets.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_c, n_in)`\n", " Layer input. A minibatch of `n_ex` examples, where each example is\n", " an `n_c` by `n_in` matrix (e.g., the matrix of `n_c` context\n", " embeddings, each of dimensionality `n_in`, for a CBOW model).\n", " target : :py:class:`ndarray ` of shape `(n_ex,)`\n", " Integer indices of the target class(es) for each example in the\n", " minibatch (e.g., the target word id for an example in a CBOW model).\n", " neg_samples : :py:class:`ndarray ` of shape (`num_negative_samples`,) or None\n", " An optional array of negative samples to use during the loss\n", " calculation. These will be used instead of samples draw from\n", " ``self.noise_sampler``. Default is None.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through with regard to this input.\n", " Default is True.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " The NCE loss summed over the minibatch and samples.\n", " y_pred : :py:class:`ndarray ` of shape (`n_ex`, `n_c`)\n", " The network predictions for the conditional probability of each\n", " target given each context: entry (`i`, `j`) gives the predicted\n", " probability of target `i` under context vector `j`.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = X.shape[-1]\n", " self._init_params()\n", "\n", " loss, Z_target, Z_neg, y_pred, y_true, noise_samples = self._loss(\n", " X, target, neg_samples\n", " )\n", "\n", " # cache derived variables for gradient calculation\n", " if retain_derived:\n", " self.X.append(X)\n", "\n", " self.derived_variables[\"y_pred\"].append(y_pred)\n", " self.derived_variables[\"target\"].append(target)\n", " self.derived_variables[\"out_labels\"].append(y_true)\n", " self.derived_variables[\"target_logits\"].append(Z_target)\n", " self.derived_variables[\"noise_samples\"].append(noise_samples)\n", " self.derived_variables[\"noise_logits\"].append(Z_neg)\n", "\n", " return loss, np.squeeze(y_pred[..., :1], -1)\n", "\n", " def _loss(self, X, target, neg_samples):\n", " \"\"\"Actual computation of NCE loss\"\"\"\n", " fstr = \"X must have shape (n_ex, n_c, n_in), but got {} dims instead\"\n", " assert X.ndim == 3, fstr.format(X.ndim)\n", "\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " # sample negative samples from the noise distribution\n", " if neg_samples is None:\n", " neg_samples = self.noise_sampler(self.num_negative_samples)\n", " assert len(neg_samples) == self.num_negative_samples\n", "\n", " # get the probability of the negative sample class and the target\n", " # class under the noise distribution\n", " p_neg_samples = self.noise_sampler.probs[neg_samples]\n", " p_target = np.atleast_2d(self.noise_sampler.probs[target])\n", "\n", " # save the noise samples for debugging\n", " noise_samples = (neg_samples, p_target, p_neg_samples)\n", "\n", " # compute the logit for the negative samples and target\n", " Z_target = X @ W[target].T + b[0, target]\n", " Z_neg = X @ W[neg_samples].T + b[0, neg_samples]\n", "\n", " # subtract the log probability of each label under the noise dist\n", " if self.subtract_log_label_prob:\n", " n, m = Z_target.shape[0], Z_neg.shape[0]\n", " 
Z_target[range(n), ...] -= np.log(p_target)\n", " Z_neg[range(m), ...] -= np.log(p_neg_samples)\n", "\n", " # only retain the probability of the target under its associated\n", " # minibatch example\n", " aa, _, cc = Z_target.shape\n", " Z_target = Z_target[range(aa), :, range(cc)][..., None]\n", "\n", " # p_target = (n_ex, n_c, 1)\n", " # p_neg = (n_ex, n_c, n_samples)\n", " pred_p_target = self.act_fn(Z_target)\n", " pred_p_neg = self.act_fn(Z_neg)\n", "\n", " # if we're in evaluation mode, ignore the negative samples - just\n", " # return the binary cross entropy on the targets\n", " y_pred = pred_p_target\n", " if self.trainable:\n", " # (n_ex, n_c, 1 + n_samples) (target is first column)\n", " y_pred = np.concatenate((y_pred, pred_p_neg), axis=-1)\n", "\n", " n_targets = 1\n", " y_true = np.zeros_like(y_pred)\n", " y_true[..., :n_targets] = 1\n", "\n", " # binary cross entropy\n", " eps = np.finfo(float).eps\n", " np.clip(y_pred, eps, 1 - eps, y_pred)\n", " loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))\n", " return loss, Z_target, Z_neg, y_pred, y_true, noise_samples\n", "\n", " def grad(self, retain_grads=True, update_params=True):\n", " \"\"\"\n", " Compute the gradient of the NCE loss with regard to the inputs,\n", " weights, and biases.\n", "\n", " Parameters\n", " ----------\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", " update_params : bool\n", " Whether to perform a single step of gradient descent on the layer\n", " weights and bias using the calculated gradients. If `retain_grads`\n", " is False, this option is ignored and the parameter gradients are\n", " not updated. Default is True.\n", "\n", " Returns\n", " -------\n", " dLdX : :py:class:`ndarray ` of shape (`n_ex`, `n_in`) or list of arrays\n", " The gradient of the loss with regard to the layer input(s) `X`.\n", " \"\"\"\n", " assert self.trainable, \"NCE loss is frozen\"\n", "\n", " dX = []\n", " for input_idx, x in enumerate(self.X):\n", " dx, dw, db = self._grad(x, input_idx)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dw\n", " self.gradients[\"b\"] += db\n", "\n", " dX = dX[0] if len(self.X) == 1 else dX\n", "\n", " if retain_grads and update_params:\n", " self.update()\n", "\n", " return dX\n", "\n", " def _grad(self, X, input_idx):\n", " \"\"\"Actual computation of gradient wrt. loss weights + input\"\"\"\n", " W, b = self.parameters[\"W\"], self.parameters[\"b\"]\n", "\n", " y_pred = self.derived_variables[\"y_pred\"][input_idx]\n", " target = self.derived_variables[\"target\"][input_idx]\n", " y_true = self.derived_variables[\"out_labels\"][input_idx]\n", " Z_neg = self.derived_variables[\"noise_logits\"][input_idx]\n", " Z_target = self.derived_variables[\"target_logits\"][input_idx]\n", " neg_samples = self.derived_variables[\"noise_samples\"][input_idx][0]\n", "\n", " # the number of target classes per minibatch example\n", " n_targets = 1\n", "\n", " # calculate the grad of the binary cross entropy wrt. 
the network\n", " # predictions\n", " preds, classes = y_pred.flatten(), y_true.flatten()\n", "\n", " dLdp_real = ((1 - classes) / (1 - preds)) - (classes / preds)\n", " dLdp_real = dLdp_real.reshape(*y_pred.shape)\n", "\n", " # partition the gradients into target and negative sample portions\n", " dLdy_pred_target = dLdp_real[..., :n_targets]\n", " dLdy_pred_neg = dLdp_real[..., n_targets:]\n", "\n", " # compute gradients of the loss wrt the data and noise logits\n", " dLdZ_target = dLdy_pred_target * self.act_fn.grad(Z_target)\n", " dLdZ_neg = dLdy_pred_neg * self.act_fn.grad(Z_neg)\n", "\n", " # compute param gradients on target + negative samples\n", " dB_neg = dLdZ_neg.sum(axis=(0, 1))\n", " dB_target = dLdZ_target.sum(axis=(1, 2))\n", "\n", " dW_neg = (dLdZ_neg.transpose(0, 2, 1) @ X).sum(axis=0)\n", " dW_target = (dLdZ_target.transpose(0, 2, 1) @ X).sum(axis=1)\n", "\n", " # TODO: can this be done with np.einsum instead?\n", " dX_target = np.vstack(\n", " [dLdZ_target[[ix]] @ W[[t]] for ix, t in enumerate(target)]\n", " )\n", " dX_neg = dLdZ_neg @ W[neg_samples]\n", "\n", " hits = list(set(target).intersection(set(neg_samples)))\n", " hit_ixs = [np.where(target == h)[0] for h in hits]\n", "\n", " # adjust param gradients if there's an accidental hit\n", " if len(hits) != 0:\n", " hit_ixs = np.concatenate(hit_ixs)\n", " target = np.delete(target, hit_ixs)\n", " dB_target = np.delete(dB_target, hit_ixs)\n", " dW_target = np.delete(dW_target, hit_ixs, 0)\n", "\n", " dX = dX_target + dX_neg\n", "\n", " # use np.add.at to ensure that repeated indices in the target (or\n", " # possibly in neg_samples if sampling is done with replacement) are\n", " # properly accounted for\n", " dB = np.zeros_like(b).flatten()\n", " np.add.at(dB, target, dB_target)\n", " np.add.at(dB, neg_samples, dB_neg)\n", " dB = dB.reshape(*b.shape)\n", "\n", " dW = np.zeros_like(W)\n", " np.add.at(dW, target, dW_target)\n", " np.add.at(dW, neg_samples, dW_neg)\n", "\n", " return dX, dW, dB\n"]} {"path": "numpy_ml/neural_nets/wrappers/__init__.py", "content": ["from .wrappers import *\n"]} {"path": "numpy_ml/neural_nets/wrappers/wrappers.py", "content": ["\"\"\"\n", "A collection of objects thats can wrap / otherwise modify arbitrary neural\n", "network layers.\n", "\"\"\"\n", "\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "\n", "class WrapperBase(ABC):\n", " def __init__(self, wrapped_layer):\n", " \"\"\"An abstract base class for all Wrapper instances\"\"\"\n", " self._base_layer = wrapped_layer\n", " if hasattr(wrapped_layer, \"_base_layer\"):\n", " self._base_layer = wrapped_layer._base_layer\n", " super().__init__()\n", "\n", " @abstractmethod\n", " def _init_wrapper_params(self):\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def forward(self, z, **kwargs):\n", " \"\"\"Overwritten by inherited class\"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def backward(self, out, **kwargs):\n", " \"\"\"Overwritten by inherited class\"\"\"\n", " raise NotImplementedError\n", "\n", " @property\n", " def trainable(self):\n", " \"\"\"Whether the base layer is frozen\"\"\"\n", " return self._base_layer.trainable\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary of the base layer parameters\"\"\"\n", " return self._base_layer.parameters\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the base layer's hyperparameters\"\"\"\n", " hp = self._base_layer.hyperparameters\n", " hpw = 
self._wrapper_hyperparameters\n", "        if \"wrappers\" in hp:\n", "            hp[\"wrappers\"].append(hpw)\n", "        else:\n", "            hp[\"wrappers\"] = [hpw]\n", "        return hp\n", "\n",
"    @property\n", "    def derived_variables(self):\n", "        \"\"\"\n", "        A dictionary of the intermediate values computed during layer\n", "        training.\n", "        \"\"\"\n", "        dv = self._base_layer.derived_variables.copy()\n", "        if \"wrappers\" in dv:\n", "            dv[\"wrappers\"].append(self._wrapper_derived_variables)\n", "        else:\n", "            dv[\"wrappers\"] = [self._wrapper_derived_variables]\n", "        return dv\n", "\n",
"    @property\n", "    def gradients(self):\n", "        \"\"\"A dictionary of the current layer parameter gradients.\"\"\"\n", "        return self._base_layer.gradients\n", "\n",
"    @property\n", "    def act_fn(self):\n", "        \"\"\"The activation function for the base layer.\"\"\"\n", "        return self._base_layer.act_fn\n", "\n",
"    @property\n", "    def X(self):\n", "        \"\"\"The collection of layer inputs.\"\"\"\n", "        return self._base_layer.X\n", "\n",
"    def _init_params(self):\n", "        hp = self._wrapper_hyperparameters\n", "        if \"wrappers\" in self._base_layer.hyperparameters:\n", "            self._base_layer.hyperparameters[\"wrappers\"].append(hp)\n", "        else:\n", "            self._base_layer.hyperparameters[\"wrappers\"] = [hp]\n", "\n",
"    def freeze(self):\n", "        \"\"\"\n", "        Freeze the base layer's parameters at their current values so they can\n", "        no longer be updated.\n", "        \"\"\"\n", "        self._base_layer.freeze()\n", "\n",
"    def unfreeze(self):\n", "        \"\"\"Unfreeze the base layer's parameters so they can be updated.\"\"\"\n", "        self._base_layer.unfreeze()\n", "\n",
"    def flush_gradients(self):\n", "        \"\"\"Erase all the wrapper and base layer's derived variables and gradients.\"\"\"\n", "        assert self.trainable, \"Layer is frozen\"\n", "        self._base_layer.flush_gradients()\n", "\n", "        for k, v in self._wrapper_derived_variables.items():\n", "            self._wrapper_derived_variables[k] = []\n", "\n",
"    def update(self, lr):\n", "        \"\"\"\n", "        Update the base layer's parameters using the accrued gradients and\n", "        layer optimizer. Flush all gradients once the update is complete.\n", "        \"\"\"\n", "        assert self.trainable, \"Layer is frozen\"\n", "        self._base_layer.update(lr)\n", "        self.flush_gradients()\n", "\n",
"    def _set_wrapper_params(self, pdict):\n", "        for k, v in pdict.items():\n", "            if k in self._wrapper_hyperparameters:\n", "                self._wrapper_hyperparameters[k] = v\n", "        return self\n", "\n",
"    def set_params(self, summary_dict):\n", "        \"\"\"\n", "        Set the base layer parameters from a dictionary of values.\n", "\n", "        Parameters\n", "        ----------\n", "        summary_dict : dict\n", "            A dictionary of layer parameters and hyperparameters. 
If a required\n", " parameter or hyperparameter is not included within `summary_dict`,\n", " this method will use the value in the current layer's\n", " :meth:`summary` method.\n", "\n", " Returns\n", " -------\n", " layer : :doc:`Layer ` object\n", " The newly-initialized layer.\n", " \"\"\"\n", " return self._base_layer.set_params(summary_dict)\n", "\n", " def summary(self):\n", " \"\"\"Return a dict of the layer parameters, hyperparameters, and ID.\"\"\"\n", " return {\n", " \"layer\": self.hyperparameters[\"layer\"],\n", " \"layer_wrappers\": [i[\"wrapper\"] for i in self.hyperparameters[\"wrappers\"]],\n", " \"parameters\": self.parameters,\n", " \"hyperparameters\": self.hyperparameters,\n", " }\n", "\n", "\n", "class Dropout(WrapperBase):\n", " def __init__(self, wrapped_layer, p):\n", " \"\"\"\n", " A dropout regularization wrapper.\n", "\n", " Notes\n", " -----\n", " During training, a dropout layer zeroes each element of the layer input\n", " with probability `p` and scales the activation by `1 / (1 - p)` (to reflect\n", " the fact that on average only `(1 - p) * N` units are active on any\n", " training pass). At test time, does not adjust elements of the input at\n", " all (ie., simply computes the identity function).\n", "\n", " Parameters\n", " ----------\n", " wrapped_layer : :doc:`Layer ` instance\n", " The layer to apply dropout to.\n", " p : float in [0, 1)\n", " The dropout propbability during training\n", " \"\"\"\n", " super().__init__(wrapped_layer)\n", " self.p = p\n", " self._init_wrapper_params()\n", " self._init_params()\n", "\n", " def _init_wrapper_params(self):\n", " self._wrapper_derived_variables = {\"dropout_mask\": []}\n", " self._wrapper_hyperparameters = {\"wrapper\": \"Dropout\", \"p\": self.p}\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output with dropout for a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer input, representing the `n_in`-dimensional features for a\n", " minibatch of `n_ex` examples.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " Layer output for each of the `n_ex` examples.\n", " \"\"\"\n", " scaler, mask = 1.0, np.ones(X.shape).astype(bool)\n", " if self.trainable:\n", " scaler = 1.0 / (1.0 - self.p)\n", " mask = np.random.rand(*X.shape) >= self.p\n", " X = mask * X\n", "\n", " if retain_derived:\n", " self._wrapper_derived_variables[\"dropout_mask\"].append(mask)\n", "\n", " return scaler * self._base_layer.forward(X, retain_derived)\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from the base layer's outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, n_out)` or list of arrays\n", " The gradient(s) of the loss wrt. the layer output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in)` or list of arrays\n", " The gradient of the loss wrt. 
the layer input(s) `X`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " dLdy *= 1.0 / (1.0 - self.p)\n", " return self._base_layer.backward(dLdy, retain_grads)\n", "\n", "\n", "def init_wrappers(layer, wrappers_list):\n", " \"\"\"\n", " Initialize the layer wrappers in `wrapper_list` and return a wrapped\n", " `layer` object.\n", "\n", " Parameters\n", " ----------\n", " layer : :doc:`Layer ` instance\n", " The base layer object to apply the wrappers to.\n", " wrappers : list of dicts\n", " A list of parameter dictionaries for a the wrapper objects. The\n", " wrappers are initialized and applied to the the layer sequentially.\n", "\n", " Returns\n", " -------\n", " wrapped_layer : :class:`WrapperBase` instance\n", " The wrapped layer object\n", " \"\"\"\n", " for wr in wrappers_list:\n", " if wr[\"wrapper\"] == \"Dropout\":\n", " layer = Dropout(layer, 1)._set_wrapper_params(wr)\n", " else:\n", " raise NotImplementedError(\"{}\".format(wr[\"wrapper\"]))\n", " return layer\n"]} {"path": "numpy_ml/neural_nets/layers/__init__.py", "content": ["from .layers import *\n"]} {"path": "numpy_ml/neural_nets/layers/layers.py", "content": ["\"\"\"A collection of composable layer objects for building neural networks\"\"\"\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "from ..wrappers import init_wrappers, Dropout\n", "\n", "from ..initializers import (\n", " WeightInitializer,\n", " OptimizerInitializer,\n", " ActivationInitializer,\n", ")\n", "\n", "from ..utils import (\n", " pad1D,\n", " pad2D,\n", " conv1D,\n", " conv2D,\n", " im2col,\n", " col2im,\n", " dilate,\n", " deconv2D_naive,\n", " calc_pad_dims_2D,\n", ")\n", "\n", "\n", "class LayerBase(ABC):\n", " def __init__(self, optimizer=None):\n", " \"\"\"An abstract base class inherited by all neural network layers\"\"\"\n", " self.X = []\n", " self.act_fn = None\n", " self.trainable = True\n", " self.optimizer = OptimizerInitializer(optimizer)()\n", "\n", " self.gradients = {}\n", " self.parameters = {}\n", " self.derived_variables = {}\n", "\n", " super().__init__()\n", "\n", " @abstractmethod\n", " def _init_params(self, **kwargs):\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def forward(self, z, **kwargs):\n", " \"\"\"Perform a forward pass through the layer\"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def backward(self, out, **kwargs):\n", " \"\"\"Perform a backward pass through the layer\"\"\"\n", " raise NotImplementedError\n", "\n", " def freeze(self):\n", " \"\"\"\n", " Freeze the layer parameters at their current values so they can no\n", " longer be updated.\n", " \"\"\"\n", " self.trainable = False\n", "\n", " def unfreeze(self):\n", " \"\"\"Unfreeze the layer parameters so they can be updated.\"\"\"\n", " self.trainable = True\n", "\n", " def flush_gradients(self):\n", " \"\"\"Erase all the layer's derived variables and gradients.\"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", " self.X = []\n", " for k, v in self.derived_variables.items():\n", " self.derived_variables[k] = []\n", "\n", " for k, v in self.gradients.items():\n", " self.gradients[k] = np.zeros_like(v)\n", "\n", " def update(self, cur_loss=None):\n", " \"\"\"\n", " Update the layer parameters using the accrued gradients and layer\n", " optimizer. 
Flush all gradients once the update is complete.\n", " \"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", " self.optimizer.step()\n", " for k, v in self.gradients.items():\n", " if k in self.parameters:\n", " self.parameters[k] = self.optimizer(self.parameters[k], v, k, cur_loss)\n", " self.flush_gradients()\n", "\n", " def set_params(self, summary_dict):\n", " \"\"\"\n", " Set the layer parameters from a dictionary of values.\n", "\n", " Parameters\n", " ----------\n", " summary_dict : dict\n", " A dictionary of layer parameters and hyperparameters. If a required\n", " parameter or hyperparameter is not included within `summary_dict`,\n", " this method will use the value in the current layer's\n", " :meth:`summary` method.\n", "\n", " Returns\n", " -------\n", " layer : :doc:`Layer ` object\n", " The newly-initialized layer.\n", " \"\"\"\n", " layer, sd = self, summary_dict\n", "\n", " # collapse `parameters` and `hyperparameters` nested dicts into a single\n", " # merged dictionary\n", " flatten_keys = [\"parameters\", \"hyperparameters\"]\n", " for k in flatten_keys:\n", " if k in sd:\n", " entry = sd[k]\n", " sd.update(entry)\n", " del sd[k]\n", "\n", " for k, v in sd.items():\n", " if k in self.parameters:\n", " layer.parameters[k] = v\n", " if k in self.hyperparameters:\n", " if k == \"act_fn\":\n", " layer.act_fn = ActivationInitializer(v)()\n", " elif k == \"optimizer\":\n", " layer.optimizer = OptimizerInitializer(sd[k])()\n", " elif k == \"wrappers\":\n", " layer = init_wrappers(layer, sd[k])\n", " elif k not in [\"wrappers\", \"optimizer\"]:\n", " setattr(layer, k, v)\n", " return layer\n", "\n", " def summary(self):\n", " \"\"\"Return a dict of the layer parameters, hyperparameters, and ID.\"\"\"\n", " return {\n", " \"layer\": self.hyperparameters[\"layer\"],\n", " \"parameters\": self.parameters,\n", " \"hyperparameters\": self.hyperparameters,\n", " }\n", "\n", "\n", "class DotProductAttention(LayerBase):\n", " def __init__(self, scale=True, dropout_p=0, init=\"glorot_uniform\", optimizer=None):\n", " r\"\"\"\n", " A single \"attention head\" layer using a dot-product for the scoring function.\n", "\n", " Notes\n", " -----\n", " The equations for a dot product attention layer are:\n", "\n", " .. math::\n", "\n", " \\mathbf{Z} &= \\mathbf{K Q}^\\\\top \\ \\ \\ \\ &&\\text{if scale = False} \\\\\n", " &= \\mathbf{K Q}^\\top / \\sqrt{d_k} \\ \\ \\ \\ &&\\text{if scale = True} \\\\\n", " \\mathbf{Y} &= \\text{dropout}(\\text{softmax}(\\mathbf{Z})) \\mathbf{V}\n", "\n", " Parameters\n", " ----------\n", " scale : bool\n", " Whether to scale the the key-query dot product by the square root\n", " of the key/query vector dimensionality before applying the Softmax.\n", " This is useful, since the scale of dot product will otherwise\n", " increase as query / key dimensions grow. Default is True.\n", " dropout_p : float in [0, 1)\n", " The dropout propbability during training, applied to the output of\n", " the softmax. If 0, no dropout is applied. Default is 0.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " Unused.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None. 
Unused.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Unused\n", " parameters : dict\n", " Unused\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.init = init\n", " self.scale = scale\n", " self.dropout_p = dropout_p\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self.softmax = Dropout(Softmax(), self.dropout_p)\n", " smdv = self.softmax.derived_variables\n", "\n", " self.gradients = {}\n", " self.parameters = {}\n", " self.derived_variables = {\n", " \"attention_weights\": [],\n", " \"dropout_mask\": smdv[\"wrappers\"][0][\"dropout_mask\"],\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"DotProductAttention\",\n", " \"init\": self.init,\n", " \"scale\": self.scale,\n", " \"dropout_p\": self.dropout_p,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def freeze(self):\n", " \"\"\"\n", " Freeze the layer parameters at their current values so they can no\n", " longer be updated.\n", " \"\"\"\n", " self.trainable = False\n", " self.softmax.freeze()\n", "\n", " def unfreeze(self):\n", " \"\"\"Unfreeze the layer parameters so they can be updated.\"\"\"\n", " self.trainable = True\n", " self.softmax.unfreeze()\n", "\n", " def forward(self, Q, K, V, retain_derived=True):\n", " r\"\"\"\n", " Compute the attention-weighted output of a collection of keys, values,\n", " and queries.\n", "\n", " Notes\n", " -----\n", " In the most abstract (ie., hand-wave-y) sense:\n", "\n", " - Query vectors ask questions\n", " - Key vectors advertise their relevancy to questions\n", " - Value vectors give possible answers to questions\n", " - The dot product between Key and Query vectors provides scores for\n", " each of the the `n_ex` different Value vectors\n", "\n", " For a single query and `n` key-value pairs, dot-product attention (with\n", " scaling) is::\n", "\n", " w0 = dropout(softmax( (query @ key[0]) / sqrt(d_k) ))\n", " w1 = dropout(softmax( (query @ key[1]) / sqrt(d_k) ))\n", " ...\n", " wn = dropout(softmax( (query @ key[n]) / sqrt(d_k) ))\n", "\n", " y = np.array([w0, ..., wn]) @ values\n", " (1 \u00d7 n_ex) (n_ex \u00d7 d_v)\n", "\n", " In words, keys and queries are combined via dot-product to produce a\n", " score, which is then passed through a softmax to produce a weight on\n", " each value vector in Values. We elementwise multiply each value vector\n", " by its weight, and then take the elementwise sum of each weighted value\n", " vector to get the :math:`1 \\times d_v` output for the current example.\n", "\n", " In vectorized form,\n", "\n", " .. 
math::\n", "\n", " \\mathbf{Y} = \\text{dropout}(\n", " \\text{softmax}(\\mathbf{KQ}^\\top / \\sqrt{d_k})\n", " ) \\mathbf{V}\n", "\n", " Parameters\n", " ----------\n", " Q : :py:class:`ndarray ` of shape `(n_ex, *, d_k)`\n", " A set of `n_ex` query vectors packed into a single matrix.\n", " Optional middle dimensions can be used to specify, e.g., the number\n", " of parallel attention heads.\n", " K : :py:class:`ndarray ` of shape `(n_ex, *, d_k)`\n", " A set of `n_ex` key vectors packed into a single matrix. Optional\n", " middle dimensions can be used to specify, e.g., the number of\n", " parallel attention heads.\n", " V : :py:class:`ndarray ` of shape `(n_ex, *, d_v)`\n", " A set of `n_ex` value vectors packed into a single matrix. Optional\n", " middle dimensions can be used to specify, e.g., the number of\n", " parallel attention heads.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, *, d_v)`\n", " The attention-weighted output values\n", " \"\"\"\n", " Y, weights = self._fwd(Q, K, V)\n", "\n", " if retain_derived:\n", " self.X.append((Q, K, V))\n", " self.derived_variables[\"attention_weights\"].append(weights)\n", "\n", " return Y\n", "\n", " def _fwd(self, Q, K, V):\n", " \"\"\"Actual computation of forward pass\"\"\"\n", " scale = 1 / np.sqrt(Q.shape[-1]) if self.scale else 1\n", " scores = Q @ K.swapaxes(-2, -1) * scale # attention scores\n", " weights = self.softmax.forward(scores) # attention weights\n", " Y = weights @ V\n", " return Y, weights\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " r\"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, *, d_v)`\n", " The gradient of the loss wrt. the layer output `Y`\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dQ : :py:class:`ndarray ` of shape `(n_ex, *, d_k)` or list of arrays\n", " The gradient of the loss wrt. the layer query matrix/matrices `Q`.\n", " dK : :py:class:`ndarray ` of shape `(n_ex, *, d_k)` or list of arrays\n", " The gradient of the loss wrt. the layer key matrix/matrices `K`.\n", " dV : :py:class:`ndarray ` of shape `(n_ex, *, d_v)` or list of arrays\n", " The gradient of the loss wrt. the layer value matrix/matrices `V`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dQ, dK, dV = [], [], []\n", " weights = self.derived_variables[\"attention_weights\"]\n", " for dy, (q, k, v), w in zip(dLdy, self.X, weights):\n", " dq, dk, dv = self._bwd(dy, q, k, v, w)\n", " dQ.append(dq)\n", " dK.append(dk)\n", " dV.append(dv)\n", "\n", " if len(self.X) == 1:\n", " dQ, dK, dV = dQ[0], dK[0], dV[0]\n", "\n", " return dQ, dK, dV\n", "\n", " def _bwd(self, dy, q, k, v, weights):\n", " \"\"\"Actual computation of the gradient of the loss wrt. 
q, k, and v\"\"\"\n", "        d_k = k.shape[-1]\n", "        scale = 1 / np.sqrt(d_k) if self.scale else 1\n", "\n", "        dV = weights.swapaxes(-2, -1) @ dy\n", "        dWeights = dy @ v.swapaxes(-2, -1)\n", "        dScores = self.softmax.backward(dWeights)\n", "        dQ = dScores @ k * scale\n", "        dK = dScores.swapaxes(-2, -1) @ q * scale\n", "        return dQ, dK, dV\n", "\n", "\n",
"class RBM(LayerBase):\n", "    def __init__(self, n_out, K=1, init=\"glorot_uniform\", optimizer=None):\n", "        \"\"\"\n", "        A Restricted Boltzmann machine with Bernoulli visible and hidden units.\n", "\n",
"        Parameters\n", "        ----------\n", "        n_out : int\n", "            The number of output dimensions/units.\n", "        K : int\n", "            The number of contrastive divergence steps to run before computing\n", "            a single gradient update. Default is 1.\n", "        init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", "            The weight initialization strategy. Default is `'glorot_uniform'`.\n", "        optimizer : str, :doc:`Optimizer ` object, or None\n", "            The optimization strategy to use when performing gradient updates\n", "            within the :meth:`update` method. If None, use the :class:`SGD\n", "            ` optimizer with\n", "            default parameters. Default is None.\n", "\n",
"        Attributes\n", "        ----------\n", "        X : list\n", "            Unused\n", "        gradients : dict\n", "            Dictionary of loss gradients with regard to the layer parameters\n", "        parameters : dict\n", "            Dictionary of layer parameters\n", "        hyperparameters : dict\n", "            Dictionary of layer hyperparameters\n", "        derived_variables : dict\n", "            Dictionary of any intermediate values computed during\n", "            forward/backward propagation.\n", "        \"\"\"  # noqa: E501\n",
"        super().__init__(optimizer)\n", "\n", "        self.K = K  # CD-K\n", "        self.init = init\n", "        self.n_in = None\n", "        self.n_out = n_out\n", "        self.is_initialized = False\n", "        self.act_fn_V = ActivationInitializer(\"Sigmoid\")()\n", "        self.act_fn_H = ActivationInitializer(\"Sigmoid\")()\n", "        self.parameters = {\"W\": None, \"b_in\": None, \"b_out\": None}\n", "\n",
"    def _init_params(self):\n", "        init_weights = WeightInitializer(str(self.act_fn_V), mode=self.init)\n", "\n", "        b_in = np.zeros((1, self.n_in))\n", "        b_out = np.zeros((1, self.n_out))\n", "        W = init_weights((self.n_in, self.n_out))\n", "\n", "        self.parameters = {\"W\": W, \"b_in\": b_in, \"b_out\": b_out}\n", "\n",
"        self.gradients = {\n", "            \"W\": np.zeros_like(W),\n", "            \"b_in\": np.zeros_like(b_in),\n", "            \"b_out\": np.zeros_like(b_out),\n", "        }\n", "\n",
"        self.derived_variables = {\n", "            \"V\": None,\n", "            \"p_H\": None,\n", "            \"p_V_prime\": None,\n", "            \"p_H_prime\": None,\n", "            \"positive_grad\": None,\n", "            \"negative_grad\": None,\n", "        }\n", "        self.is_initialized = True\n", "\n",
"    @property\n", "    def hyperparameters(self):\n", "        \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", "        return {\n", "            \"layer\": \"RBM\",\n", "            \"K\": self.K,\n", "            \"n_in\": self.n_in,\n", "            \"n_out\": self.n_out,\n", "            \"init\": self.init,\n", "            \"optimizer\": {\n", "                \"cache\": self.optimizer.cache,\n", "                \"hyperparameters\": self.optimizer.hyperparameters,\n", "            },\n", "        }\n", "\n",
"    def CD_update(self, X):\n", "        \"\"\"\n", "        Perform a single contrastive divergence-`k` training update using the\n", "        visible inputs `X` as a starting point for the Gibbs sampler.\n", "\n", "        Parameters\n", "        ----------\n", "        X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", "            Layer input, representing the `n_in`-dimensional features for a\n", "            minibatch of `n_ex` examples. 
Each feature in X should ideally be\n", " binary-valued, although it is possible to also train on real-valued\n", " features ranging between (0, 1) (e.g., grayscale images).\n", " \"\"\"\n", " self.forward(X)\n", " self.backward()\n", "\n", " def forward(self, V, K=None, retain_derived=True):\n", " \"\"\"\n", " Perform the CD-`k` \"forward pass\" of visible inputs into hidden units\n", " and back.\n", "\n", " Notes\n", " -----\n", " This implementation follows [1]_'s recommendations for the RBM forward\n", " pass:\n", "\n", " - Use real-valued probabilities for both the data and the visible\n", " unit reconstructions.\n", " - Only the final update of the hidden units should use the actual\n", " probabilities -- all others should be sampled binary states.\n", " - When collecting the pairwise statistics for learning weights or\n", " the individual statistics for learning biases, use the\n", " probabilities, not the binary states.\n", "\n", " References\n", " ----------\n", " .. [1] Hinton, G. (2010). \"A practical guide to training restricted\n", " Boltzmann machines\". *UTML TR 2010-003*\n", "\n", " Parameters\n", " ----------\n", " V : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Visible input, representing the `n_in`-dimensional features for a\n", " minibatch of `n_ex` examples. Each feature in V should ideally be\n", " binary-valued, although it is possible to also train on real-valued\n", " features ranging between (0, 1) (e.g., grayscale images).\n", " K : int\n", " The number of steps of contrastive divergence steps to run before\n", " computing the gradient update. If None, use ``self.K``. Default is\n", " None.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. 
Default\n", " is True.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = V.shape[1]\n", " self._init_params()\n", "\n", " # override self.K if necessary\n", " K = self.K if K is None else K\n", "\n", " W = self.parameters[\"W\"]\n", " b_in = self.parameters[\"b_in\"]\n", " b_out = self.parameters[\"b_out\"]\n", "\n", " # compute hidden unit probabilities\n", " Z_H = V @ W + b_out\n", " p_H = self.act_fn_H.fn(Z_H)\n", "\n", " # sample hidden states (stochastic binary values)\n", " H = np.random.rand(*p_H.shape) <= p_H\n", " H = H.astype(float)\n", "\n", " # always use probabilities when computing gradients\n", " positive_grad = V.T @ p_H\n", "\n", " # perform CD-k\n", " # TODO: use persistent CD-k\n", " # https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf\n", " H_prime = H.copy()\n", " for k in range(K):\n", " # resample v' given h (H_prime is binary for all but final step)\n", " Z_V_prime = H_prime @ W.T + b_in\n", " p_V_prime = self.act_fn_V.fn(Z_V_prime)\n", "\n", " # don't resample visual units - always use raw probabilities!\n", " V_prime = p_V_prime\n", "\n", " # compute p(h' | v')\n", " Z_H_prime = V_prime @ W + b_out\n", " p_H_prime = self.act_fn_H.fn(Z_H_prime)\n", "\n", " # if this is the final iteration of CD, keep hidden state\n", " # probabilities (don't sample)\n", " H_prime = p_H_prime\n", " if k != self.K - 1:\n", " H_prime = np.random.rand(*p_H_prime.shape) <= p_H_prime\n", " H_prime = H_prime.astype(float)\n", "\n", " negative_grad = p_V_prime.T @ p_H_prime\n", "\n", " if retain_derived:\n", " self.derived_variables[\"V\"] = V\n", " self.derived_variables[\"p_H\"] = p_H\n", " self.derived_variables[\"p_V_prime\"] = p_V_prime\n", " self.derived_variables[\"p_H_prime\"] = p_H_prime\n", " self.derived_variables[\"positive_grad\"] = positive_grad\n", " self.derived_variables[\"negative_grad\"] = negative_grad\n", "\n", " def backward(self, retain_grads=True, *args):\n", " \"\"\"\n", " Perform a gradient update on the layer parameters via the contrastive\n", " divergence equations.\n", "\n", " Parameters\n", " ----------\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", " \"\"\"\n", " V = self.derived_variables[\"V\"]\n", " p_H = self.derived_variables[\"p_H\"]\n", " p_V_prime = self.derived_variables[\"p_V_prime\"]\n", " p_H_prime = self.derived_variables[\"p_H_prime\"]\n", " positive_grad = self.derived_variables[\"positive_grad\"]\n", " negative_grad = self.derived_variables[\"negative_grad\"]\n", "\n", " if retain_grads:\n", " self.gradients[\"b_in\"] = V - p_V_prime\n", " self.gradients[\"b_out\"] = p_H - p_H_prime\n", " self.gradients[\"W\"] = positive_grad - negative_grad\n", "\n", " def reconstruct(self, X, n_steps=10, return_prob=False):\n", " \"\"\"\n", " Reconstruct an input `X` by running the trained Gibbs sampler for\n", " `n_steps`-worth of CD-`k`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer input, representing the `n_in`-dimensional features for a\n", " minibatch of `n_ex` examples. Each feature in `X` should ideally be\n", " binary-valued, although it is possible to also train on real-valued\n", " features ranging between (0, 1) (e.g., grayscale images). 
If `X` has\n", " missing values, it may be sufficient to mark them with random\n", " entries and allow the reconstruction to impute them.\n", " n_steps : int\n", " The number of Gibbs sampling steps to perform when generating the\n", " reconstruction. Default is 10.\n", " return_prob : bool\n", " Whether to return the real-valued feature probabilities for the\n", " reconstruction or the binary samples. Default is False.\n", "\n", " Returns\n", " -------\n", " V : :py:class:`ndarray ` of shape `(n_ex, in_ch)`\n", " The reconstruction (or feature probabilities if `return_prob` is\n", " true) of the visual input `X` after running the Gibbs sampler for\n", " `n_steps`.\n", " \"\"\"\n", " self.forward(X, K=n_steps)\n", " p_V_prime = self.derived_variables[\"p_V_prime\"]\n", "\n", " # ignore the gradients produced during this reconstruction\n", " self.flush_gradients()\n", "\n", " # sample V_prime reconstruction if return_prob is False\n", " V = p_V_prime\n", " if not return_prob:\n", " V = (np.random.rand(*p_V_prime.shape) <= p_V_prime).astype(float)\n", " return V\n", "\n", "\n", "#######################################################################\n", "# Layer Ops #\n", "#######################################################################\n", "\n", "\n", "class Add(LayerBase):\n", " def __init__(self, act_fn=None, optimizer=None):\n", " \"\"\"\n", " An \"addition\" layer that returns the sum of its inputs, passed through\n", " an optional nonlinearity.\n", "\n", " Parameters\n", " ----------\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The element-wise output nonlinearity used in computing the final\n", " output. If None, use the identity function :math:`f(x) = x`.\n", " Default is None.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Unused\n", " parameters : dict\n", " Unused\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self.gradients = {}\n", " self.parameters = {}\n", " self.derived_variables = {\"sum\": []}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Sum\",\n", " \"act_fn\": str(self.act_fn),\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " r\"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : list of length `n_inputs`\n", " A list of tensors, all of the same shape.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. 
If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, *)`\n", " The sum over the `n_ex` examples.\n", " \"\"\"\n", " out = X[0].copy()\n", " for i in range(1, len(X)):\n", " out += X[i]\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"sum\"].append(out)\n", " return self.act_fn(out)\n", "\n", " def backward(self, dLdY, retain_grads=True):\n", " r\"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, *)`\n", " The gradient of the loss wrt. the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : list of length `n_inputs`\n", " The gradient of the loss wrt. each input in `X`.\n", " \"\"\"\n", " if not isinstance(dLdY, list):\n", " dLdY = [dLdY]\n", "\n", " X = self.X\n", " _sum = self.derived_variables[\"sum\"]\n", " grads = [self._bwd(dy, x, ss) for dy, x, ss in zip(dLdY, X, _sum)]\n", " return grads[0] if len(X) == 1 else grads\n", "\n", " def _bwd(self, dLdY, X, _sum):\n", " \"\"\"Actual computation of gradient of the loss wrt. each input\"\"\"\n", " grads = [dLdY * self.act_fn.grad(_sum) for _ in X]\n", " return grads\n", "\n", "\n", "class Multiply(LayerBase):\n", " def __init__(self, act_fn=None, optimizer=None):\n", " \"\"\"\n", " A multiplication layer that returns the *elementwise* product of its\n", " inputs, passed through an optional nonlinearity.\n", "\n", " Parameters\n", " ----------\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The element-wise output nonlinearity used in computing the final\n", " output. If None, use the identity function :math:`f(x) = x`.\n", " Default is None.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. 
Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Unused\n", " parameters : dict\n", " Unused\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self.gradients = {}\n", " self.parameters = {}\n", " self.derived_variables = {\"product\": []}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Multiply\",\n", " \"act_fn\": str(self.act_fn),\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " r\"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : list of length `n_inputs`\n", " A list of tensors, all of the same shape.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, *)`\n", " The product over the `n_ex` examples.\n", " \"\"\" # noqa: E501\n", " out = X[0].copy()\n", " for i in range(1, len(X)):\n", " out *= X[i]\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"product\"].append(out)\n", " return self.act_fn(out)\n", "\n", " def backward(self, dLdY, retain_grads=True):\n", " r\"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, *)`\n", " The gradient of the loss wrt. the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : list of length `n_inputs`\n", " The gradient of the loss wrt. each input in `X`.\n", " \"\"\"\n", " if not isinstance(dLdY, list):\n", " dLdY = [dLdY]\n", "\n", " X = self.X\n", " _prod = self.derived_variables[\"product\"]\n", " grads = [self._bwd(dy, x, pr) for dy, x, pr in zip(dLdY, X, _prod)]\n", " return grads[0] if len(X) == 1 else grads\n", "\n", " def _bwd(self, dLdY, X, prod):\n", " \"\"\"Actual computation of gradient of loss wrt. each input\"\"\"\n", " grads = [dLdY * self.act_fn.grad(prod)] * len(X)\n", " for i, x in enumerate(X):\n", " grads = [g * x if j != i else g for j, g in enumerate(grads)]\n", " return grads\n", "\n", "\n", "class Flatten(LayerBase):\n", " def __init__(self, keep_dim=\"first\", optimizer=None):\n", " \"\"\"\n", " Flatten a multidimensional input into a 2D matrix.\n", "\n", " Parameters\n", " ----------\n", " keep_dim : {'first', 'last', -1}\n", " The dimension of the original input to retain. Typically used for\n", " retaining the minibatch dimension.. 
If -1, flatten all dimensions.\n", " Default is 'first'.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Unused\n", " gradients : dict\n", " Unused\n", " parameters : dict\n", " Unused\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.keep_dim = keep_dim\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self.gradients = {}\n", " self.parameters = {}\n", " self.derived_variables = {\"in_dims\": []}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Flatten\",\n", " \"keep_dim\": self.keep_dim,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " r\"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray `\n", " Input volume to flatten.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(*out_dims)`\n", " Flattened output. If `keep_dim` is `'first'`, `X` is reshaped to\n", " ``(X.shape[0], -1)``, otherwise ``(-1, X.shape[0])``.\n", " \"\"\"\n", " if retain_derived:\n", " self.derived_variables[\"in_dims\"].append(X.shape)\n", " if self.keep_dim == -1:\n", " return X.flatten().reshape(1, -1)\n", " rs = (X.shape[0], -1) if self.keep_dim == \"first\" else (-1, X.shape[-1])\n", " return X.reshape(*rs)\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " r\"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(*out_dims)`\n", " The gradient of the loss wrt. the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(*in_dims)` or list of arrays\n", " The gradient of the loss wrt. 
the layer input(s) `X`.\n", " \"\"\" # noqa: E501\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", " in_dims = self.derived_variables[\"in_dims\"]\n", " out = [dy.reshape(*dims) for dy, dims in zip(dLdy, in_dims)]\n", " return out[0] if len(dLdy) == 1 else out\n", "\n", "\n", "#######################################################################\n", "# Normalization Layers #\n", "#######################################################################\n", "\n", "\n", "class BatchNorm2D(LayerBase):\n", " def __init__(self, momentum=0.9, epsilon=1e-5, optimizer=None):\n", " \"\"\"\n", " A batch normalization layer for two-dimensional inputs with an\n", " additional channel dimension.\n", "\n", " Notes\n", " -----\n", " BatchNorm is an attempt address the problem of internal covariate\n", " shift (ICS) during training by normalizing layer inputs.\n", "\n", " ICS refers to the change in the distribution of layer inputs during\n", " training as a result of the changing parameters of the previous\n", " layer(s). ICS can make it difficult to train models with saturating\n", " nonlinearities, and in general can slow training by requiring a lower\n", " learning rate.\n", "\n", " Equations [train]::\n", "\n", " Y = scaler * norm(X) + intercept\n", " norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)\n", "\n", " Equations [test]::\n", "\n", " Y = scaler * running_norm(X) + intercept\n", " running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)\n", "\n", " In contrast to :class:`LayerNorm2D`, the BatchNorm layer calculates\n", " the mean and var across the *batch* rather than the output features.\n", " This has two disadvantages:\n", "\n", " 1. It is highly affected by batch size: smaller mini-batch sizes\n", " increase the variance of the estimates for the global mean and\n", " variance.\n", "\n", " 2. It is difficult to apply in RNNs -- one must fit a separate\n", " BatchNorm layer for *each* time-step.\n", "\n", " Parameters\n", " ----------\n", " momentum : float\n", " The momentum term for the running mean/running std calculations.\n", " The closer this is to 1, the less weight will be given to the\n", " mean/std of the current batch (i.e., higher smoothing). Default is\n", " 0.9.\n", " epsilon : float\n", " A small smoothing constant to use during computation of ``norm(X)``\n", " to avoid divide-by-zero errors. Default is 1e-5.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. 
Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.in_ch = None\n", " self.out_ch = None\n", " self.epsilon = epsilon\n", " self.momentum = momentum\n", " self.parameters = {\n", " \"scaler\": None,\n", " \"intercept\": None,\n", " \"running_var\": None,\n", " \"running_mean\": None,\n", " }\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " scaler = np.random.rand(self.in_ch)\n", " intercept = np.zeros(self.in_ch)\n", "\n", " # init running mean and std at 0 and 1, respectively\n", " running_mean = np.zeros(self.in_ch)\n", " running_var = np.ones(self.in_ch)\n", "\n", " self.parameters = {\n", " \"scaler\": scaler,\n", " \"intercept\": intercept,\n", " \"running_var\": running_var,\n", " \"running_mean\": running_mean,\n", " }\n", "\n", " self.gradients = {\n", " \"scaler\": np.zeros_like(scaler),\n", " \"intercept\": np.zeros_like(intercept),\n", " }\n", "\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"BatchNorm2D\",\n", " \"act_fn\": None,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"epsilon\": self.epsilon,\n", " \"momentum\": self.momentum,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def reset_running_stats(self):\n", " \"\"\"Reset the running mean and variance estimates to 0 and 1.\"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", " self.parameters[\"running_mean\"] = np.zeros(self.in_ch)\n", " self.parameters[\"running_var\"] = np.ones(self.in_ch)\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Notes\n", " -----\n", " Equations [train]::\n", "\n", " Y = scaler * norm(X) + intercept\n", " norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)\n", "\n", " Equations [test]::\n", "\n", " Y = scaler * running_norm(X) + intercept\n", " running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)\n", "\n", " In contrast to :class:`LayerNorm2D`, the BatchNorm layer calculates the\n", " mean and var across the *batch* rather than the output features.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume containing the `in_rows` x `in_cols`-dimensional\n", " features for a minibatch of `n_ex` examples.\n", " retain_derived : bool\n", " Whether to use the current intput to adjust the running mean and\n", " running_var computations. Setting this to False is the same as\n", " freezing the layer for the current input. 
Default is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Layer output for each of the `n_ex` examples.\n", " \"\"\" # noqa: E501\n", " if not self.is_initialized:\n", " self.in_ch = self.out_ch = X.shape[3]\n", " self._init_params()\n", "\n", " ep = self.hyperparameters[\"epsilon\"]\n", " mm = self.hyperparameters[\"momentum\"]\n", " rm = self.parameters[\"running_mean\"]\n", " rv = self.parameters[\"running_var\"]\n", "\n", " scaler = self.parameters[\"scaler\"]\n", " intercept = self.parameters[\"intercept\"]\n", "\n", " # if the layer is frozen, use our running mean/std values rather\n", " # than the mean/std values for the new batch\n", " X_mean = self.parameters[\"running_mean\"]\n", " X_var = self.parameters[\"running_var\"]\n", "\n", " if self.trainable and retain_derived:\n", " X_mean, X_var = X.mean(axis=(0, 1, 2)), X.var(axis=(0, 1, 2)) # , ddof=1)\n", " self.parameters[\"running_mean\"] = mm * rm + (1.0 - mm) * X_mean\n", " self.parameters[\"running_var\"] = mm * rv + (1.0 - mm) * X_var\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", "\n", " N = (X - X_mean) / np.sqrt(X_var + ep)\n", " y = scaler * N + intercept\n", " return y\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss wrt. the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss wrt. the layer input `X`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " for dy, x in zip(dLdy, X):\n", " dx, dScaler, dIntercept = self._bwd(dy, x)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"scaler\"] += dScaler\n", " self.gradients[\"intercept\"] += dIntercept\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X):\n", " \"\"\"Computation of gradient of loss wrt. 
X, scaler, and intercept\"\"\"\n", " scaler = self.parameters[\"scaler\"]\n", " ep = self.hyperparameters[\"epsilon\"]\n", "\n", " # reshape to 2D, retaining channel dim\n", " X_shape = X.shape\n", " X = np.reshape(X, (-1, X.shape[3]))\n", " dLdy = np.reshape(dLdy, (-1, dLdy.shape[3]))\n", "\n", " # apply 1D batchnorm backward pass on reshaped array\n", " n_ex, in_ch = X.shape\n", " X_mean, X_var = X.mean(axis=0), X.var(axis=0) # , ddof=1)\n", "\n", " N = (X - X_mean) / np.sqrt(X_var + ep)\n", " dIntercept = dLdy.sum(axis=0)\n", " dScaler = np.sum(dLdy * N, axis=0)\n", "\n", " dN = dLdy * scaler\n", " dX = (n_ex * dN - dN.sum(axis=0) - N * (dN * N).sum(axis=0)) / (\n", " n_ex * np.sqrt(X_var + ep)\n", " )\n", "\n", " return np.reshape(dX, X_shape), dScaler, dIntercept\n", "\n", "\n", "class BatchNorm1D(LayerBase):\n", " def __init__(self, momentum=0.9, epsilon=1e-5, optimizer=None):\n", " \"\"\"\n", " A batch normalization layer for 1D inputs.\n", "\n", " Notes\n", " -----\n", " BatchNorm is an attempt address the problem of internal covariate\n", " shift (ICS) during training by normalizing layer inputs.\n", "\n", " ICS refers to the change in the distribution of layer inputs during\n", " training as a result of the changing parameters of the previous\n", " layer(s). ICS can make it difficult to train models with saturating\n", " nonlinearities, and in general can slow training by requiring a lower\n", " learning rate.\n", "\n", " Equations [train]::\n", "\n", " Y = scaler * norm(X) + intercept\n", " norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)\n", "\n", " Equations [test]::\n", "\n", " Y = scaler * running_norm(X) + intercept\n", " running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)\n", "\n", " In contrast to :class:`LayerNorm1D`, the BatchNorm layer calculates\n", " the mean and var across the *batch* rather than the output features.\n", " This has two disadvantages:\n", "\n", " 1. It is highly affected by batch size: smaller mini-batch sizes\n", " increase the variance of the estimates for the global mean and\n", " variance.\n", "\n", " 2. It is difficult to apply in RNNs -- one must fit a separate\n", " BatchNorm layer for *each* time-step.\n", "\n", " Parameters\n", " ----------\n", " momentum : float\n", " The momentum term for the running mean/running std calculations.\n", " The closer this is to 1, the less weight will be given to the\n", " mean/std of the current batch (i.e., higher smoothing). Default is\n", " 0.9.\n", " epsilon : float\n", " A small smoothing constant to use during computation of ``norm(X)``\n", " to avoid divide-by-zero errors. Default is 1e-5.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. 
Only updated if the `retain_derived` argument was set to True.\n",
"        gradients : dict\n",
"            Dictionary of loss gradients with regard to the layer parameters\n",
"        parameters : dict\n",
"            Dictionary of layer parameters\n",
"        hyperparameters : dict\n",
"            Dictionary of layer hyperparameters\n",
"        derived_variables : dict\n",
"            Dictionary of any intermediate values computed during\n",
"            forward/backward propagation.\n",
"        \"\"\"  # noqa: E501\n",
"        super().__init__(optimizer)\n",
"\n",
"        self.n_in = None\n",
"        self.n_out = None\n",
"        self.epsilon = epsilon\n",
"        self.momentum = momentum\n",
"        self.parameters = {\n",
"            \"scaler\": None,\n",
"            \"intercept\": None,\n",
"            \"running_var\": None,\n",
"            \"running_mean\": None,\n",
"        }\n",
"        self.is_initialized = False\n",
"\n",
"    def _init_params(self):\n",
"        scaler = np.random.rand(self.n_in)\n",
"        intercept = np.zeros(self.n_in)\n",
"\n",
"        # init running mean and std at 0 and 1, respectively\n",
"        running_mean = np.zeros(self.n_in)\n",
"        running_var = np.ones(self.n_in)\n",
"\n",
"        self.parameters = {\n",
"            \"scaler\": scaler,\n",
"            \"intercept\": intercept,\n",
"            \"running_mean\": running_mean,\n",
"            \"running_var\": running_var,\n",
"        }\n",
"\n",
"        self.gradients = {\n",
"            \"scaler\": np.zeros_like(scaler),\n",
"            \"intercept\": np.zeros_like(intercept),\n",
"        }\n",
"        self.is_initialized = True\n",
"\n",
"    @property\n",
"    def hyperparameters(self):\n",
"        \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n",
"        return {\n",
"            \"layer\": \"BatchNorm1D\",\n",
"            \"act_fn\": None,\n",
"            \"n_in\": self.n_in,\n",
"            \"n_out\": self.n_out,\n",
"            \"epsilon\": self.epsilon,\n",
"            \"momentum\": self.momentum,\n",
"            \"optimizer\": {\n",
"                \"cache\": self.optimizer.cache,\n",
"                \"hyperparameters\": self.optimizer.hyperparameters,\n",
"            },\n",
"        }\n",
"\n",
"    def reset_running_stats(self):\n",
"        \"\"\"Reset the running mean and variance estimates to 0 and 1.\"\"\"\n",
"        assert self.trainable, \"Layer is frozen\"\n",
"        self.parameters[\"running_mean\"] = np.zeros(self.n_in)\n",
"        self.parameters[\"running_var\"] = np.ones(self.n_in)\n",
"\n",
"    def forward(self, X, retain_derived=True):\n",
"        \"\"\"\n",
"        Compute the layer output on a single minibatch.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n",
"            Layer input, representing the `n_in`-dimensional features for a\n",
"            minibatch of `n_ex` examples.\n",
"        retain_derived : bool\n",
"            Whether to use the current input to adjust the running mean and\n",
"            running_var computations. Setting this to False is the same as\n",
"            freezing the layer for the current input. 
Default is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer output for each of the `n_ex` examples\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = self.n_out = X.shape[1]\n", " self._init_params()\n", "\n", " ep = self.hyperparameters[\"epsilon\"]\n", " mm = self.hyperparameters[\"momentum\"]\n", " rm = self.parameters[\"running_mean\"]\n", " rv = self.parameters[\"running_var\"]\n", "\n", " scaler = self.parameters[\"scaler\"]\n", " intercept = self.parameters[\"intercept\"]\n", "\n", " # if the layer is frozen, use our running mean/std values rather\n", " # than the mean/std values for the new batch\n", " X_mean = self.parameters[\"running_mean\"]\n", " X_var = self.parameters[\"running_var\"]\n", "\n", " if self.trainable and retain_derived:\n", " X_mean, X_var = X.mean(axis=0), X.var(axis=0) # , ddof=1)\n", " self.parameters[\"running_mean\"] = mm * rm + (1.0 - mm) * X_mean\n", " self.parameters[\"running_var\"] = mm * rv + (1.0 - mm) * X_var\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", "\n", " N = (X - X_mean) / np.sqrt(X_var + ep)\n", " y = scaler * N + intercept\n", " return y\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. the layer input `X`.\n", " \"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " for dy, x in zip(dLdy, X):\n", " dx, dScaler, dIntercept = self._bwd(dy, x)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"scaler\"] += dScaler\n", " self.gradients[\"intercept\"] += dIntercept\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X):\n", " \"\"\"Computation of gradient of loss wrt X, scaler, and intercept\"\"\"\n", " scaler = self.parameters[\"scaler\"]\n", " ep = self.hyperparameters[\"epsilon\"]\n", "\n", " n_ex, n_in = X.shape\n", " X_mean, X_var = X.mean(axis=0), X.var(axis=0) # , ddof=1)\n", "\n", " N = (X - X_mean) / np.sqrt(X_var + ep)\n", " dIntercept = dLdy.sum(axis=0)\n", " dScaler = np.sum(dLdy * N, axis=0)\n", "\n", " dN = dLdy * scaler\n", " dX = (n_ex * dN - dN.sum(axis=0) - N * (dN * N).sum(axis=0)) / (\n", " n_ex * np.sqrt(X_var + ep)\n", " )\n", "\n", " return dX, dScaler, dIntercept\n", "\n", "\n", "class LayerNorm2D(LayerBase):\n", " def __init__(self, epsilon=1e-5, optimizer=None):\n", " \"\"\"\n", " A layer normalization layer for 2D inputs with an additional channel\n", " dimension.\n", "\n", " Notes\n", " -----\n", " In contrast to :class:`BatchNorm2D`, the LayerNorm layer calculates the\n", " mean and variance across *features* rather than examples in the batch\n", " ensuring that the mean and variance estimates are independent of batch\n", " size and permitting straightforward application in RNNs.\n", "\n", " Equations [train & test]::\n", "\n", " Y = scaler * norm(X) + intercept\n", " norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)\n", "\n", " Also in contrast to 
:class:`BatchNorm2D`, `scaler` and `intercept` are applied\n", " *elementwise* to ``norm(X)``.\n", "\n", " Parameters\n", " ----------\n", " epsilon : float\n", " A small smoothing constant to use during computation of ``norm(X)``\n", " to avoid divide-by-zero errors. Default is 1e-5.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.in_ch = None\n", " self.out_ch = None\n", " self.epsilon = epsilon\n", " self.parameters = {\"scaler\": None, \"intercept\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self, X_shape):\n", " n_ex, in_rows, in_cols, in_ch = X_shape\n", "\n", " scaler = np.random.rand(in_rows, in_cols, in_ch)\n", " intercept = np.zeros((in_rows, in_cols, in_ch))\n", "\n", " self.parameters = {\"scaler\": scaler, \"intercept\": intercept}\n", "\n", " self.gradients = {\n", " \"scaler\": np.zeros_like(scaler),\n", " \"intercept\": np.zeros_like(intercept),\n", " }\n", "\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"LayerNorm2D\",\n", " \"act_fn\": None,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"epsilon\": self.epsilon,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Notes\n", " -----\n", " Equations [train & test]::\n", "\n", " Y = scaler * norm(X) + intercept\n", " norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume containing the `in_rows` by `in_cols`-dimensional\n", " features for a minibatch of `n_ex` examples.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. 
Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Layer output for each of the `n_ex` examples.\n", " \"\"\" # noqa: E501\n", " if not self.is_initialized:\n", " self.in_ch = self.out_ch = X.shape[3]\n", " self._init_params(X.shape)\n", "\n", " scaler = self.parameters[\"scaler\"]\n", " ep = self.hyperparameters[\"epsilon\"]\n", " intercept = self.parameters[\"intercept\"]\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", "\n", " X_var = X.var(axis=(1, 2, 3), keepdims=True)\n", " X_mean = X.mean(axis=(1, 2, 3), keepdims=True)\n", " lnorm = (X - X_mean) / np.sqrt(X_var + ep)\n", " y = scaler * lnorm + intercept\n", " return y\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss wrt. the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss wrt. the layer input `X`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " for dy, x in zip(dLdy, X):\n", " dx, dScaler, dIntercept = self._bwd(dy, x)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"scaler\"] += dScaler\n", " self.gradients[\"intercept\"] += dIntercept\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dy, X):\n", " \"\"\"Computation of gradient of the loss wrt X, scaler, intercept\"\"\"\n", " scaler = self.parameters[\"scaler\"]\n", " ep = self.hyperparameters[\"epsilon\"]\n", "\n", " X_mean = X.mean(axis=(1, 2, 3), keepdims=True)\n", " X_var = X.var(axis=(1, 2, 3), keepdims=True)\n", " lnorm = (X - X_mean) / np.sqrt(X_var + ep)\n", "\n", " dLnorm = dy * scaler\n", " dIntercept = dy.sum(axis=0)\n", " dScaler = np.sum(dy * lnorm, axis=0)\n", "\n", " n_in = np.prod(X.shape[1:])\n", " lnorm = lnorm.reshape(-1, n_in)\n", " dLnorm = dLnorm.reshape(lnorm.shape)\n", " X_var = X_var.reshape(X_var.shape[:2])\n", "\n", " dX = (\n", " n_in * dLnorm\n", " - dLnorm.sum(axis=1, keepdims=True)\n", " - lnorm * (dLnorm * lnorm).sum(axis=1, keepdims=True)\n", " ) / (n_in * np.sqrt(X_var + ep))\n", "\n", " # reshape X gradients back to proper dimensions\n", " return np.reshape(dX, X.shape), dScaler, dIntercept\n", "\n", "\n", "class LayerNorm1D(LayerBase):\n", " def __init__(self, epsilon=1e-5, optimizer=None):\n", " \"\"\"\n", " A layer normalization layer for 1D inputs.\n", "\n", " Notes\n", " -----\n", " In contrast to :class:`BatchNorm1D`, the LayerNorm layer calculates the\n", " mean and variance across *features* rather than examples in the batch\n", " ensuring that the mean and variance estimates are independent of batch\n", " size and permitting straightforward application in RNNs.\n", "\n", " Equations [train & test]::\n", "\n", " Y = scaler * norm(X) + intercept\n", " norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)\n", "\n", " Also in contrast to :class:`BatchNorm1D`, `scaler` and `intercept` are applied\n", " *elementwise* to ``norm(X)``.\n", "\n", " Parameters\n", " ----------\n", " epsilon : 
float\n", " A small smoothing constant to use during computation of ``norm(X)``\n", " to avoid divide-by-zero errors. Default is 1e-5.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.n_in = None\n", " self.n_out = None\n", " self.epsilon = epsilon\n", " self.parameters = {\"scaler\": None, \"intercept\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " scaler = np.random.rand(self.n_in)\n", " intercept = np.zeros(self.n_in)\n", "\n", " self.parameters = {\"scaler\": scaler, \"intercept\": intercept}\n", "\n", " self.gradients = {\n", " \"scaler\": np.zeros_like(scaler),\n", " \"intercept\": np.zeros_like(intercept),\n", " }\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"LayerNorm1D\",\n", " \"act_fn\": None,\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_out,\n", " \"epsilon\": self.epsilon,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer input, representing the `n_in`-dimensional features for a\n", " minibatch of `n_ex` examples.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer output for each of the `n_ex` examples.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = self.n_out = X.shape[1]\n", " self._init_params()\n", "\n", " scaler = self.parameters[\"scaler\"]\n", " ep = self.hyperparameters[\"epsilon\"]\n", " intercept = self.parameters[\"intercept\"]\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", "\n", " X_mean, X_var = X.mean(axis=1, keepdims=True), X.var(axis=1, keepdims=True)\n", " lnorm = (X - X_mean) / np.sqrt(X_var + ep)\n", " y = scaler * lnorm + intercept\n", " return y\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. 
the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. the layer input `X`.\n", " \"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " for dy, x in zip(dLdy, X):\n", " dx, dScaler, dIntercept = self._bwd(dy, x)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"scaler\"] += dScaler\n", " self.gradients[\"intercept\"] += dIntercept\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X):\n", " \"\"\"Computation of gradient of the loss wrt X, scaler, intercept\"\"\"\n", " scaler = self.parameters[\"scaler\"]\n", " ep = self.hyperparameters[\"epsilon\"]\n", "\n", " n_ex, n_in = X.shape\n", " X_mean, X_var = X.mean(axis=1, keepdims=True), X.var(axis=1, keepdims=True)\n", "\n", " lnorm = (X - X_mean) / np.sqrt(X_var + ep)\n", " dIntercept = dLdy.sum(axis=0)\n", " dScaler = np.sum(dLdy * lnorm, axis=0)\n", "\n", " dLnorm = dLdy * scaler\n", " dX = (\n", " n_in * dLnorm\n", " - dLnorm.sum(axis=1, keepdims=True)\n", " - lnorm * (dLnorm * lnorm).sum(axis=1, keepdims=True)\n", " ) / (n_in * np.sqrt(X_var + ep))\n", "\n", " return dX, dScaler, dIntercept\n", "\n", "\n", "#######################################################################\n", "# MLP Layers #\n", "#######################################################################\n", "\n", "\n", "class Embedding(LayerBase):\n", " def __init__(\n", " self, n_out, vocab_size, pool=None, init=\"glorot_uniform\", optimizer=None,\n", " ):\n", " \"\"\"\n", " An embedding layer.\n", "\n", " Notes\n", " -----\n", " Equations::\n", "\n", " Y = W[x]\n", "\n", " NB. This layer must be the first in a neural network as the gradients\n", " do not get passed back through to the inputs.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimensionality of the embeddings\n", " vocab_size : int\n", " The total number of items in the vocabulary. All integer indices\n", " are expected to range between 0 and `vocab_size - 1`.\n", " pool : {'sum', 'mean', None}\n", " If not None, apply this function to the collection of `n_in`\n", " encodings in each example to produce a single, pooled embedding.\n", " Default is None.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. 
Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", " fstr = \"'pool' must be either 'sum', 'mean', or None but got '{}'\"\n", " assert pool in [\"sum\", \"mean\", None], fstr.format(pool)\n", "\n", " self.init = init\n", " self.pool = pool\n", " self.n_out = n_out\n", " self.vocab_size = vocab_size\n", " self.parameters = {\"W\": None}\n", " self.is_initialized = False\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " init_weights = WeightInitializer(\"Affine(slope=1, intercept=0)\", mode=self.init)\n", " W = init_weights((self.vocab_size, self.n_out))\n", "\n", " self.parameters = {\"W\": W}\n", " self.derived_variables = {}\n", " self.gradients = {\"W\": np.zeros_like(W)}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Embedding\",\n", " \"init\": self.init,\n", " \"pool\": self.pool,\n", " \"n_out\": self.n_out,\n", " \"vocab_size\": self.vocab_size,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def lookup(self, ids):\n", " \"\"\"\n", " Return the embeddings associated with the IDs in `ids`.\n", "\n", " Parameters\n", " ----------\n", " word_ids : :py:class:`ndarray ` of shape (`M`,)\n", " An array of `M` IDs to retrieve embeddings for.\n", "\n", " Returns\n", " -------\n", " embeddings : :py:class:`ndarray ` of shape (`M`, `n_out`)\n", " The embedding vectors for each of the `M` IDs.\n", " \"\"\"\n", " return self.parameters[\"W\"][ids]\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Notes\n", " -----\n", " Equations:\n", " Y = W[x]\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)` or list of length `n_ex`\n", " Layer input, representing a minibatch of `n_ex` examples. If\n", " ``self.pool`` is None, each example must consist of exactly `n_in`\n", " integer token IDs. Otherwise, `X` can be a ragged array, with each\n", " example consisting of a variable number of token IDs.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. 
If False, this suggests the layer\n", " will not be expected to backprop through with regard to this input.\n", " Default is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_in, n_out)`\n", " Embeddings for each coordinate of each of the `n_ex` examples\n", " \"\"\" # noqa: E501\n", " # if X is a ragged array\n", " if isinstance(X, list) and not issubclass(X[0].dtype.type, np.integer):\n", " fstr = \"Input to Embedding layer must be an array of integers, got '{}'\"\n", " raise TypeError(fstr.format(X[0].dtype.type))\n", "\n", " # otherwise\n", " if isinstance(X, np.ndarray) and not issubclass(X.dtype.type, np.integer):\n", " fstr = \"Input to Embedding layer must be an array of integers, got '{}'\"\n", " raise TypeError(fstr.format(X.dtype.type))\n", "\n", " Y = self._fwd(X)\n", " if retain_derived:\n", " self.X.append(X)\n", " return Y\n", "\n", " def _fwd(self, X):\n", " \"\"\"Actual computation of forward pass\"\"\"\n", " W = self.parameters[\"W\"]\n", " if self.pool is None:\n", " emb = W[X]\n", " elif self.pool == \"sum\":\n", " emb = np.array([W[x].sum(axis=0) for x in X])[:, None, :]\n", " elif self.pool == \"mean\":\n", " emb = np.array([W[x].mean(axis=0) for x in X])[:, None, :]\n", " return emb\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to embedding weights.\n", "\n", " Notes\n", " -----\n", " Because the items in `X` are interpreted as indices, we cannot compute\n", " the gradient of the layer output wrt. `X`.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, n_in, n_out)` or list of arrays\n", " The gradient(s) of the loss wrt. the layer output(s)\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " for dy, x in zip(dLdy, self.X):\n", " dw = self._bwd(dy, x)\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dw\n", "\n", " def _bwd(self, dLdy, X):\n", " \"\"\"Actual computation of gradient of the loss wrt. W\"\"\"\n", " dW = np.zeros_like(self.parameters[\"W\"])\n", " dLdy = dLdy.reshape(-1, self.n_out)\n", "\n", " if self.pool is None:\n", " for ix, v_id in enumerate(X.flatten()):\n", " dW[v_id] += dLdy[ix]\n", " elif self.pool == \"sum\":\n", " for ix, v_ids in enumerate(X):\n", " dW[v_ids] += dLdy[ix]\n", " elif self.pool == \"mean\":\n", " for ix, v_ids in enumerate(X):\n", " dW[v_ids] += dLdy[ix] / len(v_ids)\n", " return dW\n", "\n", "\n", "class FullyConnected(LayerBase):\n", " def __init__(self, n_out, act_fn=None, init=\"glorot_uniform\", optimizer=None):\n", " r\"\"\"\n", " A fully-connected (dense) layer.\n", "\n", " Notes\n", " -----\n", " A fully connected layer computes the function\n", "\n", " .. math::\n", "\n", " \\mathbf{Y} = f( \\mathbf{WX} + \\mathbf{b} )\n", "\n", " where `f` is the activation nonlinearity, **W** and **b** are\n", " parameters of the layer, and **X** is the minibatch of input examples.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimensionality of the layer output\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The element-wise output nonlinearity used in computing `Y`. If None,\n", " use the identity function :math:`f(X) = X`. 
Default is None.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.init = init\n", " self.n_in = None\n", " self.n_out = n_out\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.parameters = {\"W\": None, \"b\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " b = np.zeros((1, self.n_out))\n", " W = init_weights((self.n_in, self.n_out))\n", "\n", " self.parameters = {\"W\": W, \"b\": b}\n", " self.derived_variables = {\"Z\": []}\n", " self.gradients = {\"W\": np.zeros_like(W), \"b\": np.zeros_like(b)}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"FullyConnected\",\n", " \"init\": self.init,\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_out,\n", " \"act_fn\": str(self.act_fn),\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer input, representing the `n_in`-dimensional features for a\n", " minibatch of `n_ex` examples.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. 
Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " Layer output for each of the `n_ex` examples.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = X.shape[1]\n", " self._init_params()\n", "\n", " Y, Z = self._fwd(X)\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"Z\"].append(Z)\n", "\n", " return Y\n", "\n", " def _fwd(self, X):\n", " \"\"\"Actual computation of forward pass\"\"\"\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " Z = X @ W + b\n", " Y = self.act_fn(Z)\n", " return Y, Z\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, n_out)` or list of arrays\n", " The gradient(s) of the loss wrt. the layer output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in)` or list of arrays\n", " The gradient of the loss wrt. the layer input(s) `X`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " for dy, x in zip(dLdy, X):\n", " dx, dw, db = self._bwd(dy, x)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dw\n", " self.gradients[\"b\"] += db\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X):\n", " \"\"\"Actual computation of gradient of the loss wrt. X, W, and b\"\"\"\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " Z = X @ W + b\n", " dZ = dLdy * self.act_fn.grad(Z)\n", "\n", " dX = dZ @ W.T\n", " dW = X.T @ dZ\n", " dB = dZ.sum(axis=0, keepdims=True)\n", " return dX, dW, dB\n", "\n", " def _bwd2(self, dLdy, X, dLdy_bwd):\n", " \"\"\"Compute second derivatives / deriv. of loss wrt. dX, dW, and db\"\"\"\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " dZ = self.act_fn.grad(X @ W + b)\n", " ddZ = self.act_fn.grad2(X @ W + b)\n", "\n", " ddX = dLdy @ W * dZ\n", " ddW = dLdy.T @ (dLdy_bwd * dZ)\n", " ddB = np.sum(dLdy @ W * dLdy_bwd * ddZ, axis=0, keepdims=True)\n", " return ddX, ddW, ddB\n", "\n", "\n", "class Softmax(LayerBase):\n", " def __init__(self, dim=-1, optimizer=None):\n", " r\"\"\"\n", " A softmax nonlinearity layer.\n", "\n", " Notes\n", " -----\n", " This is implemented as a layer rather than an activation primarily\n", " because it requires retaining the layer input in order to compute the\n", " softmax gradients properly. In other words, in contrast to other\n", " simple activations, the softmax function and its gradient are not\n", " computed elementwise, and thus are more easily expressed as a layer.\n", "\n", " The softmax function computes:\n", "\n", " .. math::\n", "\n", " y_i = \\frac{e^{x_i}}{\\sum_j e^{x_j}}\n", "\n", " where :math:`x_i` is the `i` th element of input example **x**.\n", "\n", " Parameters\n", " ----------\n", " dim: int\n", " The dimension in `X` along which the softmax will be computed.\n", " Default is -1.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. 
If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None. Unused for this layer.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.dim = dim\n", " self.n_in = None\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self.gradients = {}\n", " self.parameters = {}\n", " self.derived_variables = {}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"SoftmaxLayer\",\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_in,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer input, representing the `n_in`-dimensional features for a\n", " minibatch of `n_ex` examples.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " Layer output for each of the `n_ex` examples.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = X.shape[1]\n", " self._init_params()\n", "\n", " Y = self._fwd(X)\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", "\n", " return Y\n", "\n", " def _fwd(self, X):\n", " \"\"\"Actual computation of softmax forward pass\"\"\"\n", " # center data to avoid overflow\n", " e_X = np.exp(X - np.max(X, axis=self.dim, keepdims=True))\n", " return e_X / e_X.sum(axis=self.dim, keepdims=True)\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, n_out)` or list of arrays\n", " The gradient(s) of the loss wrt. the layer output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. 
the layer input `X`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " for dy, x in zip(dLdy, X):\n", " dx = self._bwd(dy, x)\n", " dX.append(dx)\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X):\n", " \"\"\"\n", " Actual computation of the gradient of the loss wrt. the input X.\n", "\n", " The Jacobian, J, of the softmax for input x = [x1, ..., xn] is:\n", " J[i, j] =\n", " softmax(x_i) * (1 - softmax(x_j)) if i = j\n", " -softmax(x_i) * softmax(x_j) if i != j\n", " where\n", " x_n is input example n (ie., the n'th row in X)\n", " \"\"\"\n", " dX = []\n", " for dy, x in zip(dLdy, X):\n", " dxi = []\n", " for dyi, xi in zip(*np.atleast_2d(dy, x)):\n", " yi = self._fwd(xi.reshape(1, -1)).reshape(-1, 1)\n", " dyidxi = np.diagflat(yi) - yi @ yi.T # jacobian wrt. input sample xi\n", " dxi.append(dyi @ dyidxi)\n", " dX.append(dxi)\n", " return np.array(dX).reshape(*X.shape)\n", "\n", "\n", "class SparseEvolution(LayerBase):\n", " def __init__(\n", " self,\n", " n_out,\n", " zeta=0.3,\n", " epsilon=20,\n", " act_fn=None,\n", " init=\"glorot_uniform\",\n", " optimizer=None,\n", " ):\n", " r\"\"\"\n", " A sparse Erdos-Renyi layer with evolutionary rewiring via the sparse\n", " evolutionary training (SET) algorithm.\n", "\n", " Notes\n", " -----\n", " .. math::\n", "\n", " Y = f( (\\mathbf{W} \\odot \\mathbf{W}_{mask}) \\mathbf{X} + \\mathbf{b} )\n", "\n", " where :math:`\\odot` is the elementwise multiplication operation, `f` is\n", " the layer activation function, and :math:`\\mathbf{W}_{mask}` is an\n", " evolved binary mask.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimensionality of the layer output\n", " zeta : float\n", " Proportion of the positive and negative weights closest to zero to\n", " drop after each training update. Default is 0.3.\n", " epsilon : float\n", " Layer sparsity parameter. Default is 20.\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The element-wise output nonlinearity used in computing `Y`. If None,\n", " use the identity function :math:`f(X) = X`. Default is None.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with default\n", " parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. 
Only updated if the `retain_derived` argument was set to True.\n",
"        gradients : dict\n",
"            Dictionary of loss gradients with regard to the layer parameters\n",
"        parameters : dict\n",
"            Dictionary of layer parameters\n",
"        hyperparameters : dict\n",
"            Dictionary of layer hyperparameters\n",
"        derived_variables : dict\n",
"            Dictionary of any intermediate values computed during\n",
"            forward/backward propagation.\n",
"        \"\"\"  # noqa: E501\n",
"        super().__init__(optimizer)\n",
"\n",
"        self.init = init\n",
"        self.n_in = None\n",
"        self.zeta = zeta\n",
"        self.n_out = n_out\n",
"        self.epsilon = epsilon\n",
"        self.act_fn = ActivationInitializer(act_fn)()\n",
"        self.parameters = {\"W\": None, \"b\": None}\n",
"        self.is_initialized = False\n",
"\n",
"    def _init_params(self):\n",
"        init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n",
"\n",
"        b = np.zeros((1, self.n_out))\n",
"        W = init_weights((self.n_in, self.n_out))\n",
"\n",
"        # convert a fully connected base layer into a sparse layer\n",
"        n_in, n_out = W.shape\n",
"        p = (self.epsilon * (n_in + n_out)) / (n_in * n_out)\n",
"        mask = np.random.binomial(1, p, size=W.shape)\n",
"\n",
"        self.derived_variables = {\"Z\": []}\n",
"        self.parameters = {\"W\": W, \"b\": b, \"W_mask\": mask}\n",
"        self.gradients = {\"W\": np.zeros_like(W), \"b\": np.zeros_like(b)}\n",
"        self.is_initialized = True\n",
"\n",
"    @property\n",
"    def hyperparameters(self):\n",
"        \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n",
"        return {\n",
"            \"layer\": \"SparseEvolutionary\",\n",
"            \"init\": self.init,\n",
"            \"zeta\": self.zeta,\n",
"            \"n_in\": self.n_in,\n",
"            \"n_out\": self.n_out,\n",
"            \"epsilon\": self.epsilon,\n",
"            \"act_fn\": str(self.act_fn),\n",
"            \"optimizer\": {\n",
"                \"cache\": self.optimizer.cache,\n",
"                \"hyperparameters\": self.optimizer.hyperparameters,\n",
"            },\n",
"        }\n",
"\n",
"    def forward(self, X, retain_derived=True):\n",
"        \"\"\"\n",
"        Compute the layer output on a single minibatch.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n",
"            Layer input, representing the `n_in`-dimensional features for a\n",
"            minibatch of `n_ex` examples.\n",
"        retain_derived : bool\n",
"            Whether to retain the variables calculated during the forward pass\n",
"            for use later during backprop. If False, this suggests the layer\n",
"            will not be expected to backprop through wrt. this input. Default\n",
"            is True.\n",
"\n",
"        Returns\n",
"        -------\n",
"        Y : :py:class:`ndarray ` of shape `(n_ex, n_out)`\n",
"            Layer output for each of the `n_ex` examples.\n",
"        \"\"\"\n",
"        if not self.is_initialized:\n",
"            self.n_in = X.shape[1]\n",
"            self._init_params()\n",
"\n",
"        Y, Z = self._fwd(X)\n",
"\n",
"        if retain_derived:\n",
"            self.X.append(X)\n",
"            self.derived_variables[\"Z\"].append(Z)\n",
"\n",
"        return Y\n",
"\n",
"    def _fwd(self, X):\n",
"        \"\"\"Actual computation of forward pass\"\"\"\n",
"        W = self.parameters[\"W\"]\n",
"        b = self.parameters[\"b\"]\n",
"        W_mask = self.parameters[\"W_mask\"]\n",
"\n",
"        Z = X @ (W * W_mask) + b\n",
"        Y = self.act_fn(Z)\n",
"        return Y, Z\n",
"\n",
"    def backward(self, dLdy, retain_grads=True):\n",
"        \"\"\"\n",
"        Backprop from layer outputs to inputs\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        dLdy : :py:class:`ndarray ` of shape `(n_ex, n_out)` or list of arrays\n",
"            The gradient(s) of the loss wrt. the layer output(s).\n",
"        retain_grads : bool\n",
"            Whether to include the intermediate parameter gradients computed\n",
"            during the backward pass in the final parameter update. Default is\n",
"            True.\n",
"\n",
"        Returns\n",
"        -------\n",
"        dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n",
"            The gradient of the loss wrt. the layer input `X`.\n",
"        \"\"\"  # noqa: E501\n",
"        assert self.trainable, \"Layer is frozen\"\n",
"        if not isinstance(dLdy, list):\n",
"            dLdy = [dLdy]\n",
"\n",
"        dX = []\n",
"        X = self.X\n",
"        for dy, x in zip(dLdy, X):\n",
"            dx, dw, db = self._bwd(dy, x)\n",
"            dX.append(dx)\n",
"\n",
"            if retain_grads:\n",
"                self.gradients[\"W\"] += dw\n",
"                self.gradients[\"b\"] += db\n",
"\n",
"        return dX[0] if len(X) == 1 else dX\n",
"\n",
"    def _bwd(self, dLdy, X):\n",
"        \"\"\"Actual computation of gradient of the loss wrt. X, W, and b\"\"\"\n",
"        W = self.parameters[\"W\"]\n",
"        b = self.parameters[\"b\"]\n",
"        W_sparse = W * self.parameters[\"W_mask\"]\n",
"\n",
"        Z = X @ W_sparse + b\n",
"        dZ = dLdy * self.act_fn.grad(Z)\n",
"\n",
"        dX = dZ @ W_sparse.T\n",
"        dW = X.T @ dZ\n",
"        dB = dZ.sum(axis=0, keepdims=True)\n",
"        return dX, dW, dB\n",
"\n",
"    def _bwd2(self, dLdy, X, dLdy_bwd):\n",
"        \"\"\"Compute second derivatives / deriv. of loss wrt. dX, dW, and db\"\"\"\n",
"        W = self.parameters[\"W\"]\n",
"        b = self.parameters[\"b\"]\n",
"        W_sparse = W * self.parameters[\"W_mask\"]\n",
"\n",
"        dZ = self.act_fn.grad(X @ W_sparse + b)\n",
"        ddZ = self.act_fn.grad2(X @ W_sparse + b)\n",
"\n",
"        ddX = dLdy @ W * dZ\n",
"        ddW = dLdy.T @ (dLdy_bwd * dZ)\n",
"        ddB = np.sum(dLdy @ W_sparse * dLdy_bwd * ddZ, axis=0, keepdims=True)\n",
"        return ddX, ddW, ddB\n",
"\n",
"    def update(self):\n",
"        \"\"\"\n",
"        Update parameters using current gradients and evolve network\n",
"        connections via SET.\n",
"        \"\"\"\n",
"        assert self.trainable, \"Layer is frozen\"\n",
"        for k, v in self.gradients.items():\n",
"            if k in self.parameters:\n",
"                self.parameters[k] = self.optimizer(self.parameters[k], v, k)\n",
"        self.flush_gradients()\n",
"        self._evolve_connections()\n",
"\n",
"    def _evolve_connections(self):\n",
"        assert self.trainable, \"Layer is frozen\"\n",
"        W = self.parameters[\"W\"]\n",
"        W_mask = self.parameters[\"W_mask\"]\n",
"        W_flat = (W * W_mask).reshape(-1)\n",
"\n",
"        k = int(np.prod(W.shape) * self.zeta)\n",
"\n",
"        (p_ix,) = np.where(W_flat > 0)\n",
"        (n_ix,) = np.where(W_flat < 0)\n",
"\n",
"        # remove the k largest negative and k smallest positive weights\n",
"        k_smallest_p = p_ix[np.argsort(W_flat[p_ix])][:k]\n",
"        k_largest_n = n_ix[np.argsort(W_flat[n_ix])][-k:]\n",
"        n_rewired = len(k_smallest_p) + len(k_largest_n)\n",
"\n",
"        self.mask = np.ones_like(W_flat)\n",
"        self.mask[k_largest_n] = 0\n",
"        self.mask[k_smallest_p] = 0\n",
"\n",
"        (zero_ixs,) = np.where(self.mask == 0)\n",
"\n",
"        # resample new connections and update mask\n",
"        np.random.shuffle(zero_ixs)\n",
"        self.mask[zero_ixs[:n_rewired]] = 1\n",
"        self.mask = self.mask.reshape(*W.shape)\n",
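"\n",
"\n",
"# Illustrative usage sketch for ``SparseEvolution`` (a hypothetical helper,\n",
"# not part of the library API). The minibatch shape, layer size, and\n",
"# hyperparameters below are arbitrary, chosen so that the Erdos-Renyi\n",
"# connection probability p = epsilon * (n_in + n_out) / (n_in * n_out)\n",
"# stays below 1.\n",
"def _sparse_evolution_demo():\n",
"    \"\"\"Minimal forward/backward/update round trip for ``SparseEvolution``.\"\"\"\n",
"    X = np.random.rand(32, 100)  # random (n_ex, n_in) minibatch\n",
"    layer = SparseEvolution(n_out=50, zeta=0.3, epsilon=20, act_fn=\"ReLU\")\n",
"    Y = layer.forward(X)  # (32, 50) activations computed through W * W_mask\n",
"    layer.backward(np.ones_like(Y))  # accumulate gradients for W and b\n",
"    layer.update()  # SGD step, then SET rewiring of the connection mask\n",
"    return Y\n",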
the layer output(s).\n",
" retain_grads : bool\n",
" Whether to include the intermediate parameter gradients computed\n",
" during the backward pass in the final parameter update. Default is\n",
" True.\n",
"\n",
" Returns\n",
" -------\n",
" dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n",
" The gradient of the loss wrt. the layer input `X`.\n",
" \"\"\" # noqa: E501\n",
" assert self.trainable, \"Layer is frozen\"\n",
" if not isinstance(dLdy, list):\n",
" dLdy = [dLdy]\n",
"\n",
" dX = []\n",
" X = self.X\n",
" for dy, x in zip(dLdy, X):\n",
" dx, dw, db = self._bwd(dy, x)\n",
" dX.append(dx)\n",
"\n",
" if retain_grads:\n",
" self.gradients[\"W\"] += dw\n",
" self.gradients[\"b\"] += db\n",
"\n",
" return dX[0] if len(X) == 1 else dX\n",
"\n",
" def _bwd(self, dLdy, X):\n",
" \"\"\"Actual computation of gradient of the loss wrt. X, W, and b\"\"\"\n",
" W = self.parameters[\"W\"]\n",
" b = self.parameters[\"b\"]\n",
" W_sparse = W * self.parameters[\"W_mask\"]\n",
"\n",
" Z = X @ W_sparse + b\n",
" dZ = dLdy * self.act_fn.grad(Z)\n",
"\n",
" dX = dZ @ W_sparse.T\n",
" dW = X.T @ dZ\n",
" dB = dZ.sum(axis=0, keepdims=True)\n",
" return dX, dW, dB\n",
"\n",
" def _bwd2(self, dLdy, X, dLdy_bwd):\n",
" \"\"\"Compute second derivatives / deriv. of loss wrt. dX, dW, and db\"\"\"\n",
" W = self.parameters[\"W\"]\n",
" b = self.parameters[\"b\"]\n",
" W_sparse = W * self.parameters[\"W_mask\"]\n",
"\n",
" dZ = self.act_fn.grad(X @ W_sparse + b)\n",
" ddZ = self.act_fn.grad2(X @ W_sparse + b)\n",
"\n",
" ddX = dLdy @ W * dZ\n",
" ddW = dLdy.T @ (dLdy_bwd * dZ)\n",
" ddB = np.sum(dLdy @ W_sparse * dLdy_bwd * ddZ, axis=0, keepdims=True)\n",
" return ddX, ddW, ddB\n",
"\n",
" def update(self):\n",
" \"\"\"\n",
" Update parameters using current gradients and evolve network\n",
" connections via SET.\n",
" \"\"\"\n",
" assert self.trainable, \"Layer is frozen\"\n",
" for k, v in self.gradients.items():\n",
" if k in self.parameters:\n",
" self.parameters[k] = self.optimizer(self.parameters[k], v, k)\n",
" self.flush_gradients()\n",
" self._evolve_connections()\n",
"\n",
" def _evolve_connections(self):\n",
" assert self.trainable, \"Layer is frozen\"\n",
" W = self.parameters[\"W\"]\n",
" W_mask = self.parameters[\"W_mask\"]\n",
" W_flat = (W * W_mask).reshape(-1)\n",
"\n",
" k = int(np.prod(W.shape) * self.zeta)\n",
"\n",
" (p_ix,) = np.where(W_flat > 0)\n",
" (n_ix,) = np.where(W_flat < 0)\n",
"\n",
" # drop the k negative and k positive weights closest to zero\n",
" k_smallest_p = p_ix[np.argsort(W_flat[p_ix])][:k]\n",
" k_largest_n = n_ix[np.argsort(W_flat[n_ix])][-k:]\n",
" n_rewired = len(k_smallest_p) + len(k_largest_n)\n",
"\n",
" self.mask = np.ones_like(W_flat)\n",
" self.mask[k_largest_n] = 0\n",
" self.mask[k_smallest_p] = 0\n",
"\n",
" (zero_ixs,) = np.where(self.mask == 0)\n",
"\n",
" # resample new connections and update mask\n",
" np.random.shuffle(zero_ixs)\n",
" self.mask[zero_ixs[:n_rewired]] = 1\n",
" self.mask = self.mask.reshape(*W.shape)\n",
"\n",
" # store the evolved mask so subsequent forward/backward passes use it\n",
" self.parameters[\"W_mask\"] = self.mask\n",
"\n",
"\n",
"#######################################################################\n",
"# Convolutional Layers #\n",
"#######################################################################\n",
"\n",
"\n",
"class Conv1D(LayerBase):\n",
" def __init__(\n",
" self,\n",
" out_ch,\n",
" kernel_width,\n",
" pad=0,\n",
" stride=1,\n",
" dilation=0,\n",
" act_fn=None,\n",
" init=\"glorot_uniform\",\n",
" optimizer=None,\n",
" ):\n",
" \"\"\"\n",
" Apply a one-dimensional convolution kernel over an input volume.\n",
"\n",
" Notes\n",
" -----\n",
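# --- Illustrative sketch (not part of the library): the SET prune/regrow logic used by
# SparseEvolution, restated in plain NumPy. The names `epsilon`, `zeta`, and
# `prune_and_regrow` are ad hoc for this example; the layer wraps the same idea inside
# `_init_params` and `_evolve_connections`.
import numpy as np

rng = np.random.RandomState(0)
n_in, n_out, epsilon, zeta = 64, 32, 20, 0.3

# Erdos-Renyi sparsity: connection probability ~ epsilon * (n_in + n_out) / (n_in * n_out)
p = epsilon * (n_in + n_out) / (n_in * n_out)
W = rng.randn(n_in, n_out) * 0.1
mask = rng.binomial(1, p, size=W.shape)

def prune_and_regrow(W, mask, zeta, rng):
    """Drop the zeta-fraction of masked weights closest to zero, regrow at random."""
    flat = (W * mask).reshape(-1)
    k = int(flat.size * zeta)
    (pos,) = np.where(flat > 0)
    (neg,) = np.where(flat < 0)
    drop = np.concatenate([pos[np.argsort(flat[pos])][:k],    # smallest positives
                           neg[np.argsort(flat[neg])][-k:]])  # negatives closest to 0
    new_mask = mask.reshape(-1).copy()
    new_mask[drop] = 0
    (off,) = np.where(new_mask == 0)
    rng.shuffle(off)
    new_mask[off[: len(drop)]] = 1        # regrow the same number of connections
    return new_mask.reshape(W.shape)

mask = prune_and_regrow(W, mask, zeta, rng)
print("active connections:", int(mask.sum()), "of", mask.size)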
" Equations::\n", "\n", " out = act_fn(pad(X) * W + b)\n", " out_dim = floor(1 + (n_rows_in + pad_left + pad_right - kernel_width) / stride)\n", "\n", " where '`*`' denotes the cross-correlation operation with stride `s` and dilation `d`.\n", "\n", " Parameters\n", " ----------\n", " out_ch : int\n", " The number of filters/kernels to compute in the current layer\n", " kernel_width : int\n", " The width of a single 1D filter/kernel in the current layer\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``Y[t]``. If None, use the\n", " identity function :math:`f(x) = x` by default. Default is None.\n", " pad : int, tuple, or {'same', 'causal'}\n", " The number of rows/columns to zero-pad the input with. If `'same'`,\n", " calculate padding to ensure the output length matches in the input\n", " length. If `'causal'` compute padding such that the output both has\n", " the same length as the input AND ``output[t]`` does not depend on\n", " ``input[t + 1:]``. Default is 0.\n", " stride : int\n", " The stride/hop of the convolution kernels as they move over the\n", " input volume. Default is 1.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Effective kernel\n", " shape after dilation is: ``[kernel_rows * (d + 1) - d, kernel_cols\n", " * (d + 1) - d]``. Default is 0.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. 
Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.pad = pad\n", " self.init = init\n", " self.in_ch = None\n", " self.out_ch = out_ch\n", " self.stride = stride\n", " self.dilation = dilation\n", " self.kernel_width = kernel_width\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.parameters = {\"W\": None, \"b\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " W = init_weights((self.kernel_width, self.in_ch, self.out_ch))\n", " b = np.zeros((1, 1, self.out_ch))\n", "\n", " self.parameters = {\"W\": W, \"b\": b}\n", " self.gradients = {\"W\": np.zeros_like(W), \"b\": np.zeros_like(b)}\n", " self.derived_variables = {\"Z\": [], \"out_rows\": [], \"out_cols\": []}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Conv1D\",\n", " \"pad\": self.pad,\n", " \"init\": self.init,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"stride\": self.stride,\n", " \"dilation\": self.dilation,\n", " \"act_fn\": str(self.act_fn),\n", " \"kernel_width\": self.kernel_width,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output given input volume `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, l_in, in_ch)`\n", " The input volume consisting of `n_ex` examples, each of length\n", " `l_in` and with `in_ch` input channels\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, l_out, out_ch)`\n", " The layer output.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.in_ch = X.shape[2]\n", " self._init_params()\n", "\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " n_ex, l_in, in_ch = X.shape\n", " s, p, d = self.stride, self.pad, self.dilation\n", "\n", " # pad the input and perform the forward convolution\n", " Z = conv1D(X, W, s, p, d) + b\n", " Y = self.act_fn(Z)\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"Z\"].append(Z)\n", " self.derived_variables[\"out_rows\"].append(Z.shape[1])\n", " self.derived_variables[\"out_cols\"].append(Z.shape[2])\n", "\n", " return Y\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Compute the gradient of the loss with respect to the layer parameters.\n", "\n", " Notes\n", " -----\n", " Relies on :meth:`~numpy_ml.neural_nets.utils.im2col` and\n", " :meth:`~numpy_ml.neural_nets.utils.col2im` to vectorize the\n", " gradient calculation. 
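# --- Illustrative sketch (assumptions noted): how 'same' vs. 'causal' padding for a 1D
# convolution can be computed so that the output length equals the input length at
# stride 1, mirroring the `pad` options described in the docstring above.
# `same_pad_1d` and `causal_pad_1d` are hypothetical helpers, not functions exported by
# this module.
def same_pad_1d(kernel_width, dilation=0):
    eff = kernel_width * (dilation + 1) - dilation
    total = eff - 1                        # total zeros needed for stride 1
    return total // 2, total - total // 2  # (left, right)

def causal_pad_1d(kernel_width, dilation=0):
    eff = kernel_width * (dilation + 1) - dilation
    return eff - 1, 0  # all padding on the left => output[t] ignores input[t + 1:]

print(same_pad_1d(5))    # (2, 2)
print(causal_pad_1d(5))  # (4, 0)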
See the private method :meth:`_backward_naive`\n", " for a more straightforward implementation.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, l_out, out_ch)` or list of arrays\n", " The gradient(s) of the loss with respect to the layer output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, l_in, in_ch)`\n", " The gradient of the loss with respect to the layer input volume.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " X = self.X\n", " Z = self.derived_variables[\"Z\"]\n", "\n", " dX = []\n", " for dy, x, z in zip(dLdy, X, Z):\n", " dx, dw, db = self._bwd(dy, x, z)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dw\n", " self.gradients[\"b\"] += db\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X, Z):\n", " \"\"\"Actual computation of gradient of the loss wrt. X, W, and b\"\"\"\n", " W = self.parameters[\"W\"]\n", "\n", " # add a row dimension to X, W, and dZ to permit us to use im2col/col2im\n", " X2D = np.expand_dims(X, axis=1)\n", " W2D = np.expand_dims(W, axis=0)\n", " dLdZ = np.expand_dims(dLdy * self.act_fn.grad(Z), axis=1)\n", "\n", " d = self.dilation\n", " fr, fc, in_ch, out_ch = W2D.shape\n", " n_ex, l_out, out_ch = dLdy.shape\n", " fr, fc, s = 1, self.kernel_width, self.stride\n", "\n", " # use pad1D here in order to correctly handle self.pad = 'causal',\n", " # which isn't defined for pad2D\n", " _, p = pad1D(X, self.pad, self.kernel_width, s, d)\n", " p2D = (0, 0, p[0], p[1])\n", "\n", " # columnize W, X, and dLdy\n", " dLdZ_col = dLdZ.transpose(3, 1, 2, 0).reshape(out_ch, -1)\n", " W_col = W2D.transpose(3, 2, 0, 1).reshape(out_ch, -1).T\n", " X_col, _ = im2col(X2D, W2D.shape, p2D, s, d)\n", "\n", " # compute gradients via matrix multiplication and reshape\n", " dB = dLdZ_col.sum(axis=1).reshape(1, 1, -1)\n", " dW = (dLdZ_col @ X_col.T).reshape(out_ch, in_ch, fr, fc).transpose(2, 3, 1, 0)\n", "\n", " # reshape columnized dX back into the same format as the input volume\n", " dX_col = W_col @ dLdZ_col\n", " dX = col2im(dX_col, X2D.shape, W2D.shape, p2D, s, d).transpose(0, 2, 3, 1)\n", "\n", " return np.squeeze(dX, axis=1), np.squeeze(dW, axis=0), dB\n", "\n", " def _backward_naive(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " A slower (ie., non-vectorized) but more straightforward implementation\n", " of the gradient computations for a 2D conv layer.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, l_out, out_ch)` or list of arrays\n", " The gradient(s) of the loss with respect to the layer output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. 
Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, l_in, in_ch)`\n", " The gradient of the loss with respect to the layer input volume.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", " Zs = self.derived_variables[\"Z\"]\n", "\n", " Xs, d = self.X, self.dilation\n", " fw, s, p = self.kernel_width, self.stride, self.pad\n", "\n", " dXs = []\n", " for X, Z, dy in zip(Xs, Zs, dLdy):\n", " n_ex, l_out, out_ch = dy.shape\n", " X_pad, (pr1, pr2) = pad1D(X, p, self.kernel_width, s, d)\n", "\n", " dX = np.zeros_like(X_pad)\n", " dZ = dy * self.act_fn.grad(Z)\n", "\n", " dW, dB = np.zeros_like(W), np.zeros_like(b)\n", " for m in range(n_ex):\n", " for i in range(l_out):\n", " for c in range(out_ch):\n", " # compute window boundaries w. stride and dilation\n", " i0, i1 = i * s, (i * s) + fw * (d + 1) - d\n", "\n", " wc = W[:, :, c]\n", " kernel = dZ[m, i, c]\n", " window = X_pad[m, i0 : i1 : (d + 1), :]\n", "\n", " dB[:, :, c] += kernel\n", " dW[:, :, c] += window * kernel\n", " dX[m, i0 : i1 : (d + 1), :] += wc * kernel\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dW\n", " self.gradients[\"b\"] += dB\n", "\n", " pr2 = None if pr2 == 0 else -pr2\n", " dXs.append(dX[:, pr1:pr2, :])\n", " return dXs[0] if len(Xs) == 1 else dXs\n", "\n", "\n", "class Conv2D(LayerBase):\n", " def __init__(\n", " self,\n", " out_ch,\n", " kernel_shape,\n", " pad=0,\n", " stride=1,\n", " dilation=0,\n", " act_fn=None,\n", " optimizer=None,\n", " init=\"glorot_uniform\",\n", " ):\n", " \"\"\"\n", " Apply a two-dimensional convolution kernel over an input volume.\n", "\n", " Notes\n", " -----\n", " Equations::\n", "\n", " out = act_fn(pad(X) * W + b)\n", " n_rows_out = floor(1 + (n_rows_in + pad_left + pad_right - filter_rows) / stride)\n", " n_cols_out = floor(1 + (n_cols_in + pad_top + pad_bottom - filter_cols) / stride)\n", "\n", " where `'*'` denotes the cross-correlation operation with stride `s` and\n", " dilation `d`.\n", "\n", " Parameters\n", " ----------\n", " out_ch : int\n", " The number of filters/kernels to compute in the current layer\n", " kernel_shape : 2-tuple\n", " The dimension of a single 2D filter/kernel in the current layer\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``Y[t]``. If None, use the\n", " identity function :math:`f(X) = X` by default. Default is None.\n", " pad : int, tuple, or 'same'\n", " The number of rows/columns to zero-pad the input with. Default is\n", " 0.\n", " stride : int\n", " The stride/hop of the convolution kernels as they move over the\n", " input volume. Default is 1.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Effective kernel\n", " shape after dilation is: ``[kernel_rows * (d + 1) - d, kernel_cols\n", " * (d + 1) - d]``. Default is 0.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. 
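# --- Illustrative check (not part of the class): the Conv2D output-shape equations above,
# applied to a concrete case. The naive cross-correlation below is written only for this
# example and assumes stride 1, no dilation, and a single channel.
import numpy as np

def conv2d_out_shape(in_rows, in_cols, filter_rows, filter_cols, pad, stride):
    out_r = int(np.floor(1 + (in_rows + 2 * pad - filter_rows) / stride))
    out_c = int(np.floor(1 + (in_cols + 2 * pad - filter_cols) / stride))
    return out_r, out_c

X = np.arange(25, dtype=float).reshape(5, 5)
K = np.ones((3, 3))
out_r, out_c = conv2d_out_shape(5, 5, 3, 3, pad=0, stride=1)  # -> (3, 3)

Y = np.zeros((out_r, out_c))
for i in range(out_r):
    for j in range(out_c):
        Y[i, j] = np.sum(X[i : i + 3, j : j + 3] * K)  # cross-correlation, no kernel flip

assert Y.shape == (3, 3)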
Default is None.\n", "\n", " Attributes\n", " ----------\n", " X : list\n", " Running list of inputs to the :meth:`forward ` method since the last call to :meth:`update `. Only updated if the `retain_derived` argument was set to True.\n", " gradients : dict\n", " Dictionary of loss gradients with regard to the layer parameters\n", " parameters : dict\n", " Dictionary of layer parameters\n", " hyperparameters : dict\n", " Dictionary of layer hyperparameters\n", " derived_variables : dict\n", " Dictionary of any intermediate values computed during\n", " forward/backward propagation.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.pad = pad\n", " self.init = init\n", " self.in_ch = None\n", " self.out_ch = out_ch\n", " self.stride = stride\n", " self.dilation = dilation\n", " self.kernel_shape = kernel_shape\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.parameters = {\"W\": None, \"b\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " fr, fc = self.kernel_shape\n", " W = init_weights((fr, fc, self.in_ch, self.out_ch))\n", " b = np.zeros((1, 1, 1, self.out_ch))\n", "\n", " self.parameters = {\"W\": W, \"b\": b}\n", " self.gradients = {\"W\": np.zeros_like(W), \"b\": np.zeros_like(b)}\n", " self.derived_variables = {\"Z\": [], \"out_rows\": [], \"out_cols\": []}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Conv2D\",\n", " \"pad\": self.pad,\n", " \"init\": self.init,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"stride\": self.stride,\n", " \"dilation\": self.dilation,\n", " \"act_fn\": str(self.act_fn),\n", " \"kernel_shape\": self.kernel_shape,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output given input volume `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The input volume consisting of `n_ex` examples, each with dimension\n", " (`in_rows`, `in_cols`, `in_ch`).\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. 
Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The layer output.\n", " \"\"\" # noqa: E501\n", " if not self.is_initialized:\n", " self.in_ch = X.shape[3]\n", " self._init_params()\n", "\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " n_ex, in_rows, in_cols, in_ch = X.shape\n", " s, p, d = self.stride, self.pad, self.dilation\n", "\n", " # pad the input and perform the forward convolution\n", " Z = conv2D(X, W, s, p, d) + b\n", " Y = self.act_fn(Z)\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"Z\"].append(Z)\n", " self.derived_variables[\"out_rows\"].append(Z.shape[1])\n", " self.derived_variables[\"out_cols\"].append(Z.shape[2])\n", "\n", " return Y\n", "\n", " def backward(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " Compute the gradient of the loss with respect to the layer parameters.\n", "\n", " Notes\n", " -----\n", " Relies on :meth:`~numpy_ml.neural_nets.utils.im2col` and\n", " :meth:`~numpy_ml.neural_nets.utils.col2im` to vectorize the\n", " gradient calculation.\n", "\n", " See the private method :meth:`_backward_naive` for a more straightforward\n", " implementation.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)` or list of arrays\n", " The gradient(s) of the loss with respect to the layer output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss with respect to the layer input volume.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " dX = []\n", " X = self.X\n", " Z = self.derived_variables[\"Z\"]\n", "\n", " for dy, x, z in zip(dLdy, X, Z):\n", " dx, dw, db = self._bwd(dy, x, z)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dw\n", " self.gradients[\"b\"] += db\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdy, X, Z):\n", " \"\"\"Actual computation of gradient of the loss wrt. 
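# --- Illustrative sketch (hypothetical helper, much simpler than the library's im2col):
# turn each receptive field of a single-channel image into a column so that the
# convolution used in the forward/backward passes becomes one matrix multiplication.
# Assumes stride 1, no padding or dilation, one example, one input/output channel.
import numpy as np

def im2col_simple(X, fr, fc):
    in_r, in_c = X.shape
    out_r, out_c = in_r - fr + 1, in_c - fc + 1
    cols = np.empty((fr * fc, out_r * out_c))
    for i in range(out_r):
        for j in range(out_c):
            cols[:, i * out_c + j] = X[i : i + fr, j : j + fc].ravel()
    return cols, (out_r, out_c)

X = np.random.randn(6, 6)
W = np.random.randn(3, 3)

cols, (out_r, out_c) = im2col_simple(X, 3, 3)
Y = (W.ravel() @ cols).reshape(out_r, out_c)  # conv as a single matrix product

# matches the naive sliding-window loop
Y_naive = np.array([[np.sum(X[i:i + 3, j:j + 3] * W) for j in range(out_c)]
                    for i in range(out_r)])
assert np.allclose(Y, Y_naive)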
X, W, and b\"\"\"\n", " W = self.parameters[\"W\"]\n", "\n", " d = self.dilation\n", " fr, fc, in_ch, out_ch = W.shape\n", " n_ex, out_rows, out_cols, out_ch = dLdy.shape\n", " (fr, fc), s, p = self.kernel_shape, self.stride, self.pad\n", "\n", " # columnize W, X, and dLdy\n", " dLdZ = dLdy * self.act_fn.grad(Z)\n", " dLdZ_col = dLdZ.transpose(3, 1, 2, 0).reshape(out_ch, -1)\n", " W_col = W.transpose(3, 2, 0, 1).reshape(out_ch, -1).T\n", " X_col, p = im2col(X, W.shape, p, s, d)\n", "\n", " # compute gradients via matrix multiplication and reshape\n", " dB = dLdZ_col.sum(axis=1).reshape(1, 1, 1, -1)\n", " dW = (dLdZ_col @ X_col.T).reshape(out_ch, in_ch, fr, fc).transpose(2, 3, 1, 0)\n", "\n", " # reshape columnized dX back into the same format as the input volume\n", " dX_col = W_col @ dLdZ_col\n", " dX = col2im(dX_col, X.shape, W.shape, p, s, d).transpose(0, 2, 3, 1)\n", "\n", " return dX, dW, dB\n", "\n", " def _backward_naive(self, dLdy, retain_grads=True):\n", " \"\"\"\n", " A slower (ie., non-vectorized) but more straightforward implementation\n", " of the gradient computations for a 2D conv layer.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The gradient of the loss with respect to the layer output.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss with respect to the layer input volume.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdy, list):\n", " dLdy = [dLdy]\n", "\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", " Zs = self.derived_variables[\"Z\"]\n", "\n", " Xs, d = self.X, self.dilation\n", " (fr, fc), s, p = self.kernel_shape, self.stride, self.pad\n", "\n", " dXs = []\n", " for X, Z, dy in zip(Xs, Zs, dLdy):\n", " n_ex, out_rows, out_cols, out_ch = dy.shape\n", " X_pad, (pr1, pr2, pc1, pc2) = pad2D(X, p, self.kernel_shape, s, d)\n", "\n", " dZ = dLdy * self.act_fn.grad(Z)\n", "\n", " dX = np.zeros_like(X_pad)\n", " dW, dB = np.zeros_like(W), np.zeros_like(b)\n", " for m in range(n_ex):\n", " for i in range(out_rows):\n", " for j in range(out_cols):\n", " for c in range(out_ch):\n", " # compute window boundaries w. stride and dilation\n", " i0, i1 = i * s, (i * s) + fr * (d + 1) - d\n", " j0, j1 = j * s, (j * s) + fc * (d + 1) - d\n", "\n", " wc = W[:, :, :, c]\n", " kernel = dZ[m, i, j, c]\n", " window = X_pad[m, i0 : i1 : (d + 1), j0 : j1 : (d + 1), :]\n", "\n", " dB[:, :, :, c] += kernel\n", " dW[:, :, :, c] += window * kernel\n", " dX[m, i0 : i1 : (d + 1), j0 : j1 : (d + 1), :] += (\n", " wc * kernel\n", " )\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dW\n", " self.gradients[\"b\"] += dB\n", "\n", " pr2 = None if pr2 == 0 else -pr2\n", " pc2 = None if pc2 == 0 else -pc2\n", " dXs.append(dX[:, pr1:pr2, pc1:pc2, :])\n", " return dXs[0] if len(Xs) == 1 else dXs\n", "\n", "\n", "class Pool2D(LayerBase):\n", " def __init__(self, kernel_shape, stride=1, pad=0, mode=\"max\", optimizer=None):\n", " \"\"\"\n", " A single two-dimensional pooling layer.\n", "\n", " Parameters\n", " ----------\n", " kernel_shape : 2-tuple\n", " The dimension of a single 2D filter/kernel in the current layer\n", " stride : int\n", " The stride/hop of the convolution kernels as they move over the\n", " input volume. Default is 1.\n", " pad : int, tuple, or 'same'\n", " The number of rows/columns of 0's to pad the input. 
Default is 0.\n", " mode : {\"max\", \"average\"}\n", " The pooling function to apply.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. Default is None.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.pad = pad\n", " self.mode = mode\n", " self.in_ch = None\n", " self.out_ch = None\n", " self.stride = stride\n", " self.kernel_shape = kernel_shape\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self.derived_variables = {\"out_rows\": [], \"out_cols\": []}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Pool2D\",\n", " \"act_fn\": None,\n", " \"pad\": self.pad,\n", " \"mode\": self.mode,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"stride\": self.stride,\n", " \"kernel_shape\": self.kernel_shape,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output given input volume `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The input volume consisting of `n_ex` examples, each with dimension\n", " (`in_rows`,`in_cols`, `in_ch`)\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The layer output.\n", " \"\"\" # noqa: E501\n", " if not self.is_initialized:\n", " self.in_ch = self.out_ch = X.shape[3]\n", " self._init_params()\n", "\n", " n_ex, in_rows, in_cols, nc_in = X.shape\n", " (fr, fc), s, p = self.kernel_shape, self.stride, self.pad\n", " X_pad, (pr1, pr2, pc1, pc2) = pad2D(X, p, self.kernel_shape, s)\n", "\n", " out_rows = np.floor(1 + (in_rows + pr1 + pr2 - fr) / s).astype(int)\n", " out_cols = np.floor(1 + (in_cols + pc1 + pc2 - fc) / s).astype(int)\n", "\n", " if self.mode == \"max\":\n", " pool_fn = np.max\n", " elif self.mode == \"average\":\n", " pool_fn = np.mean\n", "\n", " Y = np.zeros((n_ex, out_rows, out_cols, self.out_ch))\n", " for m in range(n_ex):\n", " for i in range(out_rows):\n", " for j in range(out_cols):\n", " for c in range(self.out_ch):\n", " # calculate window boundaries, incorporating stride\n", " i0, i1 = i * s, (i * s) + fr\n", " j0, j1 = j * s, (j * s) + fc\n", "\n", " xi = X_pad[m, i0:i1, j0:j1, c]\n", " Y[m, i, j, c] = pool_fn(xi)\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"out_rows\"].append(out_rows)\n", " self.derived_variables[\"out_cols\"].append(out_cols)\n", "\n", " return Y\n", "\n", " def backward(self, dLdY, retain_grads=True):\n", " \"\"\"\n", " Backprop from layer outputs to inputs\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss wrt. 
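# --- Illustrative sketch (not part of the class): 2x2 max pooling with stride 2 on a
# single channel, plus the "single True entry" argmax mask that the backward pass uses
# to route gradients. All names here are ad hoc for the example.
import numpy as np

X = np.array([[1., 3., 2., 0.],
              [4., 2., 1., 5.],
              [0., 1., 7., 2.],
              [3., 2., 0., 1.]])
fr = fc = s = 2
out_r, out_c = X.shape[0] // s, X.shape[1] // s

Y = np.zeros((out_r, out_c))
mask = np.zeros_like(X, dtype=bool)              # records which input produced each max
for i in range(out_r):
    for j in range(out_c):
        win = X[i * s : i * s + fr, j * s : j * s + fc]
        Y[i, j] = win.max()
        r, c = np.argwhere(win == win.max())[0]  # first max only, as in `backward`
        mask[i * s + r, j * s + c] = True

print(Y)  # [[4. 5.], [3. 7.]]
# during backprop, dL/dY[i, j] is copied only to the masked position of each window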
the layer output `Y`.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss wrt. the layer input `X`.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdY, list):\n", " dLdY = [dLdY]\n", "\n", " Xs = self.X\n", " out_rows = self.derived_variables[\"out_rows\"]\n", " out_cols = self.derived_variables[\"out_cols\"]\n", "\n", " (fr, fc), s, p = self.kernel_shape, self.stride, self.pad\n", "\n", " dXs = []\n", " for X, dy, out_row, out_col in zip(Xs, dLdY, out_rows, out_cols):\n", " n_ex, in_rows, in_cols, nc_in = X.shape\n", " X_pad, (pr1, pr2, pc1, pc2) = pad2D(X, p, self.kernel_shape, s)\n", "\n", " dX = np.zeros_like(X_pad)\n", " for m in range(n_ex):\n", " for i in range(out_row):\n", " for j in range(out_col):\n", " for c in range(self.out_ch):\n", " # calculate window boundaries, incorporating stride\n", " i0, i1 = i * s, (i * s) + fr\n", " j0, j1 = j * s, (j * s) + fc\n", "\n", " if self.mode == \"max\":\n", " xi = X[m, i0:i1, j0:j1, c]\n", "\n", " # enforce that the mask can only consist of a\n", " # single `True` entry, even if multiple entries in\n", " # xi are equal to max(xi)\n", " mask = np.zeros_like(xi).astype(bool)\n", " x, y = np.argwhere(xi == np.max(xi))[0]\n", " mask[x, y] = True\n", "\n", " dX[m, i0:i1, j0:j1, c] += mask * dy[m, i, j, c]\n", " elif self.mode == \"average\":\n", " frame = np.ones((fr, fc)) * dy[m, i, j, c]\n", " dX[m, i0:i1, j0:j1, c] += frame / np.prod((fr, fc))\n", "\n", " pr2 = None if pr2 == 0 else -pr2\n", " pc2 = None if pc2 == 0 else -pc2\n", " dXs.append(dX[:, pr1:pr2, pc1:pc2, :])\n", " return dXs[0] if len(Xs) == 1 else dXs\n", "\n", "\n", "class Deconv2D(LayerBase):\n", " def __init__(\n", " self,\n", " out_ch,\n", " kernel_shape,\n", " pad=0,\n", " stride=1,\n", " act_fn=None,\n", " optimizer=None,\n", " init=\"glorot_uniform\",\n", " ):\n", " \"\"\"\n", " Apply a two-dimensional \"deconvolution\" to an input volume.\n", "\n", " Notes\n", " -----\n", " The term \"deconvolution\" in this context does not correspond with the\n", " deconvolution operation in mathematics. More accurately, this layer is\n", " computing a transposed convolution / fractionally-strided convolution.\n", "\n", " Parameters\n", " ----------\n", " out_ch : int\n", " The number of filters/kernels to compute in the current layer\n", " kernel_shape : 2-tuple\n", " The dimension of a single 2D filter/kernel in the current layer\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``Y[t]``. If None, use\n", " :class:`~numpy_ml.neural_nets.activations.Affine`\n", " activations by default. Default is None.\n", " pad : int, tuple, or 'same'\n", " The number of rows/columns to zero-pad the input with. Default is 0.\n", " stride : int\n", " The stride/hop of the convolution kernels as they move over the\n", " input volume. Default is 1.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. 
Default is None.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.pad = pad\n", " self.init = init\n", " self.in_ch = None\n", " self.stride = stride\n", " self.out_ch = out_ch\n", " self.kernel_shape = kernel_shape\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.parameters = {\"W\": None, \"b\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " fr, fc = self.kernel_shape\n", " W = init_weights((fr, fc, self.in_ch, self.out_ch))\n", " b = np.zeros((1, 1, 1, self.out_ch))\n", "\n", " self.parameters = {\"W\": W, \"b\": b}\n", " self.gradients = {\"W\": np.zeros_like(W), \"b\": np.zeros_like(b)}\n", " self.derived_variables = {\"Z\": [], \"out_rows\": [], \"out_cols\": []}\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"Deconv2D\",\n", " \"pad\": self.pad,\n", " \"init\": self.init,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"stride\": self.stride,\n", " \"act_fn\": str(self.act_fn),\n", " \"kernel_shape\": self.kernel_shape,\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output given input volume `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The input volume consisting of `n_ex` examples, each with dimension\n", " (`in_rows`, `in_cols`, `in_ch`).\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The layer output.\n", " \"\"\" # noqa: E501\n", " if not self.is_initialized:\n", " self.in_ch = X.shape[3]\n", " self._init_params()\n", "\n", " W = self.parameters[\"W\"]\n", " b = self.parameters[\"b\"]\n", "\n", " s, p = self.stride, self.pad\n", " n_ex, in_rows, in_cols, in_ch = X.shape\n", "\n", " # pad the input and perform the forward deconvolution\n", " Z = deconv2D_naive(X, W, s, p, 0) + b\n", " Y = self.act_fn(Z)\n", "\n", " if retain_derived:\n", " self.X.append(X)\n", " self.derived_variables[\"Z\"].append(Z)\n", " self.derived_variables[\"out_rows\"].append(Z.shape[1])\n", " self.derived_variables[\"out_cols\"].append(Z.shape[2])\n", "\n", " return Y\n", "\n", " def backward(self, dLdY, retain_grads=True):\n", " \"\"\"\n", " Compute the gradient of the loss with respect to the layer parameters.\n", "\n", " Notes\n", " -----\n", " Relies on :meth:`~numpy_ml.neural_nets.utils.im2col` and\n", " :meth:`~numpy_ml.neural_nets.utils.col2im` to vectorize the\n", " gradient calculations.\n", "\n", " Parameters\n", " ----------\n", " dLdY : :py:class:`ndarray ` of shape (`n_ex, out_rows, out_cols, out_ch`)\n", " The gradient of the loss with respect to the layer output.\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. 
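# --- Illustrative note (hedged): for a transposed ("fractionally strided") convolution,
# the output size runs the Conv2D shape equation in reverse:
#     out = stride * (in - 1) + filter - pad_total
# The helper below is hypothetical and only restates that relation; the layer itself
# realizes the operation by dilating the input and running an ordinary convolution.
def deconv2d_out_dim(in_dim, filter_dim, pad_total, stride):
    return stride * (in_dim - 1) + filter_dim - pad_total

# a 4x4 map, 3x3 kernel, no padding, stride 2 -> 9x9 output
assert deconv2d_out_dim(4, 3, pad_total=0, stride=2) == 9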
Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape (`n_ex, in_rows, in_cols, in_ch`)\n", " The gradient of the loss with respect to the layer input volume.\n", " \"\"\" # noqa: E501\n", " assert self.trainable, \"Layer is frozen\"\n", " if not isinstance(dLdY, list):\n", " dLdY = [dLdY]\n", "\n", " dX = []\n", " X, Z = self.X, self.derived_variables[\"Z\"]\n", "\n", " for dy, x, z in zip(dLdY, X, Z):\n", " dx, dw, db = self._bwd(dy, x, z)\n", " dX.append(dx)\n", "\n", " if retain_grads:\n", " self.gradients[\"W\"] += dw\n", " self.gradients[\"b\"] += db\n", "\n", " return dX[0] if len(X) == 1 else dX\n", "\n", " def _bwd(self, dLdY, X, Z):\n", " \"\"\"Actual computation of gradient of the loss wrt. X, W, and b\"\"\"\n", " W = np.rot90(self.parameters[\"W\"], 2)\n", "\n", " s = self.stride\n", " if self.stride > 1:\n", " X = dilate(X, s - 1)\n", " s = 1\n", "\n", " fr, fc, in_ch, out_ch = W.shape\n", " (fr, fc), p = self.kernel_shape, self.pad\n", " n_ex, out_rows, out_cols, out_ch = dLdY.shape\n", "\n", " # pad X the first time\n", " X_pad, p = pad2D(X, p, W.shape[:2], s)\n", " n_ex, in_rows, in_cols, in_ch = X_pad.shape\n", " pr1, pr2, pc1, pc2 = p\n", "\n", " # compute additional padding to produce the deconvolution\n", " out_rows = s * (in_rows - 1) - pr1 - pr2 + fr\n", " out_cols = s * (in_cols - 1) - pc1 - pc2 + fc\n", " out_dim = (out_rows, out_cols)\n", "\n", " # add additional \"deconvolution\" padding\n", " _p = calc_pad_dims_2D(X_pad.shape, out_dim, W.shape[:2], s, 0)\n", " X_pad, _ = pad2D(X_pad, _p, W.shape[:2], s)\n", "\n", " # columnize W, X, and dLdY\n", " dLdZ = dLdY * self.act_fn.grad(Z)\n", " dLdZ, _ = pad2D(dLdZ, p, W.shape[:2], s)\n", "\n", " dLdZ_col = dLdZ.transpose(3, 1, 2, 0).reshape(out_ch, -1)\n", " W_col = W.transpose(3, 2, 0, 1).reshape(out_ch, -1)\n", " X_col, _ = im2col(X_pad, W.shape, 0, s, 0)\n", "\n", " # compute gradients via matrix multiplication and reshape\n", " dB = dLdZ_col.sum(axis=1).reshape(1, 1, 1, -1)\n", " dW = (dLdZ_col @ X_col.T).reshape(out_ch, in_ch, fr, fc).transpose(2, 3, 1, 0)\n", " dW = np.rot90(dW, 2)\n", "\n", " # reshape columnized dX back into the same format as the input volume\n", " dX_col = W_col.T @ dLdZ_col\n", "\n", " total_pad = tuple(i + j for i, j in zip(p, _p))\n", " dX = col2im(dX_col, X.shape, W.shape, total_pad, s, 0).transpose(0, 2, 3, 1)\n", " dX = dX[:, :: self.stride, :: self.stride, :]\n", "\n", " return dX, dW, dB\n", "\n", "\n", "#######################################################################\n", "# Recurrent Layers #\n", "#######################################################################\n", "\n", "\n", "class RNNCell(LayerBase):\n", " def __init__(self, n_out, act_fn=\"Tanh\", init=\"glorot_uniform\", optimizer=None):\n", " r\"\"\"\n", " A single step of a vanilla (Elman) RNN.\n", "\n", " Notes\n", " -----\n", " At timestep `t`, the vanilla RNN cell computes\n", "\n", " .. 
math::\n", "\n", " \\mathbf{Z}^{(t)} &=\n", " \\mathbf{W}_{ax} \\mathbf{X}^{(t)} + \\mathbf{b}_{ax} +\n", " \\mathbf{W}_{aa} \\mathbf{A}^{(t-1)} + \\mathbf{b}_{aa} \\\\\n", " \\mathbf{A}^{(t)} &= f(\\mathbf{Z}^{(t)})\n", "\n", " where\n", "\n", " - :math:`\\mathbf{X}^{(t)}` is the input at time `t`\n", " - :math:`\\mathbf{A}^{(t)}` is the hidden state at timestep `t`\n", " - `f` is the layer activation function\n", " - :math:`\\mathbf{W}_{ax}` and :math:`\\mathbf{b}_{ax}` are the weights\n", " and bias for the input to hidden layer\n", " - :math:`\\mathbf{W}_{aa}` and :math:`\\mathbf{b}_{aa}` are the weights\n", " and biases for the hidden to hidden layer\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimension of a single hidden state / output on a given timestep\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``A[t]``. Default is `'Tanh'`.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with default\n", " parameters. Default is None.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.init = init\n", " self.n_in = None\n", " self.n_out = n_out\n", " self.n_timesteps = None\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.parameters = {\"Waa\": None, \"Wax\": None, \"ba\": None, \"bx\": None}\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self.X = []\n", " init_weights = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " Wax = init_weights((self.n_in, self.n_out))\n", " Waa = init_weights((self.n_out, self.n_out))\n", " ba = np.zeros((self.n_out, 1))\n", " bx = np.zeros((self.n_out, 1))\n", "\n", " self.parameters = {\"Waa\": Waa, \"Wax\": Wax, \"ba\": ba, \"bx\": bx}\n", "\n", " self.gradients = {\n", " \"Waa\": np.zeros_like(Waa),\n", " \"Wax\": np.zeros_like(Wax),\n", " \"ba\": np.zeros_like(ba),\n", " \"bx\": np.zeros_like(bx),\n", " }\n", "\n", " self.derived_variables = {\n", " \"A\": [],\n", " \"Z\": [],\n", " \"n_timesteps\": 0,\n", " \"current_step\": 0,\n", " \"dLdA_accumulator\": None,\n", " }\n", "\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"RNNCell\",\n", " \"init\": self.init,\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_out,\n", " \"act_fn\": str(self.act_fn),\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, Xt):\n", " \"\"\"\n", " Compute the network output for a single timestep.\n", "\n", " Parameters\n", " ----------\n", " Xt : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Input at timestep `t` consisting of `n_ex` examples each of\n", " dimensionality `n_in`.\n", "\n", " Returns\n", " -------\n", " At: :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " The value of the hidden state at timestep `t` for each of the\n", " `n_ex` examples.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = Xt.shape[1]\n", " self._init_params()\n", "\n", " # increment timestep\n", " self.derived_variables[\"n_timesteps\"] += 1\n", " 
self.derived_variables[\"current_step\"] += 1\n", "\n", " # Retrieve parameters\n", " ba = self.parameters[\"ba\"]\n", " bx = self.parameters[\"bx\"]\n", " Wax = self.parameters[\"Wax\"]\n", " Waa = self.parameters[\"Waa\"]\n", "\n", " # initialize the hidden state to zero\n", " As = self.derived_variables[\"A\"]\n", " if len(As) == 0:\n", " n_ex, n_in = Xt.shape\n", " A0 = np.zeros((n_ex, self.n_out))\n", " As.append(A0)\n", "\n", " # compute next hidden state\n", " Zt = As[-1] @ Waa + ba.T + Xt @ Wax + bx.T\n", " At = self.act_fn(Zt)\n", "\n", " self.derived_variables[\"Z\"].append(Zt)\n", " self.derived_variables[\"A\"].append(At)\n", "\n", " # store intermediate variables\n", " self.X.append(Xt)\n", " return At\n", "\n", " def backward(self, dLdAt):\n", " \"\"\"\n", " Backprop for a single timestep.\n", "\n", " Parameters\n", " ----------\n", " dLdAt : :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " The gradient of the loss wrt. the layer outputs (ie., hidden\n", " states) at timestep `t`.\n", "\n", " Returns\n", " -------\n", " dLdXt : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. the layer inputs at timestep `t`.\n", " \"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", "\n", " # decrement current step\n", " self.derived_variables[\"current_step\"] -= 1\n", "\n", " # extract context variables\n", " Zs = self.derived_variables[\"Z\"]\n", " As = self.derived_variables[\"A\"]\n", " t = self.derived_variables[\"current_step\"]\n", " dA_acc = self.derived_variables[\"dLdA_accumulator\"]\n", "\n", " # initialize accumulator\n", " if dA_acc is None:\n", " dA_acc = np.zeros_like(As[0])\n", "\n", " # get network weights for gradient calcs\n", " Wax = self.parameters[\"Wax\"]\n", " Waa = self.parameters[\"Waa\"]\n", "\n", " # compute gradient components at timestep t\n", " dA = dLdAt + dA_acc\n", " dZ = self.act_fn.grad(Zs[t]) * dA\n", " dXt = dZ @ Wax.T\n", "\n", " # update parameter gradients with signal from current step\n", " self.gradients[\"Waa\"] += As[t].T @ dZ\n", " self.gradients[\"Wax\"] += self.X[t].T @ dZ\n", " self.gradients[\"ba\"] += dZ.sum(axis=0, keepdims=True).T\n", " self.gradients[\"bx\"] += dZ.sum(axis=0, keepdims=True).T\n", "\n", " # update accumulator variable for hidden state\n", " self.derived_variables[\"dLdA_accumulator\"] = dZ @ Waa.T\n", " return dXt\n", "\n", " def flush_gradients(self):\n", " \"\"\"Erase all the layer's derived variables and gradients.\"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", "\n", " self.X = []\n", " for k, v in self.derived_variables.items():\n", " self.derived_variables[k] = []\n", "\n", " self.derived_variables[\"n_timesteps\"] = 0\n", " self.derived_variables[\"current_step\"] = 0\n", "\n", " # reset parameter gradients to 0\n", " for k, v in self.parameters.items():\n", " self.gradients[k] = np.zeros_like(v)\n", "\n", "\n", "class LSTMCell(LayerBase):\n", " def __init__(\n", " self,\n", " n_out,\n", " act_fn=\"Tanh\",\n", " gate_fn=\"Sigmoid\",\n", " init=\"glorot_uniform\",\n", " optimizer=None,\n", " ):\n", " \"\"\"\n", " A single step of a long short-term memory (LSTM) RNN.\n", "\n", " Notes\n", " -----\n", " Notation:\n", "\n", " - ``Z[t]`` is the input to each of the gates at timestep `t`\n", " - ``A[t]`` is the value of the hidden state at timestep `t`\n", " - ``Cc[t]`` is the value of the *candidate* cell/memory state at timestep `t`\n", " - ``C[t]`` is the value of the *final* cell/memory state at timestep `t`\n", " - ``Gf[t]`` is the output of the forget gate at 
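# --- Illustrative sketch (standalone, not using the class): one Elman RNN step with the
# same row-vector convention as RNNCell.forward above. Shapes: Xt is (n_ex, n_in),
# A_prev is (n_ex, n_out). The weight values here are random placeholders.
import numpy as np

rng = np.random.RandomState(0)
n_ex, n_in, n_out = 5, 3, 4

Xt = rng.randn(n_ex, n_in)
A_prev = np.zeros((n_ex, n_out))            # initial hidden state

Wax, Waa = rng.randn(n_in, n_out), rng.randn(n_out, n_out)
bx, ba = np.zeros((n_out, 1)), np.zeros((n_out, 1))

Zt = A_prev @ Waa + ba.T + Xt @ Wax + bx.T  # (n_ex, n_out)
At = np.tanh(Zt)                            # act_fn defaults to Tanh
assert At.shape == (n_ex, n_out)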
timestep `t`\n", " - ``Gu[t]`` is the output of the update gate at timestep `t`\n", " - ``Go[t]`` is the output of the output gate at timestep `t`\n", "\n", " Equations::\n", "\n", " Z[t] = stack([A[t-1], X[t]])\n", " Gf[t] = gate_fn(Wf @ Z[t] + bf)\n", " Gu[t] = gate_fn(Wu @ Z[t] + bu)\n", " Go[t] = gate_fn(Wo @ Z[t] + bo)\n", " Cc[t] = act_fn(Wc @ Z[t] + bc)\n", " C[t] = Gf[t] * C[t-1] + Gu[t] * Cc[t]\n", " A[t] = Go[t] * act_fn(C[t])\n", "\n", " where `@` indicates dot/matrix product, and '*' indicates elementwise\n", " multiplication.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimension of a single hidden state / output on a given timestep.\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``A[t]``. Default is\n", " `'Tanh'`.\n", " gate_fn : str, :doc:`Activation ` object, or None\n", " The gate function for computing the update, forget, and output\n", " gates. Default is `'Sigmoid'`.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with default\n", " parameters. Default is None.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.init = init\n", " self.n_in = None\n", " self.n_out = n_out\n", " self.n_timesteps = None\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.gate_fn = ActivationInitializer(gate_fn)()\n", " self.parameters = {\n", " \"Wf\": None,\n", " \"Wu\": None,\n", " \"Wc\": None,\n", " \"Wo\": None,\n", " \"bf\": None,\n", " \"bu\": None,\n", " \"bc\": None,\n", " \"bo\": None,\n", " }\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self.X = []\n", " init_weights_gate = WeightInitializer(str(self.gate_fn), mode=self.init)\n", " init_weights_act = WeightInitializer(str(self.act_fn), mode=self.init)\n", "\n", " Wf = init_weights_gate((self.n_in + self.n_out, self.n_out))\n", " Wu = init_weights_gate((self.n_in + self.n_out, self.n_out))\n", " Wc = init_weights_act((self.n_in + self.n_out, self.n_out))\n", " Wo = init_weights_gate((self.n_in + self.n_out, self.n_out))\n", "\n", " bf = np.zeros((1, self.n_out))\n", " bu = np.zeros((1, self.n_out))\n", " bc = np.zeros((1, self.n_out))\n", " bo = np.zeros((1, self.n_out))\n", "\n", " self.parameters = {\n", " \"Wf\": Wf,\n", " \"Wu\": Wu,\n", " \"Wc\": Wc,\n", " \"Wo\": Wo,\n", " \"bf\": bf,\n", " \"bu\": bu,\n", " \"bc\": bc,\n", " \"bo\": bo,\n", " }\n", "\n", " self.gradients = {\n", " \"Wf\": np.zeros_like(Wf),\n", " \"Wu\": np.zeros_like(Wu),\n", " \"Wc\": np.zeros_like(Wc),\n", " \"Wo\": np.zeros_like(Wo),\n", " \"bf\": np.zeros_like(bf),\n", " \"bu\": np.zeros_like(bu),\n", " \"bc\": np.zeros_like(bc),\n", " \"bo\": np.zeros_like(bo),\n", " }\n", "\n", " self.derived_variables = {\n", " \"C\": [],\n", " \"A\": [],\n", " \"Gf\": [],\n", " \"Gu\": [],\n", " \"Go\": [],\n", " \"Gc\": [],\n", " \"Cc\": [],\n", " \"n_timesteps\": 0,\n", " \"current_step\": 0,\n", " \"dLdA_accumulator\": None,\n", " \"dLdC_accumulator\": None,\n", " }\n", "\n", " self.is_initialized = True\n", "\n", " def _get_params(self):\n", " Wf = self.parameters[\"Wf\"]\n", " Wu = self.parameters[\"Wu\"]\n", " Wc = self.parameters[\"Wc\"]\n", " Wo = self.parameters[\"Wo\"]\n", " bf = 
self.parameters[\"bf\"]\n", " bu = self.parameters[\"bu\"]\n", " bc = self.parameters[\"bc\"]\n", " bo = self.parameters[\"bo\"]\n", " return Wf, Wu, Wc, Wo, bf, bu, bc, bo\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"LSTMCell\",\n", " \"init\": self.init,\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_out,\n", " \"act_fn\": str(self.act_fn),\n", " \"gate_fn\": str(self.gate_fn),\n", " \"optimizer\": {\n", " \"cache\": self.optimizer.cache,\n", " \"hyperparameters\": self.optimizer.hyperparameters,\n", " },\n", " }\n", "\n", " def forward(self, Xt):\n", " \"\"\"\n", " Compute the layer output for a single timestep.\n", "\n", " Parameters\n", " ----------\n", " Xt : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Input at timestep t consisting of `n_ex` examples each of\n", " dimensionality `n_in`.\n", "\n", " Returns\n", " -------\n", " At: :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " The value of the hidden state at timestep `t` for each of the `n_ex`\n", " examples.\n", " Ct: :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " The value of the cell/memory state at timestep `t` for each of the\n", " `n_ex` examples.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = Xt.shape[1]\n", " self._init_params()\n", "\n", " Wf, Wu, Wc, Wo, bf, bu, bc, bo = self._get_params()\n", "\n", " self.derived_variables[\"n_timesteps\"] += 1\n", " self.derived_variables[\"current_step\"] += 1\n", "\n", " if len(self.derived_variables[\"A\"]) == 0:\n", " n_ex, n_in = Xt.shape\n", " init = np.zeros((n_ex, self.n_out))\n", " self.derived_variables[\"A\"].append(init)\n", " self.derived_variables[\"C\"].append(init)\n", "\n", " A_prev = self.derived_variables[\"A\"][-1]\n", " C_prev = self.derived_variables[\"C\"][-1]\n", "\n", " # concatenate A_prev and Xt to create Zt\n", " Zt = np.hstack([A_prev, Xt])\n", "\n", " Gft = self.gate_fn(Zt @ Wf + bf)\n", " Gut = self.gate_fn(Zt @ Wu + bu)\n", " Got = self.gate_fn(Zt @ Wo + bo)\n", " Cct = self.act_fn(Zt @ Wc + bc)\n", " Ct = Gft * C_prev + Gut * Cct\n", " At = Got * self.act_fn(Ct)\n", "\n", " # bookkeeping\n", " self.X.append(Xt)\n", " self.derived_variables[\"A\"].append(At)\n", " self.derived_variables[\"C\"].append(Ct)\n", " self.derived_variables[\"Gf\"].append(Gft)\n", " self.derived_variables[\"Gu\"].append(Gut)\n", " self.derived_variables[\"Go\"].append(Got)\n", " self.derived_variables[\"Cc\"].append(Cct)\n", " return At, Ct\n", "\n", " def backward(self, dLdAt):\n", " \"\"\"\n", " Backprop for a single timestep.\n", "\n", " Parameters\n", " ----------\n", " dLdAt : :py:class:`ndarray ` of shape `(n_ex, n_out)`\n", " The gradient of the loss wrt. the layer outputs (ie., hidden\n", " states) at timestep `t`.\n", "\n", " Returns\n", " -------\n", " dLdXt : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " The gradient of the loss wrt. 
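# --- Illustrative sketch (standalone): one LSTM step following the gate equations in the
# class docstring, with Zt = hstack([A_prev, Xt]) as in `forward`. Weights are random
# placeholders, and `sigmoid` is defined inline rather than taken from the library.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
n_ex, n_in, n_out = 5, 3, 4

Xt = rng.randn(n_ex, n_in)
A_prev = np.zeros((n_ex, n_out))
C_prev = np.zeros((n_ex, n_out))
Zt = np.hstack([A_prev, Xt])               # (n_ex, n_out + n_in)

Wf, Wu, Wc, Wo = (rng.randn(n_in + n_out, n_out) for _ in range(4))
bf = bu = bc = bo = np.zeros((1, n_out))

Gf, Gu, Go = sigmoid(Zt @ Wf + bf), sigmoid(Zt @ Wu + bu), sigmoid(Zt @ Wo + bo)
Cc = np.tanh(Zt @ Wc + bc)                 # candidate cell state
Ct = Gf * C_prev + Gu * Cc                 # new cell state
At = Go * np.tanh(Ct)                      # new hidden state
assert At.shape == Ct.shape == (n_ex, n_out)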
the layer inputs at timestep `t`.\n", " \"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", "\n", " Wf, Wu, Wc, Wo, bf, bu, bc, bo = self._get_params()\n", "\n", " self.derived_variables[\"current_step\"] -= 1\n", " t = self.derived_variables[\"current_step\"]\n", "\n", " Got = self.derived_variables[\"Go\"][t]\n", " Gft = self.derived_variables[\"Gf\"][t]\n", " Gut = self.derived_variables[\"Gu\"][t]\n", " Cct = self.derived_variables[\"Cc\"][t]\n", " At = self.derived_variables[\"A\"][t + 1]\n", " Ct = self.derived_variables[\"C\"][t + 1]\n", " C_prev = self.derived_variables[\"C\"][t]\n", " A_prev = self.derived_variables[\"A\"][t]\n", "\n", " Xt = self.X[t]\n", " Zt = np.hstack([A_prev, Xt])\n", "\n", " dA_acc = self.derived_variables[\"dLdA_accumulator\"]\n", " dC_acc = self.derived_variables[\"dLdC_accumulator\"]\n", "\n", " # initialize accumulators\n", " if dA_acc is None:\n", " dA_acc = np.zeros_like(At)\n", "\n", " if dC_acc is None:\n", " dC_acc = np.zeros_like(Ct)\n", "\n", " # Gradient calculations\n", " # ---------------------\n", "\n", " dA = dLdAt + dA_acc\n", " dC = dC_acc + dA * Got * self.act_fn.grad(Ct)\n", "\n", " # compute the input to the gate functions at timestep t\n", " _Go = Zt @ Wo + bo\n", " _Gf = Zt @ Wf + bf\n", " _Gu = Zt @ Wu + bu\n", " _Gc = Zt @ Wc + bc\n", "\n", " # compute gradients wrt the *input* to each gate\n", " dGot = dA * self.act_fn(Ct) * self.gate_fn.grad(_Go)\n", " dCct = dC * Gut * self.act_fn.grad(_Gc)\n", " dGut = dC * Cct * self.gate_fn.grad(_Gu)\n", " dGft = dC * C_prev * self.gate_fn.grad(_Gf)\n", "\n", " dZ = dGft @ Wf.T + dGut @ Wu.T + dCct @ Wc.T + dGot @ Wo.T\n", " dXt = dZ[:, self.n_out :]\n", "\n", " self.gradients[\"Wc\"] += Zt.T @ dCct\n", " self.gradients[\"Wu\"] += Zt.T @ dGut\n", " self.gradients[\"Wf\"] += Zt.T @ dGft\n", " self.gradients[\"Wo\"] += Zt.T @ dGot\n", " self.gradients[\"bo\"] += dGot.sum(axis=0, keepdims=True)\n", " self.gradients[\"bu\"] += dGut.sum(axis=0, keepdims=True)\n", " self.gradients[\"bf\"] += dGft.sum(axis=0, keepdims=True)\n", " self.gradients[\"bc\"] += dCct.sum(axis=0, keepdims=True)\n", "\n", " self.derived_variables[\"dLdA_accumulator\"] = dZ[:, : self.n_out]\n", " self.derived_variables[\"dLdC_accumulator\"] = Gft * dC\n", " return dXt\n", "\n", " def flush_gradients(self):\n", " \"\"\"Erase all the layer's derived variables and gradients.\"\"\"\n", " assert self.trainable, \"Layer is frozen\"\n", "\n", " self.X = []\n", " for k, v in self.derived_variables.items():\n", " self.derived_variables[k] = []\n", "\n", " self.derived_variables[\"n_timesteps\"] = 0\n", " self.derived_variables[\"current_step\"] = 0\n", "\n", " # reset parameter gradients to 0\n", " for k, v in self.parameters.items():\n", " self.gradients[k] = np.zeros_like(v)\n", "\n", "\n", "class RNN(LayerBase):\n", " def __init__(self, n_out, act_fn=\"Tanh\", init=\"glorot_uniform\", optimizer=None):\n", " \"\"\"\n", " A single vanilla (Elman)-RNN layer.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimension of a single hidden state / output on a given\n", " timestep.\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``A[t]``. Default is\n", " `'Tanh'`.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. 
Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with default\n", " parameters. Default is None.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.init = init\n", " self.n_in = None\n", " self.n_out = n_out\n", " self.n_timesteps = None\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self.cell = RNNCell(\n", " n_in=self.n_in,\n", " n_out=self.n_out,\n", " act_fn=self.act_fn,\n", " init=self.init,\n", " optimizer=self.optimizer,\n", " )\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"RNN\",\n", " \"init\": self.init,\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_out,\n", " \"act_fn\": str(self.act_fn),\n", " \"optimizer\": self.cell.hyperparameters[\"optimizer\"],\n", " }\n", "\n", " def forward(self, X):\n", " \"\"\"\n", " Run a forward pass across all timesteps in the input.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in, n_t)`\n", " Input consisting of `n_ex` examples each of dimensionality `n_in`\n", " and extending for `n_t` timesteps.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_out, n_t)`\n", " The value of the hidden state for each of the `n_ex` examples\n", " across each of the `n_t` timesteps.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = X.shape[1]\n", " self._init_params()\n", "\n", " Y = []\n", " n_ex, n_in, n_t = X.shape\n", " for t in range(n_t):\n", " yt = self.cell.forward(X[:, :, t])\n", " Y.append(yt)\n", " return np.dstack(Y)\n", "\n", " def backward(self, dLdA):\n", " \"\"\"\n", " Run a backward pass across all timesteps in the input.\n", "\n", " Parameters\n", " ----------\n", " dLdA : :py:class:`ndarray ` of shape `(n_ex, n_out, n_t)`\n", " The gradient of the loss with respect to the layer output for each\n", " of the `n_ex` examples across all `n_t` timesteps.\n", "\n", " Returns\n", " -------\n", " dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in, n_t)`\n", " The value of the hidden state for each of the `n_ex` examples\n", " across each of the `n_t` timesteps.\n", " \"\"\"\n", " assert self.cell.trainable, \"Layer is frozen\"\n", " dLdX = []\n", " n_ex, n_out, n_t = dLdA.shape\n", " for t in reversed(range(n_t)):\n", " dLdXt = self.cell.backward(dLdA[:, :, t])\n", " dLdX.insert(0, dLdXt)\n", " dLdX = np.dstack(dLdX)\n", " return dLdX\n", "\n", " @property\n", " def derived_variables(self):\n", " \"\"\"\n", " Return a dictionary containing any intermediate variables computed\n", " during the forward / backward passes.\n", " \"\"\"\n", " return self.cell.derived_variables\n", "\n", " @property\n", " def gradients(self):\n", " \"\"\"\n", " Return a dictionary of the gradients computed during the backward\n", " pass\n", " \"\"\"\n", " return self.cell.gradients\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"Return a dictionary of the current layer parameters\"\"\"\n", " return self.cell.parameters\n", "\n", " def set_params(self, summary_dict):\n", " \"\"\"\n", " Set the layer parameters from a dictionary of values.\n", "\n", " Parameters\n", " ----------\n", " summary_dict : dict\n", " A dictionary 
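# --- Illustrative note (hedged): the RNN layer expects time on the *last* axis, X of
# shape (n_ex, n_in, n_t), and unrolls one cell call per timestep. A minimal sketch of
# that unrolling pattern with a stand-in `step` function (not the library's RNNCell):
import numpy as np

def step(Xt, A_prev):                      # stand-in for RNNCell.forward
    return np.tanh(Xt.sum(axis=1, keepdims=True) + A_prev)

n_ex, n_in, n_t, n_out = 2, 3, 6, 1
X = np.random.randn(n_ex, n_in, n_t)

A, outputs = np.zeros((n_ex, n_out)), []
for t in range(n_t):
    A = step(X[:, :, t], A)                # one slice per timestep
    outputs.append(A)

Y = np.dstack(outputs)                     # (n_ex, n_out, n_t), as in `forward`
assert Y.shape == (n_ex, n_out, n_t)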
of layer parameters and hyperparameters. If a required\n", " parameter or hyperparameter is not included within `summary_dict`,\n", " this method will use the value in the current layer's\n", " :meth:`summary` method.\n", "\n", " Returns\n", " -------\n", " layer : :doc:`Layer ` object\n", " The newly-initialized layer.\n", " \"\"\"\n", " self = super().set_params(summary_dict)\n", " return self.cell.set_parameters(summary_dict)\n", "\n", " def freeze(self):\n", " \"\"\"\n", " Freeze the layer parameters at their current values so they can no\n", " longer be updated.\n", " \"\"\"\n", " self.cell.freeze()\n", "\n", " def unfreeze(self):\n", " \"\"\"Unfreeze the layer parameters so they can be updated.\"\"\"\n", " self.cell.unfreeze()\n", "\n", " def flush_gradients(self):\n", " \"\"\"Erase all the layer's derived variables and gradients.\"\"\"\n", " self.cell.flush_gradients()\n", "\n", " def update(self):\n", " \"\"\"\n", " Update the layer parameters using the accrued gradients and layer\n", " optimizer. Flush all gradients once the update is complete.\n", " \"\"\"\n", " self.cell.update()\n", " self.flush_gradients()\n", "\n", "\n", "class LSTM(LayerBase):\n", " def __init__(\n", " self,\n", " n_out,\n", " act_fn=\"Tanh\",\n", " gate_fn=\"Sigmoid\",\n", " init=\"glorot_uniform\",\n", " optimizer=None,\n", " ):\n", " \"\"\"\n", " A single long short-term memory (LSTM) RNN layer.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimension of a single hidden state / output on a given timestep.\n", " act_fn : str, :doc:`Activation ` object, or None\n", " The activation function for computing ``A[t]``. Default is `'Tanh'`.\n", " gate_fn : str, :doc:`Activation ` object, or None\n", " The gate function for computing the update, forget, and output\n", " gates. Default is `'Sigmoid'`.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is `'glorot_uniform'`.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the :class:`SGD\n", " ` optimizer with\n", " default parameters. 
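For reference, one LSTM timestep in the textbook parameterization this docstring describes: the update, forget, and output gates pass through ``gate_fn`` (sigmoid) and the candidate state through ``act_fn`` (tanh). This is only an illustrative sketch; ``LSTMCell`` may store its weights differently (e.g., as a single concatenated matrix)::

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    rng = np.random.RandomState(0)
    n_ex, n_in, n_out = 2, 3, 4

    # separate weight matrices per gate, acting on [A_prev, X_t]
    Wu, Wf, Wo, Wc = [rng.randn(n_in + n_out, n_out) * 0.1 for _ in range(4)]

    Xt = rng.randn(n_ex, n_in)
    A_prev = np.zeros((n_ex, n_out))   # previous hidden state
    C_prev = np.zeros((n_ex, n_out))   # previous cell state

    Z = np.hstack([A_prev, Xt])

    Gu = sigmoid(Z @ Wu)               # update gate
    Gf = sigmoid(Z @ Wf)               # forget gate
    Go = sigmoid(Z @ Wo)               # output gate
    C_hat = np.tanh(Z @ Wc)            # candidate cell state

    Ct = Gf * C_prev + Gu * C_hat      # new cell state
    At = Go * np.tanh(Ct)              # new hidden state, (n_ex, n_out)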
Default is None.\n", " \"\"\" # noqa: E501\n", " super().__init__(optimizer)\n", "\n", " self.init = init\n", " self.n_in = None\n", " self.n_out = n_out\n", " self.n_timesteps = None\n", " self.act_fn = ActivationInitializer(act_fn)()\n", " self.gate_fn = ActivationInitializer(gate_fn)()\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self.cell = LSTMCell(\n", " n_in=self.n_in,\n", " n_out=self.n_out,\n", " act_fn=self.act_fn,\n", " gate_fn=self.gate_fn,\n", " init=self.init,\n", " )\n", " self.is_initialized = True\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary containing the layer hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"LSTM\",\n", " \"init\": self.init,\n", " \"n_in\": self.n_in,\n", " \"n_out\": self.n_out,\n", " \"act_fn\": str(self.act_fn),\n", " \"gate_fn\": str(self.gate_fn),\n", " \"optimizer\": self.cell.hyperparameters[\"optimizer\"],\n", " }\n", "\n", " def forward(self, X):\n", " \"\"\"\n", " Run a forward pass across all timesteps in the input.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in, n_t)`\n", " Input consisting of `n_ex` examples each of dimensionality `n_in`\n", " and extending for `n_t` timesteps.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, n_out, n_t)`\n", " The value of the hidden state for each of the `n_ex` examples\n", " across each of the `n_t` timesteps.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self.n_in = X.shape[1]\n", " self._init_params()\n", "\n", " Y = []\n", " n_ex, n_in, n_t = X.shape\n", " for t in range(n_t):\n", " yt, _ = self.cell.forward(X[:, :, t])\n", " Y.append(yt)\n", " return np.dstack(Y)\n", "\n", " def backward(self, dLdA):\n", " \"\"\"\n", " Run a backward pass across all timesteps in the input.\n", "\n", " Parameters\n", " ----------\n", " dLdA : :py:class:`ndarray ` of shape `(n_ex, n_out, n_t)`\n", " The gradient of the loss with respect to the layer output for each\n", " of the `n_ex` examples across all `n_t` timesteps.\n", "\n", " Returns\n", " -------\n", " dLdX : :py:class:`ndarray ` of shape (`n_ex`, `n_in`, `n_t`)\n", " The value of the hidden state for each of the `n_ex` examples\n", " across each of the `n_t` timesteps.\n", " \"\"\" # noqa: E501\n", " assert self.cell.trainable, \"Layer is frozen\"\n", " dLdX = []\n", " n_ex, n_out, n_t = dLdA.shape\n", " for t in reversed(range(n_t)):\n", " dLdXt, _ = self.cell.backward(dLdA[:, :, t])\n", " dLdX.insert(0, dLdXt)\n", " dLdX = np.dstack(dLdX)\n", " return dLdX\n", "\n", " @property\n", " def derived_variables(self):\n", " \"\"\"\n", " Return a dictionary containing any intermediate variables computed\n", " during the forward / backward passes.\n", " \"\"\"\n", " return self.cell.derived_variables\n", "\n", " @property\n", " def gradients(self):\n", " \"\"\"\n", " Return a dictionary of the gradients computed during the backward\n", " pass\n", " \"\"\"\n", " return self.cell.gradients\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"Return a dictionary of the current layer parameters\"\"\"\n", " return self.cell.parameters\n", "\n", " def freeze(self):\n", " \"\"\"\n", " Freeze the layer parameters at their current values so they can no\n", " longer be updated.\n", " \"\"\"\n", " self.cell.freeze()\n", "\n", " def unfreeze(self):\n", " \"\"\"Unfreeze the layer parameters so they can be updated.\"\"\"\n", " self.cell.unfreeze()\n", "\n", " def set_params(self, summary_dict):\n", " 
\"\"\"\n", " Set the layer parameters from a dictionary of values.\n", "\n", " Parameters\n", " ----------\n", " summary_dict : dict\n", " A dictionary of layer parameters and hyperparameters. If a required\n", " parameter or hyperparameter is not included within `summary_dict`,\n", " this method will use the value in the current layer's\n", " :meth:`summary` method.\n", "\n", " Returns\n", " -------\n", " layer : :doc:`Layer ` object\n", " The newly-initialized layer.\n", " \"\"\"\n", " self = super().set_params(summary_dict)\n", " return self.cell.set_parameters(summary_dict)\n", "\n", " def flush_gradients(self):\n", " \"\"\"Erase all the layer's derived variables and gradients.\"\"\"\n", " self.cell.flush_gradients()\n", "\n", " def update(self):\n", " \"\"\"\n", " Update the layer parameters using the accrued gradients and layer\n", " optimizer. Flush all gradients once the update is complete.\n", " \"\"\"\n", " self.cell.update()\n", " self.flush_gradients()\n"]} {"path": "numpy_ml/neural_nets/utils/__init__.py", "content": ["\"\"\"\n", "Common neural network-specific helper functions.\n", "\n", "The ``neural_nets.utils` module contains neural network-specific helper\n", "functions, primarily for dealing with CNNs.\n", "\"\"\"\n", "\n", "from .utils import *\n"]} {"path": "numpy_ml/neural_nets/utils/utils.py", "content": ["import numpy as np\n", "\n", "#######################################################################\n", "# Training Utils #\n", "#######################################################################\n", "\n", "\n", "def minibatch(X, batchsize=256, shuffle=True):\n", " \"\"\"\n", " Compute the minibatch indices for a training dataset.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, \\*)`\n", " The dataset to divide into minibatches. Assumes the first dimension\n", " represents the number of training examples.\n", " batchsize : int\n", " The desired size of each minibatch. Note, however, that if ``X.shape[0] %\n", " batchsize > 0`` then the final batch will contain fewer than batchsize\n", " entries. Default is 256.\n", " shuffle : bool\n", " Whether to shuffle the entries in the dataset before dividing into\n", " minibatches. Default is True.\n", "\n", " Returns\n", " -------\n", " mb_generator : generator\n", " A generator which yields the indices into X for each batch\n", " n_batches: int\n", " The number of batches\n", " \"\"\"\n", " N = X.shape[0]\n", " ix = np.arange(N)\n", " n_batches = int(np.ceil(N / batchsize))\n", "\n", " if shuffle:\n", " np.random.shuffle(ix)\n", "\n", " def mb_generator():\n", " for i in range(n_batches):\n", " yield ix[i * batchsize : (i + 1) * batchsize]\n", "\n", " return mb_generator(), n_batches\n", "\n", "\n", "#######################################################################\n", "# Padding Utils #\n", "#######################################################################\n", "\n", "\n", "def calc_pad_dims_2D(X_shape, out_dim, kernel_shape, stride, dilation=0):\n", " \"\"\"\n", " Compute the padding necessary to ensure that convolving `X` with a 2D kernel\n", " of shape `kernel_shape` and stride `stride` produces outputs with dimension\n", " `out_dim`.\n", "\n", " Parameters\n", " ----------\n", " X_shape : tuple of `(n_ex, in_rows, in_cols, in_ch)`\n", " Dimensions of the input volume. 
Padding is applied to `in_rows` and\n", " `in_cols`.\n", " out_dim : tuple of `(out_rows, out_cols)`\n", " The desired dimension of an output example after applying the\n", " convolution.\n", " kernel_shape : 2-tuple\n", " The dimension of the 2D convolution kernel.\n", " stride : int\n", " The stride for the convolution kernel.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Default is 0.\n", "\n", " Returns\n", " -------\n", " padding_dims : 4-tuple\n", " Padding dims for `X`. Organized as (left, right, up, down)\n", " \"\"\"\n", " if not isinstance(X_shape, tuple):\n", " raise ValueError(\"`X_shape` must be of type tuple\")\n", "\n", " if not isinstance(out_dim, tuple):\n", " raise ValueError(\"`out_dim` must be of type tuple\")\n", "\n", " if not isinstance(kernel_shape, tuple):\n", " raise ValueError(\"`kernel_shape` must be of type tuple\")\n", "\n", " if not isinstance(stride, int):\n", " raise ValueError(\"`stride` must be of type int\")\n", "\n", " d = dilation\n", " fr, fc = kernel_shape\n", " out_rows, out_cols = out_dim\n", " n_ex, in_rows, in_cols, in_ch = X_shape\n", "\n", " # update effective filter shape based on dilation factor\n", " _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d\n", "\n", " pr = int((stride * (out_rows - 1) + _fr - in_rows) / 2)\n", " pc = int((stride * (out_cols - 1) + _fc - in_cols) / 2)\n", "\n", " out_rows1 = int(1 + (in_rows + 2 * pr - _fr) / stride)\n", " out_cols1 = int(1 + (in_cols + 2 * pc - _fc) / stride)\n", "\n", " # add asymmetric padding pixels to right / bottom\n", " pr1, pr2 = pr, pr\n", " if out_rows1 == out_rows - 1:\n", " pr1, pr2 = pr, pr + 1\n", " elif out_rows1 != out_rows:\n", " raise AssertionError\n", "\n", " pc1, pc2 = pc, pc\n", " if out_cols1 == out_cols - 1:\n", " pc1, pc2 = pc, pc + 1\n", " elif out_cols1 != out_cols:\n", " raise AssertionError\n", "\n", " if any(np.array([pr1, pr2, pc1, pc2]) < 0):\n", " raise ValueError(\n", " \"Padding cannot be less than 0. Got: {}\".format((pr1, pr2, pc1, pc2))\n", " )\n", " return (pr1, pr2, pc1, pc2)\n", "\n", "\n", "def calc_pad_dims_1D(X_shape, l_out, kernel_width, stride, dilation=0, causal=False):\n", " \"\"\"\n", " Compute the padding necessary to ensure that convolving `X` with a 1D kernel\n", " of shape `kernel_shape` and stride `stride` produces outputs with length\n", " `l_out`.\n", "\n", " Parameters\n", " ----------\n", " X_shape : tuple of `(n_ex, l_in, in_ch)`\n", " Dimensions of the input volume. Padding is applied on either side of\n", " `l_in`.\n", " l_out : int\n", " The desired length an output example after applying the convolution.\n", " kernel_width : int\n", " The width of the 1D convolution kernel.\n", " stride : int\n", " The stride for the convolution kernel.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Default is 0.\n", " causal : bool\n", " Whether to compute the padding dims for a regular or causal\n", " convolution. If causal, padding is added only to the left side of the\n", " sequence. Default is False.\n", "\n", " Returns\n", " -------\n", " padding_dims : 2-tuple\n", " Padding dims for X. 
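For example, a 'same' convolution with a 3x3 kernel and stride 1 needs one pixel of padding on every side, while an even kernel forces the extra pixel onto the trailing edge of each dimension::

    from numpy_ml.neural_nets.utils import calc_pad_dims_2D

    X_shape = (2, 32, 32, 3)        # (n_ex, in_rows, in_cols, in_ch)

    p = calc_pad_dims_2D(X_shape, out_dim=(32, 32), kernel_shape=(3, 3), stride=1)
    assert p == (1, 1, 1, 1)

    p = calc_pad_dims_2D(X_shape, out_dim=(32, 32), kernel_shape=(2, 2), stride=1)
    assert p == (0, 1, 0, 1)        # asymmetric padding for an even kernel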
Organized as (left, right)\n", " \"\"\"\n", " if not isinstance(X_shape, tuple):\n", " raise ValueError(\"`X_shape` must be of type tuple\")\n", "\n", " if not isinstance(l_out, int):\n", " raise ValueError(\"`l_out` must be of type int\")\n", "\n", " if not isinstance(kernel_width, int):\n", " raise ValueError(\"`kernel_width` must be of type int\")\n", "\n", " if not isinstance(stride, int):\n", " raise ValueError(\"`stride` must be of type int\")\n", "\n", " d = dilation\n", " fw = kernel_width\n", " n_ex, l_in, in_ch = X_shape\n", "\n", " # update effective filter shape based on dilation factor\n", " _fw = fw * (d + 1) - d\n", " total_pad = int((stride * (l_out - 1) + _fw - l_in))\n", "\n", " if not causal:\n", " pw = total_pad // 2\n", " l_out1 = int(1 + (l_in + 2 * pw - _fw) / stride)\n", "\n", " # add asymmetric padding pixels to right / bottom\n", " pw1, pw2 = pw, pw\n", " if l_out1 == l_out - 1:\n", " pw1, pw2 = pw, pw + 1\n", " elif l_out1 != l_out:\n", " raise AssertionError\n", "\n", " if causal:\n", " # if this is a causal convolution, only pad the left side of the\n", " # sequence\n", " pw1, pw2 = total_pad, 0\n", " l_out1 = int(1 + (l_in + total_pad - _fw) / stride)\n", " assert l_out1 == l_out\n", "\n", " if any(np.array([pw1, pw2]) < 0):\n", " raise ValueError(\"Padding cannot be less than 0. Got: {}\".format((pw1, pw2)))\n", " return (pw1, pw2)\n", "\n", "\n", "def pad1D(X, pad, kernel_width=None, stride=None, dilation=0):\n", " \"\"\"\n", " Zero-pad a 3D input volume `X` along the second dimension.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, l_in, in_ch)`\n", " Input volume. Padding is applied to `l_in`.\n", " pad : tuple, int, or {'same', 'causal'}\n", " The padding amount. If 'same', add padding to ensure that the output\n", " length of a 1D convolution with a kernel of `kernel_shape` and stride\n", " `stride` is the same as the input length. If 'causal' compute padding\n", " such that the output both has the same length as the input AND\n", " ``output[t]`` does not depend on ``input[t + 1:]``. If 2-tuple,\n", " specifies the number of padding columns to add on each side of the\n", " sequence.\n", " kernel_width : int\n", " The dimension of the 2D convolution kernel. Only relevant if p='same'\n", " or 'causal'. Default is None.\n", " stride : int\n", " The stride for the convolution kernel. Only relevant if p='same' or\n", " 'causal'. Default is None.\n", " dilation : int\n", " The dilation of the convolution kernel. Only relevant if p='same' or\n", " 'causal'. 
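The 1D variant behaves analogously; the main difference is the ``causal`` flag, which piles all of the padding onto the left of the sequence::

    from numpy_ml.neural_nets.utils import calc_pad_dims_1D

    X_shape = (4, 10, 8)            # (n_ex, l_in, in_ch)

    # 'same'-length output with kernel_width=3: total pad of 2, split evenly
    assert calc_pad_dims_1D(X_shape, 10, kernel_width=3, stride=1) == (1, 1)

    # causal: same total pad, but all of it on the left
    assert calc_pad_dims_1D(X_shape, 10, kernel_width=3, stride=1, causal=True) == (2, 0)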
Default is None.\n", "\n", " Returns\n", " -------\n", " X_pad : :py:class:`ndarray ` of shape `(n_ex, padded_seq, in_channels)`\n", " The padded output volume\n", " p : 2-tuple\n", " The number of 0-padded columns added to the (left, right) of the sequences\n", " in `X`.\n", " \"\"\"\n", " p = pad\n", " if isinstance(p, int):\n", " p = (p, p)\n", "\n", " if isinstance(p, tuple):\n", " X_pad = np.pad(\n", " X,\n", " pad_width=((0, 0), (p[0], p[1]), (0, 0)),\n", " mode=\"constant\",\n", " constant_values=0,\n", " )\n", "\n", " # compute the correct padding dims for a 'same' or 'causal' convolution\n", " if p in [\"same\", \"causal\"] and kernel_width and stride:\n", " causal = p == \"causal\"\n", " p = calc_pad_dims_1D(\n", " X.shape, X.shape[1], kernel_width, stride, causal=causal, dilation=dilation\n", " )\n", " X_pad, p = pad1D(X, p)\n", "\n", " return X_pad, p\n", "\n", "\n", "def pad2D(X, pad, kernel_shape=None, stride=None, dilation=0):\n", " \"\"\"\n", " Zero-pad a 4D input volume `X` along the second and third dimensions.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume. Padding is applied to `in_rows` and `in_cols`.\n", " pad : tuple, int, or 'same'\n", " The padding amount. If 'same', add padding to ensure that the output of\n", " a 2D convolution with a kernel of `kernel_shape` and stride `stride`\n", " has the same dimensions as the input. If 2-tuple, specifies the number\n", " of padding rows and colums to add *on both sides* of the rows/columns\n", " in `X`. If 4-tuple, specifies the number of rows/columns to add to the\n", " top, bottom, left, and right of the input volume.\n", " kernel_shape : 2-tuple\n", " The dimension of the 2D convolution kernel. Only relevant if p='same'.\n", " Default is None.\n", " stride : int\n", " The stride for the convolution kernel. Only relevant if p='same'.\n", " Default is None.\n", " dilation : int\n", " The dilation of the convolution kernel. Only relevant if p='same'.\n", " Default is 0.\n", "\n", " Returns\n", " -------\n", " X_pad : :py:class:`ndarray ` of shape `(n_ex, padded_in_rows, padded_in_cols, in_channels)`\n", " The padded output volume.\n", " p : 4-tuple\n", " The number of 0-padded rows added to the (top, bottom, left, right) of\n", " `X`.\n", " \"\"\"\n", " p = pad\n", " if isinstance(p, int):\n", " p = (p, p, p, p)\n", "\n", " if isinstance(p, tuple):\n", " if len(p) == 2:\n", " p = (p[0], p[0], p[1], p[1])\n", "\n", " X_pad = np.pad(\n", " X,\n", " pad_width=((0, 0), (p[0], p[1]), (p[2], p[3]), (0, 0)),\n", " mode=\"constant\",\n", " constant_values=0,\n", " )\n", "\n", " # compute the correct padding dims for a 'same' convolution\n", " if p == \"same\" and kernel_shape and stride is not None:\n", " p = calc_pad_dims_2D(\n", " X.shape, X.shape[1:3], kernel_shape, stride, dilation=dilation\n", " )\n", " X_pad, p = pad2D(X, p)\n", " return X_pad, p\n", "\n", "\n", "def dilate(X, d):\n", " \"\"\"\n", " Dilate the 4D volume `X` by `d`.\n", "\n", " Notes\n", " -----\n", " For a visual depiction of a dilated convolution, see [1].\n", "\n", " References\n", " ----------\n", " .. [1] Dumoulin & Visin (2016). 
\"A guide to convolution arithmetic for deep\n", " learning.\" https://arxiv.org/pdf/1603.07285v1.pdf\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume.\n", " d : int\n", " The number of 0-rows to insert between each adjacent row + column in `X`.\n", "\n", " Returns\n", " -------\n", " Xd : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The dilated array where\n", "\n", " .. math::\n", "\n", " \\\\text{out_rows} &= \\\\text{in_rows} + d(\\\\text{in_rows} - 1) \\\\\\\\\n", " \\\\text{out_cols} &= \\\\text{in_cols} + d (\\\\text{in_cols} - 1)\n", " \"\"\"\n", " n_ex, in_rows, in_cols, n_in = X.shape\n", " r_ix = np.repeat(np.arange(1, in_rows), d)\n", " c_ix = np.repeat(np.arange(1, in_cols), d)\n", " Xd = np.insert(X, r_ix, 0, axis=1)\n", " Xd = np.insert(Xd, c_ix, 0, axis=2)\n", " return Xd\n", "\n", "\n", "#######################################################################\n", "# Convolution Arithmetic #\n", "#######################################################################\n", "\n", "\n", "def calc_fan(weight_shape):\n", " \"\"\"\n", " Compute the fan-in and fan-out for a weight matrix/volume.\n", "\n", " Parameters\n", " ----------\n", " weight_shape : tuple\n", " The dimensions of the weight matrix/volume. The final 2 entries must be\n", " `in_ch`, `out_ch`.\n", "\n", " Returns\n", " -------\n", " fan_in : int\n", " The number of input units in the weight tensor\n", " fan_out : int\n", " The number of output units in the weight tensor\n", " \"\"\"\n", " if len(weight_shape) == 2:\n", " fan_in, fan_out = weight_shape\n", " elif len(weight_shape) in [3, 4]:\n", " in_ch, out_ch = weight_shape[-2:]\n", " kernel_size = np.prod(weight_shape[:-2])\n", " fan_in, fan_out = in_ch * kernel_size, out_ch * kernel_size\n", " else:\n", " raise ValueError(\"Unrecognized weight dimension: {}\".format(weight_shape))\n", " return fan_in, fan_out\n", "\n", "\n", "def calc_conv_out_dims(X_shape, W_shape, stride=1, pad=0, dilation=0):\n", " \"\"\"\n", " Compute the dimension of the output volume for the specified convolution.\n", "\n", " Parameters\n", " ----------\n", " X_shape : 3-tuple or 4-tuple\n", " The dimensions of the input volume to the convolution. If 3-tuple,\n", " entries are expected to be (`n_ex`, `in_length`, `in_ch`). If 4-tuple,\n", " entries are expected to be (`n_ex`, `in_rows`, `in_cols`, `in_ch`).\n", " weight_shape : 3-tuple or 4-tuple\n", " The dimensions of the weight volume for the convolution. If 3-tuple,\n", " entries are expected to be (`f_len`, `in_ch`, `out_ch`). If 4-tuple,\n", " entries are expected to be (`fr`, `fc`, `in_ch`, `out_ch`).\n", " pad : tuple, int, or {'same', 'causal'}\n", " The padding amount. If 'same', add padding to ensure that the output\n", " length of a 1D convolution with a kernel of `kernel_shape` and stride\n", " `stride` is the same as the input length. If 'causal' compute padding\n", " such that the output both has the same length as the input AND\n", " ``output[t]`` does not depend on ``input[t + 1:]``. If 2-tuple, specifies the\n", " number of padding columns to add on each side of the sequence. Default\n", " is 0.\n", " stride : int\n", " The stride for the convolution kernel. Default is 1.\n", " dilation : int\n", " The dilation of the convolution kernel. Default is 0.\n", "\n", " Returns\n", " -------\n", " out_dims : 3-tuple or 4-tuple\n", " The dimensions of the output volume. 
If 3-tuple, entries are (`n_ex`,\n", " `out_length`, `out_ch`). If 4-tuple, entries are (`n_ex`, `out_rows`,\n", " `out_cols`, `out_ch`).\n", " \"\"\"\n", " dummy = np.zeros(X_shape)\n", " s, p, d = stride, pad, dilation\n", " if len(X_shape) == 3:\n", " _, p = pad1D(dummy, p)\n", " pw1, pw2 = p\n", " fw, in_ch, out_ch = W_shape\n", " n_ex, in_length, in_ch = X_shape\n", "\n", " _fw = fw * (d + 1) - d\n", " out_length = (in_length + pw1 + pw2 - _fw) // s + 1\n", " out_dims = (n_ex, out_length, out_ch)\n", "\n", " elif len(X_shape) == 4:\n", " _, p = pad2D(dummy, p)\n", " pr1, pr2, pc1, pc2 = p\n", " fr, fc, in_ch, out_ch = W_shape\n", " n_ex, in_rows, in_cols, in_ch = X_shape\n", "\n", " # adjust effective filter size to account for dilation\n", " _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d\n", " out_rows = (in_rows + pr1 + pr2 - _fr) // s + 1\n", " out_cols = (in_cols + pc1 + pc2 - _fc) // s + 1\n", " out_dims = (n_ex, out_rows, out_cols, out_ch)\n", " else:\n", " raise ValueError(\"Unrecognized number of input dims: {}\".format(len(X_shape)))\n", " return out_dims\n", "\n", "\n", "#######################################################################\n", "# Convolution Vectorization Utils #\n", "#######################################################################\n", "\n", "\n", "def _im2col_indices(X_shape, fr, fc, p, s, d=0):\n", " \"\"\"\n", " Helper function that computes indices into X in prep for columnization in\n", " :func:`im2col`.\n", "\n", " Code extended from Andrej Karpathy's `im2col.py`\n", " \"\"\"\n", " pr1, pr2, pc1, pc2 = p\n", " n_ex, n_in, in_rows, in_cols = X_shape\n", "\n", " # adjust effective filter size to account for dilation\n", " _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d\n", "\n", " out_rows = (in_rows + pr1 + pr2 - _fr) // s + 1\n", " out_cols = (in_cols + pc1 + pc2 - _fc) // s + 1\n", "\n", " if any([out_rows <= 0, out_cols <= 0]):\n", " raise ValueError(\n", " \"Dimension mismatch during convolution: \"\n", " \"out_rows = {}, out_cols = {}\".format(out_rows, out_cols)\n", " )\n", "\n", " # i1/j1 : row/col templates\n", " # i0/j0 : n. copies (len) and offsets (values) for row/col templates\n", " i0 = np.repeat(np.arange(fr), fc)\n", " i0 = np.tile(i0, n_in) * (d + 1)\n", " i1 = s * np.repeat(np.arange(out_rows), out_cols)\n", " j0 = np.tile(np.arange(fc), fr * n_in) * (d + 1)\n", " j1 = s * np.tile(np.arange(out_cols), out_rows)\n", "\n", " # i.shape = (fr * fc * n_in, out_height * out_width)\n", " # j.shape = (fr * fc * n_in, out_height * out_width)\n", " # k.shape = (fr * fc * n_in, 1)\n", " i = i0.reshape(-1, 1) + i1.reshape(1, -1)\n", " j = j0.reshape(-1, 1) + j1.reshape(1, -1)\n", " k = np.repeat(np.arange(n_in), fr * fc).reshape(-1, 1)\n", " return k, i, j\n", "\n", "\n", "def im2col(X, W_shape, pad, stride, dilation=0):\n", " \"\"\"\n", " Pads and rearrange overlapping windows of the input volume into column\n", " vectors, returning the concatenated padded vectors in a matrix `X_col`.\n", "\n", " Notes\n", " -----\n", " A NumPy reimagining of MATLAB's ``im2col`` 'sliding' function.\n", "\n", " Code extended from Andrej Karpathy's ``im2col.py``.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume (not padded).\n", " W_shape: 4-tuple containing `(kernel_rows, kernel_cols, in_ch, out_ch)`\n", " The dimensions of the weights/kernels in the present convolutional\n", " layer.\n", " pad : tuple, int, or 'same'\n", " The padding amount. 
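A couple of worked examples for the output-dimension arithmetic (``out = (in + pad_total - kernel) // stride + 1`` per spatial axis)::

    from numpy_ml.neural_nets.utils import calc_conv_out_dims

    # 2D: 32x32 input, 3x3 kernel, stride 2, pad 1 -> 16x16 output
    out = calc_conv_out_dims((8, 32, 32, 3), (3, 3, 3, 16), stride=2, pad=1)
    assert out == (8, 16, 16, 16)

    # 1D: a width-5 kernel with 2 pixels of padding per side preserves length
    out = calc_conv_out_dims((8, 100, 3), (5, 3, 12), stride=1, pad=2)
    assert out == (8, 100, 12)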
If 'same', add padding to ensure that the output of\n", " a 2D convolution with a kernel of `kernel_shape` and stride `stride`\n", " produces an output volume of the same dimensions as the input. If\n", " 2-tuple, specifies the number of padding rows and colums to add *on both\n", " sides* of the rows/columns in X. If 4-tuple, specifies the number of\n", " rows/columns to add to the top, bottom, left, and right of the input\n", " volume.\n", " stride : int\n", " The stride of each convolution kernel\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Default is 0.\n", "\n", " Returns\n", " -------\n", " X_col : :py:class:`ndarray ` of shape (Q, Z)\n", " The reshaped input volume where where:\n", "\n", " .. math::\n", "\n", " Q &= \\\\text{kernel_rows} \\\\times \\\\text{kernel_cols} \\\\times \\\\text{n_in} \\\\\\\\\n", " Z &= \\\\text{n_ex} \\\\times \\\\text{out_rows} \\\\times \\\\text{out_cols}\n", " \"\"\"\n", " fr, fc, n_in, n_out = W_shape\n", " s, p, d = stride, pad, dilation\n", " n_ex, in_rows, in_cols, n_in = X.shape\n", "\n", " # zero-pad the input\n", " X_pad, p = pad2D(X, p, W_shape[:2], stride=s, dilation=d)\n", " pr1, pr2, pc1, pc2 = p\n", "\n", " # shuffle to have channels as the first dim\n", " X_pad = X_pad.transpose(0, 3, 1, 2)\n", "\n", " # get the indices for im2col\n", " k, i, j = _im2col_indices((n_ex, n_in, in_rows, in_cols), fr, fc, p, s, d)\n", "\n", " X_col = X_pad[:, k, i, j]\n", " X_col = X_col.transpose(1, 2, 0).reshape(fr * fc * n_in, -1)\n", " return X_col, p\n", "\n", "\n", "def col2im(X_col, X_shape, W_shape, pad, stride, dilation=0):\n", " \"\"\"\n", " Take columns of a 2D matrix and rearrange them into the blocks/windows of\n", " a 4D image volume.\n", "\n", " Notes\n", " -----\n", " A NumPy reimagining of MATLAB's ``col2im`` 'sliding' function.\n", "\n", " Code extended from Andrej Karpathy's ``im2col.py``.\n", "\n", " Parameters\n", " ----------\n", " X_col : :py:class:`ndarray ` of shape `(Q, Z)`\n", " The columnized version of `X` (assumed to include padding)\n", " X_shape : 4-tuple containing `(n_ex, in_rows, in_cols, in_ch)`\n", " The original dimensions of `X` (not including padding)\n", " W_shape: 4-tuple containing `(kernel_rows, kernel_cols, in_ch, out_ch)`\n", " The dimensions of the weights in the present convolutional layer\n", " pad : 4-tuple of `(left, right, up, down)`\n", " Number of zero-padding rows/cols to add to `X`\n", " stride : int\n", " The stride of each convolution kernel\n", " dilation : int\n", " Number of pixels inserted between kernel elements. 
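A shape check for ``im2col``: each column of ``X_col`` is one flattened receptive field, so with 'same' padding there is one column per output location per example::

    import numpy as np
    from numpy_ml.neural_nets.utils import im2col

    n_ex, rows, cols, in_ch, out_ch = 2, 8, 8, 3, 5
    X = np.random.randn(n_ex, rows, cols, in_ch)
    W_shape = (3, 3, in_ch, out_ch)

    X_col, p = im2col(X, W_shape, pad=1, stride=1)
    assert p == (1, 1, 1, 1)
    assert X_col.shape == (3 * 3 * in_ch, n_ex * rows * cols)   # (Q, Z)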
Default is 0.\n", "\n", " Returns\n", " -------\n", " img : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The reshaped `X_col` input matrix\n", " \"\"\"\n", " if not (isinstance(pad, tuple) and len(pad) == 4):\n", " raise TypeError(\"pad must be a 4-tuple, but got: {}\".format(pad))\n", "\n", " s, d = stride, dilation\n", " pr1, pr2, pc1, pc2 = pad\n", " fr, fc, n_in, n_out = W_shape\n", " n_ex, in_rows, in_cols, n_in = X_shape\n", "\n", " X_pad = np.zeros((n_ex, n_in, in_rows + pr1 + pr2, in_cols + pc1 + pc2))\n", " k, i, j = _im2col_indices((n_ex, n_in, in_rows, in_cols), fr, fc, pad, s, d)\n", "\n", " X_col_reshaped = X_col.reshape(n_in * fr * fc, -1, n_ex)\n", " X_col_reshaped = X_col_reshaped.transpose(2, 0, 1)\n", "\n", " np.add.at(X_pad, (slice(None), k, i, j), X_col_reshaped)\n", "\n", " pr2 = None if pr2 == 0 else -pr2\n", " pc2 = None if pc2 == 0 else -pc2\n", " return X_pad[:, :, pr1:pr2, pc1:pc2]\n", "\n", "\n", "#######################################################################\n", "# Convolution #\n", "#######################################################################\n", "\n", "\n", "def conv2D(X, W, stride, pad, dilation=0):\n", " \"\"\"\n", " A faster (but more memory intensive) implementation of the 2D \"convolution\"\n", " (technically, cross-correlation) of input `X` with a collection of kernels in\n", " `W`.\n", "\n", " Notes\n", " -----\n", " Relies on the :func:`im2col` function to perform the convolution as a single\n", " matrix multiplication.\n", "\n", " For a helpful diagram, see Pete Warden's 2015 blogpost [1].\n", "\n", " References\n", " ----------\n", " .. [1] Warden (2015). \"Why GEMM is at the heart of deep learning,\"\n", " https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume (unpadded).\n", " W: :py:class:`ndarray ` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`\n", " A volume of convolution weights/kernels for a given layer.\n", " stride : int\n", " The stride of each convolution kernel.\n", " pad : tuple, int, or 'same'\n", " The padding amount. If 'same', add padding to ensure that the output of\n", " a 2D convolution with a kernel of `kernel_shape` and stride `stride`\n", " produces an output volume of the same dimensions as the input. If\n", " 2-tuple, specifies the number of padding rows and colums to add *on both\n", " sides* of the rows/columns in `X`. If 4-tuple, specifies the number of\n", " rows/columns to add to the top, bottom, left, and right of the input\n", " volume.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. 
Default is 0.\n", "\n", " Returns\n", " -------\n", " Z : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The covolution of `X` with `W`.\n", " \"\"\"\n", " s, d = stride, dilation\n", " _, p = pad2D(X, pad, W.shape[:2], s, dilation=dilation)\n", "\n", " pr1, pr2, pc1, pc2 = p\n", " fr, fc, in_ch, out_ch = W.shape\n", " n_ex, in_rows, in_cols, in_ch = X.shape\n", "\n", " # update effective filter shape based on dilation factor\n", " _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d\n", "\n", " # compute the dimensions of the convolution output\n", " out_rows = int((in_rows + pr1 + pr2 - _fr) / s + 1)\n", " out_cols = int((in_cols + pc1 + pc2 - _fc) / s + 1)\n", "\n", " # convert X and W into the appropriate 2D matrices and take their product\n", " X_col, _ = im2col(X, W.shape, p, s, d)\n", " W_col = W.transpose(3, 2, 0, 1).reshape(out_ch, -1)\n", "\n", " Z = (W_col @ X_col).reshape(out_ch, out_rows, out_cols, n_ex).transpose(3, 1, 2, 0)\n", "\n", " return Z\n", "\n", "\n", "def conv1D(X, W, stride, pad, dilation=0):\n", " \"\"\"\n", " A faster (but more memory intensive) implementation of a 1D \"convolution\"\n", " (technically, cross-correlation) of input `X` with a collection of kernels in\n", " `W`.\n", "\n", " Notes\n", " -----\n", " Relies on the :func:`im2col` function to perform the convolution as a single\n", " matrix multiplication.\n", "\n", " For a helpful diagram, see Pete Warden's 2015 blogpost [1].\n", "\n", " References\n", " ----------\n", " .. [1] Warden (2015). \"Why GEMM is at the heart of deep learning,\"\n", " https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, l_in, in_ch)`\n", " Input volume (unpadded)\n", " W: :py:class:`ndarray ` of shape `(kernel_width, in_ch, out_ch)`\n", " A volume of convolution weights/kernels for a given layer\n", " stride : int\n", " The stride of each convolution kernel\n", " pad : tuple, int, or 'same'\n", " The padding amount. If 'same', add padding to ensure that the output of\n", " a 1D convolution with a kernel of `kernel_shape` and stride `stride`\n", " produces an output volume of the same dimensions as the input. If\n", " 2-tuple, specifies the number of padding colums to add *on both sides*\n", " of the columns in X.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Default is 0.\n", "\n", " Returns\n", " -------\n", " Z : :py:class:`ndarray ` of shape `(n_ex, l_out, out_ch)`\n", " The convolution of X with W.\n", " \"\"\"\n", " _, p = pad1D(X, pad, W.shape[0], stride, dilation=dilation)\n", "\n", " # add a row dimension to X to permit us to use im2col/col2im\n", " X2D = np.expand_dims(X, axis=1)\n", " W2D = np.expand_dims(W, axis=0)\n", " p2D = (0, 0, p[0], p[1])\n", " Z2D = conv2D(X2D, W2D, stride, p2D, dilation)\n", "\n", " # drop the row dimension\n", " return np.squeeze(Z2D, axis=1)\n", "\n", "\n", "def deconv2D_naive(X, W, stride, pad, dilation=0):\n", " \"\"\"\n", " Perform a \"deconvolution\" (more accurately, a transposed convolution) of an\n", " input volume `X` with a weight kernel `W`, incorporating stride, pad, and\n", " dilation.\n", "\n", " Notes\n", " -----\n", " Rather than using the transpose of the convolution matrix, this approach\n", " uses a direct convolution with zero padding, which, while conceptually\n", " straightforward, is computationally inefficient.\n", "\n", " For further explanation, see [1].\n", "\n", " References\n", " ----------\n", " .. 
[1] Dumoulin & Visin (2016). \"A guide to convolution arithmetic for deep\n", " learning.\" https://arxiv.org/pdf/1603.07285v1.pdf\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume (not padded)\n", " W: :py:class:`ndarray ` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`\n", " A volume of convolution weights/kernels for a given layer\n", " stride : int\n", " The stride of each convolution kernel\n", " pad : tuple, int, or 'same'\n", " The padding amount. If 'same', add padding to ensure that the output of\n", " a 2D convolution with a kernel of `kernel_shape` and stride `stride`\n", " produces an output volume of the same dimensions as the input. If\n", " 2-tuple, specifies the number of padding rows and colums to add *on both\n", " sides* of the rows/columns in `X`. If 4-tuple, specifies the number of\n", " rows/columns to add to the top, bottom, left, and right of the input\n", " volume.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Default is 0.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, n_out)`\n", " The decovolution of (padded) input volume `X` with `W` using stride `s` and\n", " dilation `d`.\n", " \"\"\"\n", " if stride > 1:\n", " X = dilate(X, stride - 1)\n", " stride = 1\n", "\n", " # pad the input\n", " X_pad, p = pad2D(X, pad, W.shape[:2], stride=stride, dilation=dilation)\n", "\n", " n_ex, in_rows, in_cols, n_in = X_pad.shape\n", " fr, fc, n_in, n_out = W.shape\n", " s, d = stride, dilation\n", " pr1, pr2, pc1, pc2 = p\n", "\n", " # update effective filter shape based on dilation factor\n", " _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d\n", "\n", " # compute deconvolution output dims\n", " out_rows = s * (in_rows - 1) - pr1 - pr2 + _fr\n", " out_cols = s * (in_cols - 1) - pc1 - pc2 + _fc\n", " out_dim = (out_rows, out_cols)\n", "\n", " # add additional padding to achieve the target output dim\n", " _p = calc_pad_dims_2D(X_pad.shape, out_dim, W.shape[:2], s, d)\n", " X_pad, pad = pad2D(X_pad, _p, W.shape[:2], stride=s, dilation=dilation)\n", "\n", " # perform the forward convolution using the flipped weight matrix (note\n", " # we set pad to 0, since we've already added padding)\n", " Z = conv2D(X_pad, np.rot90(W, 2), s, 0, d)\n", "\n", " pr2 = None if pr2 == 0 else -pr2\n", " pc2 = None if pc2 == 0 else -pc2\n", " return Z[:, pr1:pr2, pc1:pc2, :]\n", "\n", "\n", "def conv2D_naive(X, W, stride, pad, dilation=0):\n", " \"\"\"\n", " A slow but more straightforward implementation of a 2D \"convolution\"\n", " (technically, cross-correlation) of input `X` with a collection of kernels `W`.\n", "\n", " Notes\n", " -----\n", " This implementation uses ``for`` loops and direct indexing to perform the\n", " convolution. As a result, it is slower than the vectorized :func:`conv2D`\n", " function that relies on the :func:`col2im` and :func:`im2col`\n", " transformations.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " Input volume.\n", " W: :py:class:`ndarray ` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`\n", " The volume of convolution weights/kernels.\n", " stride : int\n", " The stride of each convolution kernel.\n", " pad : tuple, int, or 'same'\n", " The padding amount. 
If 'same', add padding to ensure that the output of\n", " a 2D convolution with a kernel of `kernel_shape` and stride `stride`\n", " produces an output volume of the same dimensions as the input. If\n", " 2-tuple, specifies the number of padding rows and colums to add *on both\n", " sides* of the rows/columns in `X`. If 4-tuple, specifies the number of\n", " rows/columns to add to the top, bottom, left, and right of the input\n", " volume.\n", " dilation : int\n", " Number of pixels inserted between kernel elements. Default is 0.\n", "\n", " Returns\n", " -------\n", " Z : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The covolution of `X` with `W`.\n", " \"\"\"\n", " s, d = stride, dilation\n", " X_pad, p = pad2D(X, pad, W.shape[:2], stride=s, dilation=d)\n", "\n", " pr1, pr2, pc1, pc2 = p\n", " fr, fc, in_ch, out_ch = W.shape\n", " n_ex, in_rows, in_cols, in_ch = X.shape\n", "\n", " # update effective filter shape based on dilation factor\n", " fr, fc = fr * (d + 1) - d, fc * (d + 1) - d\n", "\n", " out_rows = int((in_rows + pr1 + pr2 - fr) / s + 1)\n", " out_cols = int((in_cols + pc1 + pc2 - fc) / s + 1)\n", "\n", " Z = np.zeros((n_ex, out_rows, out_cols, out_ch))\n", " for m in range(n_ex):\n", " for c in range(out_ch):\n", " for i in range(out_rows):\n", " for j in range(out_cols):\n", " i0, i1 = i * s, (i * s) + fr\n", " j0, j1 = j * s, (j * s) + fc\n", "\n", " window = X_pad[m, i0 : i1 : (d + 1), j0 : j1 : (d + 1), :]\n", " Z[m, i, j, c] = np.sum(window * W[:, :, :, c])\n", " return Z\n", "\n", "\n", "#######################################################################\n", "# Weight Initialization #\n", "#######################################################################\n", "\n", "\n", "def he_uniform(weight_shape):\n", " \"\"\"\n", " Initializes network weights `W` with using the He uniform initialization\n", " strategy.\n", "\n", " Notes\n", " -----\n", " The He uniform initializations trategy initializes thew eights in `W` using\n", " draws from Uniform(-b, b) where\n", "\n", " .. math::\n", "\n", " b = \\sqrt{\\\\frac{6}{\\\\text{fan_in}}}\n", "\n", " Developed for deep networks with ReLU nonlinearities.\n", "\n", " Parameters\n", " ----------\n", " weight_shape : tuple\n", " The dimensions of the weight matrix/volume.\n", "\n", " Returns\n", " -------\n", " W : :py:class:`ndarray ` of shape `weight_shape`\n", " The initialized weights.\n", " \"\"\"\n", " fan_in, fan_out = calc_fan(weight_shape)\n", " b = np.sqrt(6 / fan_in)\n", " return np.random.uniform(-b, b, size=weight_shape)\n", "\n", "\n", "def he_normal(weight_shape):\n", " \"\"\"\n", " Initialize network weights `W` using the He normal initialization strategy.\n", "\n", " Notes\n", " -----\n", " The He normal initialization strategy initializes the weights in `W` using\n", " draws from TruncatedNormal(0, b) where the variance `b` is\n", "\n", " .. 
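The loop-based implementation is mainly useful as a correctness check against the im2col version; the two compute the same cross-correlation and should agree to numerical precision::

    import numpy as np
    from numpy_ml.neural_nets.utils import conv2D, conv2D_naive

    np.random.seed(0)
    X = np.random.randn(2, 7, 7, 3)
    W = np.random.randn(3, 3, 3, 4)

    Z_fast = conv2D(X, W, stride=2, pad=1)
    Z_slow = conv2D_naive(X, W, stride=2, pad=1)
    np.testing.assert_allclose(Z_fast, Z_slow, atol=1e-10)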
math::\n", "\n", " b = \\\\frac{2}{\\\\text{fan_in}}\n", "\n", " He normal initialization was originally developed for deep networks with\n", " :class:`~numpy_ml.neural_nets.activations.ReLU` nonlinearities.\n", "\n", " Parameters\n", " ----------\n", " weight_shape : tuple\n", " The dimensions of the weight matrix/volume.\n", "\n", " Returns\n", " -------\n", " W : :py:class:`ndarray ` of shape `weight_shape`\n", " The initialized weights.\n", " \"\"\"\n", " fan_in, fan_out = calc_fan(weight_shape)\n", " std = np.sqrt(2 / fan_in)\n", " return truncated_normal(0, std, weight_shape)\n", "\n", "\n", "def glorot_uniform(weight_shape, gain=1.0):\n", " \"\"\"\n", " Initialize network weights `W` using the Glorot uniform initialization\n", " strategy.\n", "\n", " Notes\n", " -----\n", " The Glorot uniform initialization strategy initializes weights using draws\n", " from ``Uniform(-b, b)`` where:\n", "\n", " .. math::\n", "\n", " b = \\\\text{gain} \\sqrt{\\\\frac{6}{\\\\text{fan_in} + \\\\text{fan_out}}}\n", "\n", " The motivation for Glorot uniform initialization is to choose weights to\n", " ensure that the variance of the layer outputs are approximately equal to\n", " the variance of its inputs.\n", "\n", " This initialization strategy was primarily developed for deep networks with\n", " tanh and logistic sigmoid nonlinearities.\n", "\n", " Parameters\n", " ----------\n", " weight_shape : tuple\n", " The dimensions of the weight matrix/volume.\n", "\n", " Returns\n", " -------\n", " W : :py:class:`ndarray ` of shape `weight_shape`\n", " The initialized weights.\n", " \"\"\"\n", " fan_in, fan_out = calc_fan(weight_shape)\n", " b = gain * np.sqrt(6 / (fan_in + fan_out))\n", " return np.random.uniform(-b, b, size=weight_shape)\n", "\n", "\n", "def glorot_normal(weight_shape, gain=1.0):\n", " \"\"\"\n", " Initialize network weights `W` using the Glorot normal initialization strategy.\n", "\n", " Notes\n", " -----\n", " The Glorot normal initializaiton initializes weights with draws from\n", " TruncatedNormal(0, b) where the variance `b` is\n", "\n", " .. 
math::\n", "\n", " b = \\\\frac{2 \\\\text{gain}^2}{\\\\text{fan_in} + \\\\text{fan_out}}\n", "\n", " The motivation for Glorot normal initialization is to choose weights to\n", " ensure that the variance of the layer outputs are approximately equal to\n", " the variance of its inputs.\n", "\n", " This initialization strategy was primarily developed for deep networks with\n", " :class:`~numpy_ml.neural_nets.activations.Tanh` and\n", " :class:`~numpy_ml.neural_nets.activations.Sigmoid` nonlinearities.\n", "\n", " Parameters\n", " ----------\n", " weight_shape : tuple\n", " The dimensions of the weight matrix/volume.\n", "\n", " Returns\n", " -------\n", " W : :py:class:`ndarray ` of shape `weight_shape`\n", " The initialized weights.\n", " \"\"\"\n", " fan_in, fan_out = calc_fan(weight_shape)\n", " std = gain * np.sqrt(2 / (fan_in + fan_out))\n", " return truncated_normal(0, std, weight_shape)\n", "\n", "\n", "def truncated_normal(mean, std, out_shape):\n", " \"\"\"\n", " Generate draws from a truncated normal distribution via rejection sampling.\n", "\n", " Notes\n", " -----\n", " The rejection sampling regimen draws samples from a normal distribution\n", " with mean `mean` and standard deviation `std`, and resamples any values\n", " more than two standard deviations from `mean`.\n", "\n", " Parameters\n", " ----------\n", " mean : float or array_like of floats\n", " The mean/center of the distribution\n", " std : float or array_like of floats\n", " Standard deviation (spread or \"width\") of the distribution.\n", " out_shape : int or tuple of ints\n", " Output shape. If the given shape is, e.g., ``(m, n, k)``, then\n", " ``m * n * k`` samples are drawn.\n", "\n", " Returns\n", " -------\n", " samples : :py:class:`ndarray ` of shape `out_shape`\n", " Samples from the truncated normal distribution parameterized by `mean`\n", " and `std`.\n", " \"\"\"\n", " samples = np.random.normal(loc=mean, scale=std, size=out_shape)\n", " reject = np.logical_or(samples >= mean + 2 * std, samples <= mean - 2 * std)\n", " while any(reject.flatten()):\n", " resamples = np.random.normal(loc=mean, scale=std, size=reject.sum())\n", " samples[reject] = resamples\n", " reject = np.logical_or(samples >= mean + 2 * std, samples <= mean - 2 * std)\n", " return samples\n"]} {"path": "numpy_ml/neural_nets/models/__init__.py", "content": ["from .vae import *\n", "from .wgan_gp import *\n", "from .w2v import *\n"]} {"path": "numpy_ml/neural_nets/models/w2v.py", "content": ["from time import time\n", "\n", "import numpy as np\n", "\n", "from ..layers import Embedding\n", "from ..losses import NCELoss\n", "\n", "from ...preprocessing.nlp import Vocabulary, tokenize_words\n", "from ...utils.data_structures import DiscreteSampler\n", "\n", "\n", "class Word2Vec(object):\n", " def __init__(\n", " self,\n", " context_len=5,\n", " min_count=None,\n", " skip_gram=False,\n", " max_tokens=None,\n", " embedding_dim=300,\n", " filter_stopwords=True,\n", " noise_dist_power=0.75,\n", " init=\"glorot_uniform\",\n", " num_negative_samples=64,\n", " optimizer=\"SGD(lr=0.1)\",\n", " ):\n", " \"\"\"\n", " A word2vec model supporting both continuous bag of words (CBOW) and\n", " skip-gram architectures, with training via noise contrastive\n", " estimation.\n", "\n", " Parameters\n", " ----------\n", " context_len : int\n", " The number of words to the left and right of the current word to\n", " use as context during training. 
Larger values result in more\n", " training examples and thus can lead to higher accuracy at the\n", " expense of additional training time. Default is 5.\n", " min_count : int or None\n", " Minimum number of times a token must occur in order to be included\n", " in vocab. If None, include all tokens from `corpus_fp` in vocab.\n", " Default is None.\n", " skip_gram : bool\n", " Whether to train the skip-gram or CBOW model. The skip-gram model\n", " is trained to predict the target word i given its surrounding\n", " context, ``words[i - context:i]`` and ``words[i + 1:i + 1 +\n", " context]`` as input. Default is False.\n", " max_tokens : int or None\n", " Only add the first `max_tokens` most frequent tokens that occur\n", " more than `min_count` to the vocabulary. If None, add all tokens\n", " that occur more than than `min_count`. Default is None.\n", " embedding_dim : int\n", " The number of dimensions in the final word embeddings. Default is\n", " 300.\n", " filter_stopwords : bool\n", " Whether to remove stopwords before encoding the words in the\n", " corpus. Default is True.\n", " noise_dist_power : float\n", " The power the unigram count is raised to when computing the noise\n", " distribution for negative sampling. A value of 0 corresponds to a\n", " uniform distribution over tokens, and a value of 1 corresponds to a\n", " distribution proportional to the token unigram counts. Default is\n", " 0.75.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is 'glorot_uniform'.\n", " num_negative_samples: int\n", " The number of negative samples to draw from the noise distribution\n", " for each positive training sample. If 0, use the hierarchical\n", " softmax formulation of the model instead. Default is 5.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the `update` method. If None, use the\n", " :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with\n", " default parameters. Default is None.\n", "\n", " Attributes\n", " ----------\n", " parameters : dict\n", " hyperparameters : dict\n", " derived_variables : dict\n", " gradients : dict\n", "\n", " Notes\n", " -----\n", " The word2vec model is outlined in in [1].\n", "\n", " CBOW architecture::\n", "\n", " w_{t-R} ----|\n", " w_{t-R+1} ----|\n", " ... --> Average --> Embedding layer --> [NCE Layer / HSoftmax] --> P(w_{t} | w_{...})\n", " w_{t+R-1} ----|\n", " w_{t+R} ----|\n", "\n", " Skip-gram architecture::\n", "\n", " |--> P(w_{t-R} | w_{t})\n", " |--> P(w_{t-R+1} | w_{t})\n", " w_{t} --> Embedding layer --> [NCE Layer / HSoftmax] --| ...\n", " |--> P(w_{t+R-1} | w_{t})\n", " |--> P(w_{t+R} | w_{t})\n", "\n", " where :math:`w_{i}` is the one-hot representation of the word at position\n", " `i` within a sentence in the corpus and `R` is the length of the context\n", " window on either side of the target word.\n", "\n", " References\n", " ----------\n", " .. [1] Mikolov et al. (2013). 
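To make the two architectures concrete, here is how training pairs would be generated for a toy sentence, mirroring the dynamic-window logic used in ``minibatcher`` below: CBOW pairs a (variable-length) context with its centre word, while skip-gram pairs the centre word with each context word individually. The names here are purely illustrative::

    import numpy as np

    np.random.seed(0)
    word_ixs = [4, 7, 1, 9, 3]             # toy sentence of word IDs
    context_len = 3

    cbow_pairs, sg_pairs = [], []
    for loc, word in enumerate(word_ixs):
        # dynamic window: a random radius R downweights distant context words
        R = np.random.randint(1, context_len)
        context = word_ixs[max(loc - R, 0):loc] + word_ixs[loc + 1:loc + 1 + R]
        if not context:
            continue
        cbow_pairs.append((context, word))           # many context -> one target
        sg_pairs.extend((word, c) for c in context)  # one input -> one target each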
\"Distributed representations of words\n", " and phrases and their compositionality,\" Proceedings of the 26th\n", " International Conference on Neural Information Processing Systems.\n", " https://arxiv.org/pdf/1310.4546.pdf\n", " \"\"\"\n", " self.init = init\n", " self.optimizer = optimizer\n", " self.skip_gram = skip_gram\n", " self.min_count = min_count\n", " self.max_tokens = max_tokens\n", " self.context_len = context_len\n", " self.embedding_dim = embedding_dim\n", " self.filter_stopwords = filter_stopwords\n", " self.noise_dist_power = noise_dist_power\n", " self.num_negative_samples = num_negative_samples\n", " self.special_chars = set([\"\", \"\", \"\"])\n", "\n", " def _init_params(self):\n", " self._dv = {}\n", " self._build_noise_distribution()\n", "\n", " self.embeddings = Embedding(\n", " init=self.init,\n", " vocab_size=self.vocab_size,\n", " n_out=self.embedding_dim,\n", " optimizer=self.optimizer,\n", " pool=None if self.skip_gram else \"mean\",\n", " )\n", "\n", " self.loss = NCELoss(\n", " init=self.init,\n", " optimizer=self.optimizer,\n", " n_classes=self.vocab_size,\n", " subtract_log_label_prob=False,\n", " noise_sampler=self._noise_sampler,\n", " num_negative_samples=self.num_negative_samples,\n", " )\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"Model parameters\"\"\"\n", " param = {\"components\": {\"embeddings\": {}, \"loss\": {}}}\n", " if hasattr(self, \"embeddings\"):\n", " param[\"components\"] = {\n", " \"embeddings\": self.embeddings.parameters,\n", " \"loss\": self.loss.parameters,\n", " }\n", " return param\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Model hyperparameters\"\"\"\n", " hp = {\n", " \"layer\": \"Word2Vec\",\n", " \"init\": self.init,\n", " \"skip_gram\": self.skip_gram,\n", " \"optimizer\": self.optimizer,\n", " \"max_tokens\": self.max_tokens,\n", " \"context_len\": self.context_len,\n", " \"embedding_dim\": self.embedding_dim,\n", " \"noise_dist_power\": self.noise_dist_power,\n", " \"filter_stopwords\": self.filter_stopwords,\n", " \"num_negative_samples\": self.num_negative_samples,\n", " \"vocab_size\": self.vocab_size if hasattr(self, \"vocab_size\") else None,\n", " \"components\": {\"embeddings\": {}, \"loss\": {}},\n", " }\n", "\n", " if hasattr(self, \"embeddings\"):\n", " hp[\"components\"] = {\n", " \"embeddings\": self.embeddings.hyperparameters,\n", " \"loss\": self.loss.hyperparameters,\n", " }\n", " return hp\n", "\n", " @property\n", " def derived_variables(self):\n", " \"\"\"Variables computed during model operation\"\"\"\n", " dv = {\"components\": {\"embeddings\": {}, \"loss\": {}}}\n", " dv.update(self._dv)\n", "\n", " if hasattr(self, \"embeddings\"):\n", " dv[\"components\"] = {\n", " \"embeddings\": self.embeddings.derived_variables,\n", " \"loss\": self.loss.derived_variables,\n", " }\n", " return dv\n", "\n", " @property\n", " def gradients(self):\n", " \"\"\"Model parameter gradients\"\"\"\n", " grad = {\"components\": {\"embeddings\": {}, \"loss\": {}}}\n", " if hasattr(self, \"embeddings\"):\n", " grad[\"components\"] = {\n", " \"embeddings\": self.embeddings.gradients,\n", " \"loss\": self.loss.gradients,\n", " }\n", " return grad\n", "\n", " def forward(self, X, targets, retain_derived=True):\n", " \"\"\"\n", " Evaluate the network on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, n_in)`\n", " Layer input, representing a minibatch of `n_ex` examples, each\n", " consisting of `n_in` integer word indices\n", " 
targets : :py:class:`ndarray ` of shape `(n_ex,)`\n", " Target word index for each example in the minibatch.\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If `False`, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " True.\n", "\n", " Returns\n", " -------\n", " loss : float\n", " The loss associated with the current minibatch\n", " y_pred : :py:class:`ndarray ` of shape `(n_ex,)`\n", " The conditional probabilities of the words in `targets` given the\n", " corresponding example / context in `X`.\n", " \"\"\"\n", " X_emb = self.embeddings.forward(X, retain_derived=True)\n", " loss, y_pred = self.loss.loss(X_emb, targets.flatten(), retain_derived=True)\n", " return loss, y_pred\n", "\n", " def backward(self):\n", " \"\"\"\n", " Compute the gradient of the loss wrt the current network parameters.\n", " \"\"\"\n", " dX_emb = self.loss.grad(retain_grads=True, update_params=False)\n", " self.embeddings.backward(dX_emb)\n", "\n", " def update(self, cur_loss=None):\n", " \"\"\"Perform gradient updates\"\"\"\n", " self.loss.update(cur_loss)\n", " self.embeddings.update(cur_loss)\n", " self.flush_gradients()\n", "\n", " def flush_gradients(self):\n", " \"\"\"Reset parameter gradients after update\"\"\"\n", " self.loss.flush_gradients()\n", " self.embeddings.flush_gradients()\n", "\n", " def get_embedding(self, word_ids):\n", " \"\"\"\n", " Retrieve the embeddings for a collection of word IDs.\n", "\n", " Parameters\n", " ----------\n", " word_ids : :py:class:`ndarray ` of shape `(M,)`\n", " An array of word IDs to retrieve embeddings for.\n", "\n", " Returns\n", " -------\n", " embeddings : :py:class:`ndarray ` of shape `(M, n_out)`\n", " The embedding vectors for each of the `M` word IDs.\n", " \"\"\"\n", " if isinstance(word_ids, list):\n", " word_ids = np.array(word_ids)\n", " return self.embeddings.lookup(word_ids)\n", "\n", " def _build_noise_distribution(self):\n", " \"\"\"\n", " Construct the noise distribution for use during negative sampling.\n", "\n", " For a word ``w`` in the corpus, the noise distribution is::\n", "\n", " P_n(w) = Count(w) ** noise_dist_power / Z\n", "\n", " where ``Z`` is a normalizing constant, and `noise_dist_power` is a\n", " hyperparameter of the model. Mikolov et al. 
report best performance\n", " using a `noise_dist_power` of 0.75.\n", " \"\"\"\n", " if not hasattr(self, \"vocab\"):\n", " raise ValueError(\"Must call `fit` before constructing noise distribution\")\n", "\n", " probs = np.zeros(len(self.vocab))\n", " power = self.hyperparameters[\"noise_dist_power\"]\n", "\n", " for ix, token in enumerate(self.vocab):\n", " count = token.count\n", " probs[ix] = count ** power\n", "\n", " probs /= np.sum(probs)\n", " self._noise_sampler = DiscreteSampler(probs, log=False, with_replacement=False)\n", "\n", " def _train_epoch(self, corpus_fps, encoding):\n", " total_loss = 0\n", " batch_generator = self.minibatcher(corpus_fps, encoding)\n", " for ix, (X, target) in enumerate(batch_generator):\n", " loss = self._train_batch(X, target)\n", " total_loss += loss\n", " if self.verbose:\n", " smooth_loss = 0.99 * smooth_loss + 0.01 * loss if ix > 0 else loss\n", " fstr = \"[Batch {}] Loss: {:.5f} | Smoothed Loss: {:.5f}\"\n", " print(fstr.format(ix + 1, loss, smooth_loss))\n", " return total_loss / (ix + 1)\n", "\n", " def _train_batch(self, X, target):\n", " loss, _ = self.forward(X, target)\n", " self.backward()\n", " self.update(loss)\n", " return loss\n", "\n", " def minibatcher(self, corpus_fps, encoding):\n", " \"\"\"\n", " A minibatch generator for skip-gram and CBOW models.\n", "\n", " Parameters\n", " ----------\n", " corpus_fps : str or list of strs\n", " The filepath / list of filepaths to the document(s) to be encoded.\n", " Each document is expected to be encoded as newline-separated\n", " string of text, with adjacent tokens separated by a whitespace\n", " character.\n", " encoding : str\n", " Specifies the text encoding for corpus. This value is passed\n", " directly to Python's `open` builtin. Common entries are either\n", " 'utf-8' (no header byte), or 'utf-8-sig' (header byte).\n", "\n", " Yields\n", " ------\n", " X : list of length `batchsize` or :py:class:`ndarray ` of shape (`batchsize`, `n_in`)\n", " The context IDs for a minibatch of `batchsize` examples. If\n", " ``self.skip_gram`` is False, `X` will be a ragged list consisting\n", " of `batchsize` variable-length lists. 
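A standalone sketch of the unigram-smoothing step described above: raising the counts to ``noise_dist_power = 0.75`` flattens the distribution, so rare words are sampled as negatives more often than their raw frequency alone would suggest::

    import numpy as np

    counts = np.array([1000.0, 100.0, 10.0, 1.0])   # toy unigram counts

    def noise_dist(counts, power):
        probs = counts ** power
        return probs / probs.sum()

    print(noise_dist(counts, 1.0))    # proportional to the raw counts
    print(noise_dist(counts, 0.75))   # smoothed: rare words get more mass
    print(noise_dist(counts, 0.0))    # uniform over the vocabulary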
If ``self.skip_gram`` is\n", " `True`, all sublists will be of the same length (`n_in`) and `X`\n", " will be returned as a :py:class:`ndarray ` of shape (`batchsize`, `n_in`).\n", " target : :py:class:`ndarray ` of shape (`batchsize`, 1)\n", " The target IDs associated with each example in `X`\n", " \"\"\"\n", " batchsize = self.batchsize\n", " X_mb, target_mb, mb_ready = [], [], False\n", "\n", " for d_ix, doc_fp in enumerate(corpus_fps):\n", " with open(doc_fp, \"r\", encoding=encoding) as doc:\n", " for line in doc:\n", " words = tokenize_words(\n", " line, lowercase=True, filter_stopwords=self.filter_stopwords\n", " )\n", " word_ixs = self.vocab.words_to_indices(\n", " self.vocab.filter(words, unk=False)\n", " )\n", " for word_loc, word in enumerate(word_ixs):\n", " # since more distant words are usually less related to\n", " # the target word, we downweight them by sampling from\n", " # them less frequently during training.\n", " R = np.random.randint(1, self.context_len)\n", " left = word_ixs[max(word_loc - R, 0) : word_loc]\n", " right = word_ixs[word_loc + 1 : word_loc + 1 + R]\n", " context = left + right\n", "\n", " if len(context) == 0:\n", " continue\n", "\n", " # in the skip-gram architecture we use each of the\n", " # surrounding context to predict `word` / avoid\n", " # predicting negative samples\n", " if self.skip_gram:\n", " X_mb.extend([word] * len(context))\n", " target_mb.extend(context)\n", " mb_ready = len(target_mb) >= batchsize\n", "\n", " # in the CBOW architecture we use the average of the\n", " # context embeddings to predict the target `word` / avoid\n", " # predicting the negative samples\n", " else:\n", " context = np.array(context)\n", " X_mb.append(context) # X_mb will be a ragged array\n", " target_mb.append(word)\n", " mb_ready = len(X_mb) == batchsize\n", "\n", " if mb_ready:\n", " mb_ready = False\n", " X_batch, target_batch = X_mb.copy(), target_mb.copy()\n", " X_mb, target_mb = [], []\n", " if self.skip_gram:\n", " X_batch = np.array(X_batch)[:, None]\n", " target_batch = np.array(target_batch)[:, None]\n", " yield X_batch, target_batch\n", "\n", " # if we've reached the end of our final document and there are\n", " # remaining examples, yield the stragglers as a partial minibatch\n", " if len(X_mb) > 0:\n", " if self.skip_gram:\n", " X_mb = np.array(X_mb)[:, None]\n", " target_mb = np.array(target_mb)[:, None]\n", " yield X_mb, target_mb\n", "\n", " def fit(\n", " self, corpus_fps, encoding=\"utf-8-sig\", n_epochs=20, batchsize=128, verbose=True\n", " ):\n", " \"\"\"\n", " Learn word2vec embeddings for the examples in `X_train`.\n", "\n", " Parameters\n", " ----------\n", " corpus_fps : str or list of strs\n", " The filepath / list of filepaths to the document(s) to be encoded.\n", " Each document is expected to be encoded as newline-separated\n", " string of text, with adjacent tokens separated by a whitespace\n", " character.\n", " encoding : str\n", " Specifies the text encoding for corpus. Common entries are either\n", " 'utf-8' (no header byte), or 'utf-8-sig' (header byte). Default\n", " value is 'utf-8-sig'.\n", " n_epochs : int\n", " The maximum number of training epochs to run. Default is 20.\n", " batchsize : int\n", " The desired number of examples in each training batch. Default is\n", " 128.\n", " verbose : bool\n", " Print batch information during training. 
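A small sketch of the dynamic context window used by the minibatcher above, showing how skip-gram (center, context) pairs are produced for one toy sentence. The sentence and `context_len` value are assumptions; the reduced-window sampling mirrors the `np.random.randint(1, self.context_len)` call in the code.

```python
import numpy as np

word_ixs = [3, 17, 4, 9, 25]   # toy sentence, already mapped to vocabulary indices
context_len = 3                # assumed window hyperparameter

pairs = []
for loc, word in enumerate(word_ixs):
    # sample a reduced window R so that more distant words are used less often,
    # mirroring the `np.random.randint(1, self.context_len)` call above
    R = np.random.randint(1, context_len)
    left = word_ixs[max(loc - R, 0):loc]
    right = word_ixs[loc + 1:loc + 1 + R]
    # skip-gram: each (center word, context word) pair is one training example
    pairs.extend((word, c) for c in left + right)

print(pairs)
```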
Default is True.\n", " \"\"\"\n", " self.verbose = verbose\n", " self.n_epochs = n_epochs\n", " self.batchsize = batchsize\n", "\n", " self.vocab = Vocabulary(\n", " lowercase=True,\n", " min_count=self.min_count,\n", " max_tokens=self.max_tokens,\n", " filter_stopwords=self.filter_stopwords,\n", " )\n", " self.vocab.fit(corpus_fps, encoding=encoding)\n", " self.vocab_size = len(self.vocab)\n", "\n", " # ignore special characters when training the model\n", " for sp in self.special_chars:\n", " self.vocab.counts[sp] = 0\n", "\n", " # now that we know our vocabulary size, we can initialize the embeddings\n", " self._init_params()\n", "\n", " prev_loss = np.inf\n", " for i in range(n_epochs):\n", " loss, estart = 0.0, time()\n", " loss = self._train_epoch(corpus_fps, encoding)\n", "\n", " fstr = \"[Epoch {}] Avg. loss: {:.3f} Delta: {:.3f} ({:.2f}m/epoch)\"\n", " print(fstr.format(i + 1, loss, prev_loss - loss, (time() - estart) / 60.0))\n", " prev_loss = loss\n"]} {"path": "numpy_ml/neural_nets/models/wgan_gp.py", "content": ["from time import time\n", "from collections import OrderedDict\n", "\n", "import numpy as np\n", "\n", "from ..utils import minibatch\n", "from ..layers import FullyConnected\n", "from ..losses import WGAN_GPLoss\n", "\n", "\n", "class WGAN_GP(object):\n", " \"\"\"\n", " A Wasserstein generative adversarial network (WGAN) architecture with\n", " gradient penalty (GP).\n", "\n", " Notes\n", " -----\n", " In contrast to a regular WGAN, WGAN-GP uses gradient penalty on the\n", " generator rather than weight clipping to encourage the 1-Lipschitz\n", " constraint:\n", "\n", " .. math::\n", "\n", " | \\\\text{Generator}(\\mathbf{x}_1) - \\\\text{Generator}(\\mathbf{x}_2) |\n", " \\leq |\\mathbf{x}_1 - \\mathbf{x}_2 | \\ \\ \\ \\ \\\\forall \\mathbf{x}_1, \\mathbf{x}_2\n", "\n", " In other words, the generator must have input gradients with a norm of at\n", " most 1 under the :math:`\\mathbf{X}_{real}` and :math:`\\mathbf{X}_{fake}`\n", " data distributions.\n", "\n", " To enforce this constraint, WGAN-GP penalizes the model if the generator\n", " gradient norm moves away from a target norm of 1. See\n", " :class:`~numpy_ml.neural_nets.losses.WGAN_GPLoss` for more details.\n", "\n", " In contrast to a standard WGAN, WGAN-GP avoids using BatchNorm in the\n", " critic, as correlation between samples in a batch can impact the stability\n", " of the gradient penalty.\n", "\n", " WGAP-GP architecture:\n", "\n", " .. code-block:: text\n", "\n", " X_real ------------------------|\n", " >---> [Critic] --> Y_out\n", " Z --> [Generator] --> X_fake --|\n", "\n", " where ``[Generator]`` is\n", "\n", " .. code-block:: text\n", "\n", " FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4\n", "\n", " and ``[Critic]`` is\n", "\n", " .. code-block:: text\n", "\n", " FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4\n", "\n", " and\n", "\n", " .. math::\n", "\n", " Z \\sim \\mathcal{N}(0, 1)\n", " \"\"\"\n", "\n", " def __init__(\n", " self,\n", " g_hidden=512,\n", " init=\"he_uniform\",\n", " optimizer=\"RMSProp(lr=0.0001)\",\n", " debug=False,\n", " ):\n", " \"\"\"\n", " Wasserstein generative adversarial network with gradient penalty.\n", "\n", " Parameters\n", " ----------\n", " g_hidden : int\n", " The number of units in the critic and generator hidden layers.\n", " Default is 512.\n", " init : str\n", " The weight initialization strategy. Valid entries are\n", " {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',\n", " 'std_normal', 'trunc_normal'}. 
Default is \"he_uniform\".\n", " optimizer : str or :doc:`Optimizer ` object or None\n", " The optimization strategy to use when performing gradient updates.\n", " If None, use the :class:`~numpy_ml.neural_nets.optimizers.SGD`\n", " optimizer with default parameters. Default is \"RMSProp(lr=0.0001)\".\n", " debug : bool\n", " Whether to store additional intermediate output within\n", " ``self.derived_variables``. Default is False.\n", " \"\"\"\n", " self.init = init\n", " self.debug = debug\n", " self.g_hidden = g_hidden\n", " self.optimizer = optimizer\n", "\n", " self.lambda_ = None\n", " self.n_steps = None\n", " self.batchsize = None\n", "\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self._dv = {}\n", " self._gr = {}\n", " self._build_critic()\n", " self._build_generator()\n", " self.is_initialized = True\n", "\n", " def _build_generator(self):\n", " \"\"\"\n", " FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4\n", " \"\"\"\n", " self.generator = OrderedDict()\n", " self.generator[\"FC1\"] = FullyConnected(\n", " self.g_hidden, act_fn=\"ReLU\", optimizer=self.optimizer, init=self.init\n", " )\n", " self.generator[\"FC2\"] = FullyConnected(\n", " self.g_hidden, act_fn=\"ReLU\", optimizer=self.optimizer, init=self.init\n", " )\n", " self.generator[\"FC3\"] = FullyConnected(\n", " self.g_hidden, act_fn=\"ReLU\", optimizer=self.optimizer, init=self.init\n", " )\n", " self.generator[\"FC4\"] = FullyConnected(\n", " self.n_feats,\n", " act_fn=\"Affine(slope=1, intercept=0)\",\n", " optimizer=self.optimizer,\n", " init=self.init,\n", " )\n", "\n", " def _build_critic(self):\n", " \"\"\"\n", " FC1 -> ReLU -> FC2 -> ReLU -> FC3 -> ReLU -> FC4\n", " \"\"\"\n", " self.critic = OrderedDict()\n", " self.critic[\"FC1\"] = FullyConnected(\n", " self.g_hidden, act_fn=\"ReLU\", optimizer=self.optimizer, init=self.init\n", " )\n", " self.critic[\"FC2\"] = FullyConnected(\n", " self.g_hidden, act_fn=\"ReLU\", optimizer=self.optimizer, init=self.init\n", " )\n", " self.critic[\"FC3\"] = FullyConnected(\n", " self.g_hidden, act_fn=\"ReLU\", optimizer=self.optimizer, init=self.init\n", " )\n", " self.critic[\"FC4\"] = FullyConnected(\n", " 1,\n", " act_fn=\"Affine(slope=1, intercept=0)\",\n", " optimizer=self.optimizer,\n", " init=self.init,\n", " )\n", "\n", " @property\n", " def hyperparameters(self):\n", " return {\n", " \"init\": self.init,\n", " \"lambda_\": self.lambda_,\n", " \"g_hidden\": self.g_hidden,\n", " \"n_steps\": self.n_steps,\n", " \"optimizer\": self.optimizer,\n", " \"batchsize\": self.batchsize,\n", " \"c_updates_per_epoch\": self.c_updates_per_epoch,\n", " \"components\": {\n", " \"critic\": {k: v.hyperparameters for k, v in self.critic.items()},\n", " \"generator\": {k: v.hyperparameters for k, v in self.generator.items()},\n", " },\n", " }\n", "\n", " @property\n", " def parameters(self):\n", " return {\n", " \"components\": {\n", " \"critic\": {k: v.parameters for k, v in self.critic.items()},\n", " \"generator\": {k: v.parameters for k, v in self.generator.items()},\n", " }\n", " }\n", "\n", " @property\n", " def derived_variables(self):\n", " C = self.critic.items()\n", " G = self.generator.items()\n", " dv = {\n", " \"components\": {\n", " \"critic\": {k: v.derived_variables for k, v in C},\n", " \"generator\": {k: v.derived_variables for k, v in G},\n", " }\n", " }\n", " dv.update(self._dv)\n", " return dv\n", "\n", " @property\n", " def gradients(self):\n", " grads = {\n", " \"dC_Y_fake\": None,\n", " \"dC_Y_real\": None,\n", " \"dG_Y_fake\": 
None,\n", " \"dC_gradInterp\": None,\n", " \"components\": {\n", " \"critic\": {k: v.gradients for k, v in self.critic.items()},\n", " \"generator\": {k: v.gradients for k, v in self.generator.items()},\n", " },\n", " }\n", " grads.update(self._gr)\n", " return grads\n", "\n", " def forward(self, X, module, retain_derived=True):\n", " \"\"\"\n", " Perform the forward pass for either the generator or the critic.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(batchsize, \\*)`\n", " Input data\n", " module : {'C' or 'G'}\n", " Whether to perform the forward pass for the critic ('C') or for the\n", " generator ('G').\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(batchsize, \\*)`\n", " The output of the final layer of the module.\n", " Xs : dict\n", " A dictionary with layer ids as keys and values corresponding to the\n", " input to each intermediate layer during the forward pass. Useful\n", " during debugging.\n", " \"\"\"\n", " if module == \"G\":\n", " mod = self.generator\n", " elif module == \"C\":\n", " mod = self.critic\n", " else:\n", " raise ValueError(\"Unrecognized module name: {}\".format(module))\n", "\n", " Xs = {}\n", " out, rd = X, retain_derived\n", " for k, v in mod.items():\n", " Xs[k] = out\n", " out = v.forward(out, retain_derived=rd)\n", " return out, Xs\n", "\n", " def backward(self, grad, module, retain_grads=True):\n", " \"\"\"\n", " Perform the backward pass for either the generator or the critic.\n", "\n", " Parameters\n", " ----------\n", " grad : :py:class:`ndarray ` of shape `(batchsize, \\*)` or list of arrays\n", " Gradient of the loss with respect to module output(s).\n", " module : {'C' or 'G'}\n", " Whether to perform the backward pass for the critic ('C') or for the\n", " generator ('G').\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. Default is True.\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(batchsize, \\*)`\n", " The gradient of the loss with respect to the module input.\n", " dXs : dict\n", " A dictionary with layer ids as keys and values corresponding to the\n", " input to each intermediate layer during the backward pass. 
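The `forward` helper above simply threads the input through the module's layers in insertion order, recording each layer's input along the way. A stripped-down sketch of that pattern with stand-in callables (the lambdas are placeholders, not real layers):

```python
from collections import OrderedDict

# stand-in "layers": any callables, applied in insertion order
module = OrderedDict(FC1=lambda x: 2 * x, FC2=lambda x: x + 1)

def chain_forward(mod, X):
    Xs, out = {}, X
    for name, layer in mod.items():
        Xs[name] = out          # record the input seen by each layer, as above
        out = layer(out)
    return out, Xs

out, Xs = chain_forward(module, 3.0)   # out == 7.0, Xs == {"FC1": 3.0, "FC2": 6.0}
```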
Useful\n", " during debugging.\n", " \"\"\"\n", " if module == \"G\":\n", " mod = self.generator\n", " elif module == \"C\":\n", " mod = self.critic\n", " else:\n", " raise ValueError(\"Unrecognized module name: {}\".format(module))\n", "\n", " dXs = {}\n", " out, rg = grad, retain_grads\n", " for k, v in reversed(list(mod.items())):\n", " dXs[k] = out\n", " out = v.backward(out, retain_grads=rg)\n", " return out, dXs\n", "\n", " def _dGradInterp(self, dLdGradInterp, dYi_outs):\n", " \"\"\"\n", " Compute the gradient penalty's contribution to the critic loss and\n", " update the parameter gradients accordingly.\n", "\n", " Parameters\n", " ----------\n", " dLdGradInterp : :py:class:`ndarray ` of shape `(batchsize, critic_in_dim)`\n", " Gradient of `Y_interp` with respect to `X_interp`.\n", " dYi_outs : dict\n", " The intermediate outputs generated during the backward pass when\n", " computing `dLdGradInterp`.\n", " \"\"\"\n", " dy = dLdGradInterp\n", " for k, v in self.critic.items():\n", " X = v.X[-1] # layer input during forward pass\n", " dy, dW, dB = v._bwd2(dy, X, dYi_outs[k][2])\n", " self.critic[k].gradients[\"W\"] += dW\n", " self.critic[k].gradients[\"b\"] += dB\n", "\n", " def update_critic(self, X_real):\n", " \"\"\"\n", " Compute parameter gradients for the critic on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X_real : :py:class:`ndarray ` of shape `(batchsize, n_feats)`\n", " Input data.\n", "\n", " Returns\n", " -------\n", " C_loss : float\n", " The critic loss on the current data.\n", " \"\"\"\n", " self.flush_gradients(\"C\")\n", "\n", " n_ex = X_real.shape[0]\n", " noise = np.random.randn(*X_real.shape)\n", "\n", " # generate and score the real and fake data\n", " X_fake, Xf_outs = self.forward(noise, \"G\")\n", " Y_real, Yr_outs = self.forward(X_real, \"C\")\n", " Y_fake, Yf_outs = self.forward(X_fake, \"C\")\n", "\n", " # sample a random point on the linear interpolation between real and\n", " # fake data and compute its score\n", " alpha = np.random.rand(n_ex, 1)\n", " X_interp = alpha * X_real + (1 - alpha) * X_fake\n", " Y_interp, Yi_outs = self.forward(X_interp, \"C\")\n", "\n", " # compute the gradient of Y_interp wrt. 
X_interp\n", " # Note that we don't save intermediate gradients here since this is not\n", " # the real backward pass\n", " dLdy = [0, 0, np.ones_like(Y_interp)]\n", " (_, _, gradInterp), dYi_outs = self.backward(dLdy, \"C\", retain_grads=False)\n", "\n", " # calculate critic loss and differentiate with respect to each term\n", " C_loss = self.loss(Y_fake, \"C\", Y_real, gradInterp)\n", " dY_real, dY_fake, dGrad_interp = self.loss.grad(Y_fake, \"C\", Y_real, gradInterp)\n", "\n", " # compute `dY_real` and `dY_fake` contributions to critic loss, update\n", " # param gradients accordingly\n", " self.backward([dY_real, dY_fake, 0], \"C\")\n", "\n", " # compute `gradInterp`'s contribution to the critic loss, updating\n", " # param gradients accordingly\n", " self._dGradInterp(dGrad_interp, dYi_outs)\n", "\n", " # cache intermediate vars for the generator update\n", " self._dv[\"alpha\"] = alpha\n", " self._dv[\"Y_fake\"] = Y_fake\n", "\n", " # log additional intermediate values for debugging\n", " if self.debug:\n", " self._dv[\"G_fwd_X_fake\"] = {}\n", " self._dv[\"C_fwd_Y_real\"] = {}\n", " self._dv[\"C_fwd_Y_fake\"] = {}\n", " self._dv[\"C_fwd_Y_interp\"] = {}\n", "\n", " N = len(self.critic.keys())\n", " N2 = len(self.generator.keys())\n", "\n", " for i in range(N2):\n", " self._dv[\"G_fwd_X_fake\"][\"FC\" + str(i)] = Xf_outs[\"FC\" + str(i + 1)]\n", "\n", " for i in range(N):\n", " self._dv[\"C_fwd_Y_real\"][\"FC\" + str(i)] = Yr_outs[\"FC\" + str(i + 1)]\n", " self._dv[\"C_fwd_Y_fake\"][\"FC\" + str(i)] = Yf_outs[\"FC\" + str(i + 1)]\n", " self._dv[\"C_fwd_Y_interp\"][\"FC\" + str(i)] = Yi_outs[\"FC\" + str(i + 1)]\n", "\n", " self._dv[\"C_fwd_Y_real\"][\"FC\" + str(N)] = Y_real\n", " self._dv[\"C_fwd_Y_fake\"][\"FC\" + str(N)] = Y_fake\n", " self._dv[\"G_fwd_X_fake\"][\"FC\" + str(N2)] = X_fake\n", " self._dv[\"C_fwd_Y_interp\"][\"FC\" + str(N)] = Y_interp\n", " self._dv[\"C_dY_interp_wrt\"] = {k: v[2] for k, v in dYi_outs.items()}\n", "\n", " self._dv[\"noise\"] = noise\n", " self._dv[\"X_fake\"] = X_fake\n", " self._dv[\"X_real\"] = X_real\n", " self._dv[\"Y_real\"] = Y_real\n", " self._dv[\"Y_fake\"] = Y_fake\n", " self._dv[\"C_loss\"] = C_loss\n", " self._dv[\"dY_real\"] = dY_real\n", " self._dv[\"dC_Y_fake\"] = dY_fake\n", " self._dv[\"X_interp\"] = X_interp\n", " self._dv[\"Y_interp\"] = Y_interp\n", " self._dv[\"gradInterp\"] = gradInterp\n", " self._dv[\"dGrad_interp\"] = dGrad_interp\n", "\n", " return C_loss\n", "\n", " def update_generator(self, X_shape):\n", " \"\"\"\n", " Compute parameter gradients for the generator on a single minibatch.\n", "\n", " Parameters\n", " ----------\n", " X_shape : tuple of `(batchsize, n_feats)`\n", " Shape for the input batch.\n", "\n", " Returns\n", " -------\n", " G_loss : float\n", " The generator loss on the fake data (generated during the critic\n", " update)\n", " \"\"\"\n", " self.flush_gradients(\"G\")\n", " Y_fake = self.derived_variables[\"Y_fake\"]\n", "\n", " n_ex, _ = Y_fake.shape\n", " G_loss = -Y_fake.mean()\n", " dG_loss = -np.ones_like(Y_fake) / n_ex\n", " self.backward(dG_loss, \"G\")\n", "\n", " if self.debug:\n", " self._dv[\"G_loss\"] = G_loss\n", " self._dv[\"dG_Y_fake\"] = dG_loss\n", "\n", " return G_loss\n", "\n", " def flush_gradients(self, module):\n", " \"\"\"Reset parameter gradients to 0 after an update.\"\"\"\n", " if module == \"G\":\n", " mod = self.generator\n", " elif module == \"C\":\n", " mod = self.critic\n", " else:\n", " raise ValueError(\"Unrecognized module name: {}\".format(module))\n", "\n", " for 
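A minimal, self-contained sketch of the interpolation and gradient-penalty term computed in `update_critic`. The linear critic and the `lambda_` value are assumptions chosen so the input gradient has a closed form; the model above uses a 4-layer MLP and obtains `gradInterp` through its backward pass instead.

```python
import numpy as np

rng = np.random.default_rng(0)
n_ex, n_feats, lambda_ = 8, 5, 10.0      # lambda_ value is an assumption

# toy critic: C(x) = x @ w, chosen only so dC/dx is available analytically
w = rng.normal(size=(n_feats, 1))

def critic(X):
    return X @ w

def critic_input_grad(X):
    return np.tile(w.T, (X.shape[0], 1))  # dC/dx = w for every example

X_real = rng.normal(size=(n_ex, n_feats))
X_fake = rng.normal(size=(n_ex, n_feats))

# random point on the line between each real / fake pair, as in `update_critic`
alpha = rng.random((n_ex, 1))
X_interp = alpha * X_real + (1 - alpha) * X_fake

# critic loss = Wasserstein estimate + gradient penalty
grad_norm = np.linalg.norm(critic_input_grad(X_interp), axis=1)
C_loss = critic(X_fake).mean() - critic(X_real).mean() \
    + lambda_ * ((grad_norm - 1) ** 2).mean()
print(C_loss)
```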
k, v in mod.items():\n", " v.flush_gradients()\n", "\n", " def update(self, module, module_loss=None):\n", " \"\"\"Perform gradient updates and flush gradients upon completion\"\"\"\n", " if module == \"G\":\n", " mod = self.generator\n", " elif module == \"C\":\n", " mod = self.critic\n", " else:\n", " raise ValueError(\"Unrecognized module name: {}\".format(module))\n", "\n", " for k, v in reversed(list(mod.items())):\n", " v.update(module_loss)\n", " self.flush_gradients(module)\n", "\n", " def fit(\n", " self,\n", " X_real,\n", " lambda_,\n", " n_steps=1000,\n", " batchsize=128,\n", " c_updates_per_epoch=5,\n", " verbose=True,\n", " ):\n", " \"\"\"\n", " Fit WGAN_GP on a training dataset.\n", "\n", " Parameters\n", " ----------\n", " X_real : :py:class:`ndarray ` of shape `(n_ex, n_feats)`\n", " Training dataset\n", " lambda_ : float\n", " Gradient penalty coefficient for the critic loss\n", " n_steps : int\n", " The maximum number of generator updates to perform. Default is\n", " 1000.\n", " batchsize : int\n", " Number of examples to use in each training minibatch. Default is\n", " 128.\n", " c_updates_per_epoch : int\n", " The number of critic updates to perform at each generator update.\n", " verbose : bool\n", " Print loss values after each update. If False, only print loss\n", " every 100 steps. Default is True.\n", " \"\"\"\n", " self.lambda_ = lambda_\n", " self.verbose = verbose\n", " self.n_steps = n_steps\n", " self.batchsize = batchsize\n", " self.c_updates_per_epoch = c_updates_per_epoch\n", "\n", " # adjust output of the generator to match the dimensionality of X\n", " if not self.is_initialized:\n", " self.n_feats = X_real.shape[1]\n", " self._init_params()\n", "\n", " # (re-)initialize loss\n", " prev_C, prev_G = np.inf, np.inf\n", " self.loss = WGAN_GPLoss(lambda_=self.lambda_)\n", "\n", " # training loop\n", " NC, NG = self.c_updates_per_epoch, self.n_steps\n", " for i in range(NG):\n", " estart = time()\n", " batch_generator, _ = minibatch(X_real, batchsize, shuffle=False)\n", "\n", " for j, b_ix in zip(range(NC), batch_generator):\n", " bstart = time()\n", " X_batch = X_real[b_ix]\n", " C_loss = self.update_critic(X_batch)\n", "\n", " # for testing, don't perform gradient update so we can inspect each grad\n", " if not self.debug:\n", " self.update(\"C\", C_loss)\n", "\n", " if self.verbose:\n", " fstr = \"\\t[Critic batch {}] Critic loss: {:.3f} {:.3f}\u2206 ({:.1f}s/batch)\"\n", " print(fstr.format(j + 1, C_loss, prev_C - C_loss, time() - bstart))\n", " prev_C = C_loss\n", "\n", " # generator update\n", " G_loss = self.update_generator(X_batch.shape)\n", "\n", " # for testing, don't perform gradient update so we can inspect each grad\n", " if not self.debug:\n", " self.update(\"G\", G_loss)\n", "\n", " if i % 99 == 0:\n", " fstr = \"[Epoch {}] Gen. loss: {:.3f} Critic loss: {:.3f}\"\n", " print(fstr.format(i + 1, G_loss, C_loss))\n", "\n", " elif self.verbose:\n", " fstr = \"[Epoch {}] Gen. 
loss: {:.3f} {:.3f}\u2206 ({:.1f}s/epoch)\"\n", " print(fstr.format(i + 1, G_loss, prev_G - G_loss, time() - estart))\n", " prev_G = G_loss\n"]} {"path": "numpy_ml/neural_nets/models/vae.py", "content": ["from time import time\n", "from collections import OrderedDict\n", "\n", "import numpy as np\n", "\n", "from ..losses import VAELoss\n", "from ..utils import minibatch\n", "from ..activations import ReLU, Affine, Sigmoid\n", "from ..layers import Conv2D, Pool2D, Flatten, FullyConnected\n", "\n", "\n", "class BernoulliVAE(object):\n", " def __init__(\n", " self,\n", " T=5,\n", " latent_dim=256,\n", " enc_conv1_pad=0,\n", " enc_conv2_pad=0,\n", " enc_conv1_out_ch=32,\n", " enc_conv2_out_ch=64,\n", " enc_conv1_stride=1,\n", " enc_pool1_stride=2,\n", " enc_conv2_stride=1,\n", " enc_pool2_stride=1,\n", " enc_conv1_kernel_shape=(5, 5),\n", " enc_pool1_kernel_shape=(2, 2),\n", " enc_conv2_kernel_shape=(5, 5),\n", " enc_pool2_kernel_shape=(2, 2),\n", " optimizer=\"RMSProp(lr=0.0001)\",\n", " init=\"glorot_uniform\",\n", " ):\n", " \"\"\"\n", " A variational autoencoder (VAE) with 2D convolutional encoder and Bernoulli\n", " input and output units.\n", "\n", " Notes\n", " -----\n", " The VAE architecture is\n", "\n", " .. code-block:: text\n", "\n", " |-- t_mean ----|\n", " X -> [Encoder] -| |--> [Sampler] -> [Decoder] -> X_recon\n", " |-- t_log_var -|\n", "\n", " where ``[Encoder]`` is\n", "\n", " .. code-block:: text\n", "\n", " Conv1 -> ReLU -> MaxPool1 -> Conv2 -> ReLU ->\n", " MaxPool2 -> Flatten -> FC1 -> ReLU -> FC2\n", "\n", " ``[Decoder]`` is\n", "\n", " .. code-block:: text\n", "\n", " FC1 -> FC2 -> Sigmoid\n", "\n", " and ``[Sampler]`` draws a sample from the distribution\n", "\n", " .. math::\n", "\n", " \\mathcal{N}(\\\\text{t_mean}, \\exp \\left\\{\\\\text{t_log_var}\\\\right\\} I)\n", "\n", " using the reparameterization trick.\n", "\n", " Parameters\n", " ----------\n", " T : int\n", " The dimension of the variational parameter `t`. Default is 5.\n", " enc_conv1_pad : int\n", " The padding for the first convolutional layer of the encoder. Default is 0.\n", " enc_conv1_stride : int\n", " The stride for the first convolutional layer of the encoder. Default is 1.\n", " enc_conv1_out_ch : int\n", " The number of output channels for the first convolutional layer of\n", " the encoder. Default is 32.\n", " enc_conv1_kernel_shape : tuple\n", " The number of rows and columns in each filter of the first\n", " convolutional layer of the encoder. Default is (5, 5).\n", " enc_pool1_kernel_shape : tuple\n", " The number of rows and columns in the receptive field of the first\n", " max pool layer of the encoder. Default is (2, 3).\n", " enc_pool1_stride : int\n", " The stride for the first MaxPool layer of the encoder. Default is\n", " 2.\n", " enc_conv2_pad : int\n", " The padding for the second convolutional layer of the encoder.\n", " Default is 0.\n", " enc_conv2_out_ch : int\n", " The number of output channels for the second convolutional layer of\n", " the encoder. Default is 64.\n", " enc_conv2_kernel_shape : tuple\n", " The number of rows and columns in each filter of the second\n", " convolutional layer of the encoder. Default is (5, 5).\n", " enc_conv2_stride : int\n", " The stride for the second convolutional layer of the encoder.\n", " Default is 1.\n", " enc_pool2_stride : int\n", " The stride for the second MaxPool layer of the encoder. 
Default is\n", " 1.\n", " enc_pool2_kernel_shape : tuple\n", " The number of rows and columns in the receptive field of the second\n", " max pool layer of the encoder. Default is (2, 3).\n", " latent_dim : int\n", " The dimension of the output for the first FC layer of the encoder.\n", " Default is 256.\n", " optimizer : str or :doc:`Optimizer ` object or None\n", " The optimization strategy to use when performing gradient updates.\n", " If None, use the :class:`~numpy_ml.neural_nets.optimizers.SGD`\n", " optimizer with default parameters. Default is \"RMSProp(lr=0.0001)\".\n", " init : str\n", " The weight initialization strategy. Valid entries are\n", " {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',\n", " 'std_normal', 'trunc_normal'}. Default is 'glorot_uniform'.\n", " \"\"\"\n", " self.T = T\n", " self.init = init\n", " self.loss = VAELoss()\n", " self.optimizer = optimizer\n", " self.latent_dim = latent_dim\n", " self.enc_conv1_pad = enc_conv1_pad\n", " self.enc_conv2_pad = enc_conv2_pad\n", " self.enc_conv1_stride = enc_conv1_stride\n", " self.enc_conv1_out_ch = enc_conv1_out_ch\n", " self.enc_pool1_stride = enc_pool1_stride\n", " self.enc_conv2_out_ch = enc_conv2_out_ch\n", " self.enc_conv2_stride = enc_conv2_stride\n", " self.enc_pool2_stride = enc_pool2_stride\n", " self.enc_conv2_kernel_shape = enc_conv2_kernel_shape\n", " self.enc_pool2_kernel_shape = enc_pool2_kernel_shape\n", " self.enc_conv1_kernel_shape = enc_conv1_kernel_shape\n", " self.enc_pool1_kernel_shape = enc_pool1_kernel_shape\n", "\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self._dv = {}\n", " self._build_encoder()\n", " self._build_decoder()\n", "\n", " def _build_encoder(self):\n", " \"\"\"\n", " CNN encoder\n", "\n", " Conv1 -> ReLU -> MaxPool1 -> Conv2 -> ReLU -> MaxPool2 ->\n", " Flatten -> FC1 -> ReLU -> FC2\n", " \"\"\"\n", " self.encoder = OrderedDict()\n", " self.encoder[\"Conv1\"] = Conv2D(\n", " act_fn=ReLU(),\n", " init=self.init,\n", " pad=self.enc_conv1_pad,\n", " optimizer=self.optimizer,\n", " out_ch=self.enc_conv1_out_ch,\n", " stride=self.enc_conv1_stride,\n", " kernel_shape=self.enc_conv1_kernel_shape,\n", " )\n", " self.encoder[\"Pool1\"] = Pool2D(\n", " mode=\"max\",\n", " optimizer=self.optimizer,\n", " stride=self.enc_pool1_stride,\n", " kernel_shape=self.enc_pool1_kernel_shape,\n", " )\n", " self.encoder[\"Conv2\"] = Conv2D(\n", " act_fn=ReLU(),\n", " init=self.init,\n", " pad=self.enc_conv2_pad,\n", " optimizer=self.optimizer,\n", " out_ch=self.enc_conv2_out_ch,\n", " stride=self.enc_conv2_stride,\n", " kernel_shape=self.enc_conv2_kernel_shape,\n", " )\n", " self.encoder[\"Pool2\"] = Pool2D(\n", " mode=\"max\",\n", " optimizer=self.optimizer,\n", " stride=self.enc_pool2_stride,\n", " kernel_shape=self.enc_pool2_kernel_shape,\n", " )\n", " self.encoder[\"Flatten3\"] = Flatten(optimizer=self.optimizer)\n", " self.encoder[\"FC4\"] = FullyConnected(\n", " n_out=self.latent_dim, act_fn=ReLU(), optimizer=self.optimizer\n", " )\n", " self.encoder[\"FC5\"] = FullyConnected(\n", " n_out=self.T * 2,\n", " optimizer=self.optimizer,\n", " act_fn=Affine(slope=1, intercept=0),\n", " init=self.init,\n", " )\n", "\n", " def _build_decoder(self):\n", " \"\"\"\n", " MLP decoder\n", "\n", " FC1 -> ReLU -> FC2 -> Sigmoid\n", " \"\"\"\n", " self.decoder = OrderedDict()\n", " self.decoder[\"FC1\"] = FullyConnected(\n", " act_fn=ReLU(),\n", " init=self.init,\n", " n_out=self.latent_dim,\n", " optimizer=self.optimizer,\n", " )\n", " # NB. 
`n_out` is dependent on the dimensionality of X. we use a\n", " # placeholder for now, and update it within the `forward` method\n", " self.decoder[\"FC2\"] = FullyConnected(\n", " n_out=None, act_fn=Sigmoid(), optimizer=self.optimizer, init=self.init\n", " )\n", "\n", " @property\n", " def parameters(self):\n", " return {\n", " \"components\": {\n", " \"encoder\": {k: v.parameters for k, v in self.encoder.items()},\n", " \"decoder\": {k: v.parameters for k, v in self.decoder.items()},\n", " }\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " return {\n", " \"layer\": \"BernoulliVAE\",\n", " \"T\": self.T,\n", " \"init\": self.init,\n", " \"loss\": str(self.loss),\n", " \"optimizer\": self.optimizer,\n", " \"latent_dim\": self.latent_dim,\n", " \"enc_conv1_pad\": self.enc_conv1_pad,\n", " \"enc_conv2_pad\": self.enc_conv2_pad,\n", " \"enc_conv1_in_ch\": self.enc_conv1_in_ch,\n", " \"enc_conv1_stride\": self.enc_conv1_stride,\n", " \"enc_conv1_out_ch\": self.enc_conv1_out_ch,\n", " \"enc_pool1_stride\": self.enc_pool1_stride,\n", " \"enc_conv2_out_ch\": self.enc_conv2_out_ch,\n", " \"enc_conv2_stride\": self.enc_conv2_stride,\n", " \"enc_pool2_stride\": self.enc_pool2_stride,\n", " \"enc_conv2_kernel_shape\": self.enc_conv2_kernel_shape,\n", " \"enc_pool2_kernel_shape\": self.enc_pool2_kernel_shape,\n", " \"enc_conv1_kernel_shape\": self.enc_conv1_kernel_shape,\n", " \"enc_pool1_kernel_shape\": self.enc_pool1_kernel_shape,\n", " \"encoder_ids\": list(self.encoder.keys()),\n", " \"decoder_ids\": list(self.decoder.keys()),\n", " \"components\": {\n", " \"encoder\": {k: v.hyperparameters for k, v in self.encoder.items()},\n", " \"decoder\": {k: v.hyperparameters for k, v in self.decoder.items()},\n", " },\n", " }\n", "\n", " @property\n", " def derived_variables(self):\n", " dv = {\n", " \"noise\": None,\n", " \"t_mean\": None,\n", " \"t_log_var\": None,\n", " \"dDecoder_FC1_in\": None,\n", " \"dDecoder_t_mean\": None,\n", " \"dEncoder_FC5_out\": None,\n", " \"dDecoder_FC1_out\": None,\n", " \"dEncoder_FC4_out\": None,\n", " \"dEncoder_Pool2_out\": None,\n", " \"dEncoder_Conv2_out\": None,\n", " \"dEncoder_Pool1_out\": None,\n", " \"dEncoder_Conv1_out\": None,\n", " \"dDecoder_t_log_var\": None,\n", " \"dEncoder_Flatten3_out\": None,\n", " \"components\": {\n", " \"encoder\": {k: v.derived_variables for k, v in self.encoder.items()},\n", " \"decoder\": {k: v.derived_variables for k, v in self.decoder.items()},\n", " },\n", " }\n", " dv.update(self._dv)\n", " return dv\n", "\n", " @property\n", " def gradients(self):\n", " return {\n", " \"components\": {\n", " \"encoder\": {k: v.gradients for k, v in self.encoder.items()},\n", " \"decoder\": {k: v.gradients for k, v in self.decoder.items()},\n", " }\n", " }\n", "\n", " def _sample(self, t_mean, t_log_var):\n", " \"\"\"\n", " Returns a sample from the distribution\n", "\n", " q(t | x) = N(t_mean, diag(exp(t_log_var)))\n", "\n", " using the reparameterization trick.\n", "\n", " Parameters\n", " ----------\n", " t_mean : :py:class:`ndarray ` of shape `(n_ex, latent_dim)`\n", " Mean of the desired distribution.\n", " t_log_var : :py:class:`ndarray ` of shape `(n_ex, latent_dim)`\n", " Log variance vector of the desired distribution.\n", "\n", " Returns\n", " -------\n", " samples: :py:class:`ndarray ` of shape `(n_ex, latent_dim)`\n", " \"\"\"\n", " noise = np.random.normal(loc=0.0, scale=1.0, size=t_mean.shape)\n", " samples = noise * np.exp(t_log_var) + t_mean\n", " # save sampled noise for backward pass\n", " 
self._dv[\"noise\"] = noise\n", " return samples\n", "\n", " def forward(self, X_train):\n", " \"\"\"VAE forward pass\"\"\"\n", " if self.decoder[\"FC2\"].n_out is None:\n", " fc2 = self.decoder[\"FC2\"]\n", " self.decoder[\"FC2\"] = fc2.set_params({\"n_out\": self.N})\n", "\n", " # assume each image is represented as a flattened row vector,\n", " n_ex, in_rows, N, in_ch = X_train.shape\n", "\n", " # encode the training batch to estimate the mean and variance of the\n", " # variational distribution\n", " out = X_train\n", " for k, v in self.encoder.items():\n", " out = v.forward(out)\n", "\n", " # extract the mean and log variance of the variational distribution\n", " # q(t | x) from the encoder output\n", " t_mean = out[:, : self.T]\n", " t_log_var = out[:, self.T :]\n", "\n", " # sample t from q(t | x) using reparamterization trick\n", " t = self._sample(t_mean, t_log_var)\n", "\n", " # pass the sampled latent value, t, through the decoder\n", " # to generate the average reconstruction\n", " X_recon = t\n", " for k, v in self.decoder.items():\n", " X_recon = v.forward(X_recon)\n", "\n", " self._dv[\"t_mean\"] = t_mean\n", " self._dv[\"t_log_var\"] = t_log_var\n", " return X_recon\n", "\n", " def backward(self, X_train, X_recon):\n", " \"\"\"VAE backward pass\"\"\"\n", " n_ex = X_train.shape[0]\n", " D, E = self.decoder, self.encoder\n", " noise = self.derived_variables[\"noise\"]\n", " t_mean = self.derived_variables[\"t_mean\"]\n", " t_log_var = self.derived_variables[\"t_log_var\"]\n", "\n", " # compute gradients through the VAE loss\n", " dY_pred, dLogVar, dMean = self.loss.grad(\n", " X_train.reshape(n_ex, -1), X_recon, t_mean, t_log_var\n", " )\n", "\n", " # backprop through the decoder\n", " dDecoder_FC1_out = D[\"FC2\"].backward(dY_pred)\n", " dDecoder_FC1_in = D[\"FC1\"].backward(dDecoder_FC1_out)\n", "\n", " # backprop through the sampler\n", " dDecoder_t_log_var = dDecoder_FC1_in * (noise * np.exp(t_log_var))\n", " dDecoder_t_mean = dDecoder_FC1_in\n", "\n", " # backprop through the encoder\n", " dEncoder_FC5_out = np.hstack(\n", " [dDecoder_t_mean + dMean, dDecoder_t_log_var + dLogVar]\n", " )\n", " dEncoder_FC4_out = E[\"FC5\"].backward(dEncoder_FC5_out)\n", " dEncoder_Flatten3_out = E[\"FC4\"].backward(dEncoder_FC4_out)\n", " dEncoder_Pool2_out = E[\"Flatten3\"].backward(dEncoder_Flatten3_out)\n", " dEncoder_Conv2_out = E[\"Pool2\"].backward(dEncoder_Pool2_out)\n", " dEncoder_Pool1_out = E[\"Conv2\"].backward(dEncoder_Conv2_out)\n", " dEncoder_Conv1_out = E[\"Pool1\"].backward(dEncoder_Pool1_out)\n", " dX = E[\"Conv1\"].backward(dEncoder_Conv1_out)\n", "\n", " self._dv[\"dDecoder_t_mean\"] = dDecoder_t_mean\n", " self._dv[\"dDecoder_FC1_in\"] = dDecoder_FC1_in\n", " self._dv[\"dDecoder_FC1_out\"] = dDecoder_FC1_out\n", " self._dv[\"dEncoder_FC5_out\"] = dEncoder_FC5_out\n", " self._dv[\"dEncoder_FC4_out\"] = dEncoder_FC4_out\n", " self._dv[\"dDecoder_t_log_var\"] = dDecoder_t_log_var\n", " self._dv[\"dEncoder_Pool2_out\"] = dEncoder_Pool2_out\n", " self._dv[\"dEncoder_Conv2_out\"] = dEncoder_Conv2_out\n", " self._dv[\"dEncoder_Pool1_out\"] = dEncoder_Pool1_out\n", " self._dv[\"dEncoder_Conv1_out\"] = dEncoder_Conv1_out\n", " self._dv[\"dEncoder_Flatten3_out\"] = dEncoder_Flatten3_out\n", " return dX\n", "\n", " def update(self, cur_loss=None):\n", " \"\"\"Perform gradient updates\"\"\"\n", " for k, v in reversed(list(self.decoder.items())):\n", " v.update(cur_loss)\n", " for k, v in reversed(list(self.encoder.items())):\n", " v.update(cur_loss)\n", " 
self.flush_gradients()\n", "\n", " def flush_gradients(self):\n", " \"\"\"Reset parameter gradients after update\"\"\"\n", " for k, v in self.decoder.items():\n", " v.flush_gradients()\n", " for k, v in self.encoder.items():\n", " v.flush_gradients()\n", "\n", " def fit(self, X_train, n_epochs=20, batchsize=128, verbose=True):\n", " \"\"\"\n", " Fit the VAE to a training dataset.\n", "\n", " Parameters\n", " ----------\n", " X_train : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The input volume\n", " n_epochs : int\n", " The maximum number of training epochs to run. Default is 20.\n", " batchsize : int\n", " The desired number of examples in each training batch. Default is 128.\n", " verbose : bool\n", " Print batch information during training. Default is True.\n", " \"\"\"\n", " self.verbose = verbose\n", " self.n_epochs = n_epochs\n", " self.batchsize = batchsize\n", "\n", " _, self.in_rows, self.in_cols, self.in_ch = X_train.shape\n", " self.N = self.in_rows * self.in_cols * self.in_ch\n", "\n", " prev_loss = np.inf\n", " for i in range(n_epochs):\n", " loss, estart = 0.0, time()\n", " batch_generator, nb = minibatch(X_train, batchsize, shuffle=True)\n", "\n", " # TODO: parallelize inner loop\n", " for j, b_ix in enumerate(batch_generator):\n", " bsize, bstart = len(b_ix), time()\n", "\n", " X_batch = X_train[b_ix]\n", " X_batch_col = X_train[b_ix].reshape(bsize, -1)\n", "\n", " X_recon = self.forward(X_batch)\n", " t_mean = self.derived_variables[\"t_mean\"]\n", " t_log_var = self.derived_variables[\"t_log_var\"]\n", "\n", " self.backward(X_batch, X_recon)\n", " batch_loss = self.loss(X_batch_col, X_recon, t_mean, t_log_var)\n", " loss += batch_loss\n", "\n", " self.update(batch_loss)\n", "\n", " if self.verbose:\n", " fstr = \"\\t[Batch {}/{}] Train loss: {:.3f} ({:.1f}s/batch)\"\n", " print(fstr.format(j + 1, nb, batch_loss, time() - bstart))\n", "\n", " loss /= nb\n", " fstr = \"[Epoch {}] Avg. 
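An untested, minimal usage sketch of the `fit` API above. The random binary "images" and the hyperparameter values are placeholders chosen only to exercise the training loop, not a recommended configuration.

```python
import numpy as np
from numpy_ml.neural_nets.models.vae import BernoulliVAE

# toy binary "images": 16 examples of shape (28, 28, 1) with values in {0, 1}
X = (np.random.rand(16, 28, 28, 1) > 0.5).astype(float)

vae = BernoulliVAE(T=5, latent_dim=64)
vae.fit(X, n_epochs=1, batchsize=8, verbose=False)
```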
loss: {:.3f} Delta: {:.3f} ({:.2f}m/epoch)\"\n", " print(fstr.format(i + 1, loss, prev_loss - loss, (time() - estart) / 60.0))\n", " prev_loss = loss\n"]} {"path": "numpy_ml/neural_nets/optimizers/__init__.py", "content": ["from .optimizers import *\n"]} {"path": "numpy_ml/neural_nets/optimizers/optimizers.py", "content": ["from copy import deepcopy\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "from numpy.linalg import norm\n", "\n", "\n", "class OptimizerBase(ABC):\n", " def __init__(self, lr, scheduler=None):\n", " \"\"\"\n", " An abstract base class for all Optimizer objects.\n", "\n", " This should never be used directly.\n", " \"\"\"\n", " from ..initializers import SchedulerInitializer\n", "\n", " self.cache = {}\n", " self.cur_step = 0\n", " self.hyperparameters = {}\n", " self.lr_scheduler = SchedulerInitializer(scheduler, lr=lr)()\n", "\n", " def __call__(self, param, param_grad, param_name, cur_loss=None):\n", " return self.update(param, param_grad, param_name, cur_loss)\n", "\n", " def step(self):\n", " \"\"\"Increment the optimizer step counter by 1\"\"\"\n", " self.cur_step += 1\n", "\n", " def reset_step(self):\n", " \"\"\"Reset the step counter to zero\"\"\"\n", " self.cur_step = 0\n", "\n", " def copy(self):\n", " \"\"\"Return a copy of the optimizer object\"\"\"\n", " return deepcopy(self)\n", "\n", " def set_params(self, hparam_dict=None, cache_dict=None):\n", " \"\"\"Set the parameters of the optimizer object from a dictionary\"\"\"\n", " from ..initializers import SchedulerInitializer\n", "\n", " if hparam_dict is not None:\n", " for k, v in hparam_dict.items():\n", " if k in self.hyperparameters:\n", " self.hyperparameters[k] = v\n", " if k == \"lr_scheduler\":\n", " self.lr_scheduler = SchedulerInitializer(v, lr=None)()\n", "\n", " if cache_dict is not None:\n", " for k, v in cache_dict.items():\n", " if k in self.cache:\n", " self.cache[k] = v\n", "\n", " @abstractmethod\n", " def update(self, param, param_grad, param_name, cur_loss=None):\n", " raise NotImplementedError\n", "\n", "\n", "class SGD(OptimizerBase):\n", " def __init__(\n", " self, lr=0.01, momentum=0.0, clip_norm=None, lr_scheduler=None, **kwargs\n", " ):\n", " \"\"\"\n", " A stochastic gradient descent optimizer.\n", "\n", " Notes\n", " -----\n", " For model parameters :math:`\\\\theta`, averaged parameter gradients\n", " :math:`\\\\nabla_{\\\\theta} \\mathcal{L}`, and learning rate :math:`\\eta`,\n", " the SGD update at timestep `t` is\n", "\n", " .. math::\n", "\n", " \\\\text{update}^{(t)}\n", " &= \\\\text{momentum} \\cdot \\\\text{update}^{(t-1)} + \\eta^{(t)} \\\\nabla_{\\\\theta} \\mathcal{L}\\\\\\\\\n", " \\\\theta^{(t+1)}\n", " &\\leftarrow \\\\theta^{(t)} - \\\\text{update}^{(t)}\n", "\n", " Parameters\n", " ----------\n", " lr : float\n", " Learning rate for SGD. If scheduler is not None, this is used as\n", " the starting learning rate. Default is 0.01.\n", " momentum : float in range [0, 1]\n", " The fraction of the previous update to add to the current update.\n", " If 0, no momentum is applied. Default is 0.\n", " clip_norm : float\n", " If not None, all param gradients are scaled to have maximum l2 norm of\n", " `clip_norm` before computing update. Default is None.\n", " lr_scheduler : str, :doc:`Scheduler ` object, or None\n", " The learning rate scheduler. If None, use a constant learning\n", " rate equal to `lr`. 
Default is None.\n", " \"\"\"\n", " super().__init__(lr, lr_scheduler)\n", "\n", " self.hyperparameters = {\n", " \"id\": \"SGD\",\n", " \"lr\": lr,\n", " \"momentum\": momentum,\n", " \"clip_norm\": clip_norm,\n", " \"lr_scheduler\": str(self.lr_scheduler),\n", " }\n", "\n", " def __str__(self):\n", " H = self.hyperparameters\n", " lr, mm, cn, sc = H[\"lr\"], H[\"momentum\"], H[\"clip_norm\"], H[\"lr_scheduler\"]\n", " return \"SGD(lr={}, momentum={}, clip_norm={}, lr_scheduler={})\".format(\n", " lr, mm, cn, sc\n", " )\n", "\n", " def update(self, param, param_grad, param_name, cur_loss=None):\n", " \"\"\"\n", " Compute the SGD update for a given parameter\n", "\n", " Parameters\n", " ----------\n", " param : :py:class:`ndarray ` of shape (n, m)\n", " The value of the parameter to be updated.\n", " param_grad : :py:class:`ndarray ` of shape (n, m)\n", " The gradient of the loss function with respect to `param_name`.\n", " param_name : str\n", " The name of the parameter.\n", " cur_loss : float\n", " The training or validation loss for the current minibatch. Used for\n", " learning rate scheduling e.g., by\n", " :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " updated_params : :py:class:`ndarray ` of shape (n, m)\n", " The value of `param` after applying the momentum update.\n", " \"\"\"\n", " C = self.cache\n", " H = self.hyperparameters\n", " momentum, clip_norm = H[\"momentum\"], H[\"clip_norm\"]\n", " lr = self.lr_scheduler(self.cur_step, cur_loss)\n", "\n", " if param_name not in C:\n", " C[param_name] = np.zeros_like(param_grad)\n", "\n", " # scale gradient to avoid explosion\n", " t = np.inf if clip_norm is None else clip_norm\n", " if norm(param_grad) > t:\n", " param_grad = param_grad * t / norm(param_grad)\n", "\n", " update = momentum * C[param_name] + lr * param_grad\n", " self.cache[param_name] = update\n", " return param - update\n", "\n", "\n", "#######################################################################\n", "# Adaptive Gradient Methods #\n", "#######################################################################\n", "\n", "\n", "class AdaGrad(OptimizerBase):\n", " def __init__(self, lr=0.01, eps=1e-7, clip_norm=None, lr_scheduler=None, **kwargs):\n", " \"\"\"\n", " An AdaGrad optimizer.\n", "\n", " Notes\n", " -----\n", " Weights that receive large gradients will have their effective learning\n", " rate reduced, while weights that receive small or infrequent updates\n", " will have their effective learning rate increased.\n", "\n", " Equations::\n", "\n", " cache[t] = cache[t-1] + grad[t] ** 2\n", " update[t] = lr * grad[t] / (np.sqrt(cache[t]) + eps)\n", " param[t+1] = param[t] - update[t]\n", "\n", " Note that the ``**`` and `/` operations are elementwise\n", "\n", " \"A downside of Adagrad ... is that the monotonic learning rate usually\n", " proves too aggressive and stops learning too early.\" [1]\n", "\n", " References\n", " ----------\n", " .. [1] Karpathy, A. \"CS231n: Convolutional neural networks for visual\n", " recognition\" https://cs231n.github.io/neural-networks-3/\n", "\n", " Parameters\n", " ----------\n", " lr : float\n", " Global learning rate\n", " eps : float\n", " Smoothing term to avoid divide-by-zero errors in the update calc.\n", " Default is 1e-7.\n", " clip_norm : float or None\n", " If not None, all param gradients are scaled to have maximum `L2` norm of\n", " `clip_norm` before computing update. 
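A short usage sketch of the `SGD` optimizer defined above: each parameter tensor is updated by name, and repeated calls accumulate momentum in the optimizer's cache. The parameter and gradient values are toy placeholders.

```python
import numpy as np
from numpy_ml.neural_nets.optimizers import SGD

opt = SGD(lr=0.1, momentum=0.9)

W = np.ones((2, 2))            # a single parameter tensor
dW = 0.5 * np.ones((2, 2))     # a toy, constant gradient

# each call computes momentum * previous_update + lr * grad and returns the
# updated parameter; the running update is cached under the name "W"
for _ in range(3):
    W = opt(W, dW, "W")
print(W)
```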
Default is None.\n", " lr_scheduler : str or :doc:`Scheduler ` object or None\n", " The learning rate scheduler. If None, use a constant learning\n", " rate equal to `lr`. Default is None.\n", " \"\"\"\n", " super().__init__(lr, lr_scheduler)\n", "\n", " self.cache = {}\n", " self.hyperparameters = {\n", " \"id\": \"AdaGrad\",\n", " \"lr\": lr,\n", " \"eps\": eps,\n", " \"clip_norm\": clip_norm,\n", " \"lr_scheduler\": str(self.lr_scheduler),\n", " }\n", "\n", " def __str__(self):\n", " H = self.hyperparameters\n", " lr, eps, cn, sc = H[\"lr\"], H[\"eps\"], H[\"clip_norm\"], H[\"lr_scheduler\"]\n", " return \"AdaGrad(lr={}, eps={}, clip_norm={}, lr_scheduler={})\".format(\n", " lr, eps, cn, sc\n", " )\n", "\n", " def update(self, param, param_grad, param_name, cur_loss=None):\n", " \"\"\"\n", " Compute the AdaGrad update for a given parameter.\n", "\n", " Notes\n", " -----\n", " Adjusts the learning rate of each weight based on the magnitudes of its\n", " gradients (big gradient -> small lr, small gradient -> big lr).\n", "\n", " Parameters\n", " ----------\n", " param : :py:class:`ndarray ` of shape (n, m)\n", " The value of the parameter to be updated\n", " param_grad : :py:class:`ndarray ` of shape (n, m)\n", " The gradient of the loss function with respect to `param_name`\n", " param_name : str\n", " The name of the parameter\n", " cur_loss : float or None\n", " The training or validation loss for the current minibatch. Used for\n", " learning rate scheduling e.g., by\n", " :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " updated_params : :py:class:`ndarray ` of shape (n, m)\n", " The value of `param` after applying the AdaGrad update\n", " \"\"\"\n", " C = self.cache\n", " H = self.hyperparameters\n", " eps, clip_norm = H[\"eps\"], H[\"clip_norm\"]\n", " lr = self.lr_scheduler(self.cur_step, cur_loss)\n", "\n", " if param_name not in C:\n", " C[param_name] = np.zeros_like(param_grad)\n", "\n", " # scale gradient to avoid explosion\n", " t = np.inf if clip_norm is None else clip_norm\n", " if norm(param_grad) > t:\n", " param_grad = param_grad * t / norm(param_grad)\n", "\n", " C[param_name] += param_grad ** 2\n", " update = lr * param_grad / (np.sqrt(C[param_name]) + eps)\n", " self.cache = C\n", " return param - update\n", "\n", "\n", "class RMSProp(OptimizerBase):\n", " def __init__(\n", " self, lr=0.001, decay=0.9, eps=1e-7, clip_norm=None, lr_scheduler=None, **kwargs\n", " ):\n", " \"\"\"\n", " RMSProp optimizer.\n", "\n", " Notes\n", " -----\n", " RMSProp was proposed as a refinement of :class:`AdaGrad` to reduce its\n", " aggressive, monotonically decreasing learning rate.\n", "\n", " RMSProp uses a *decaying average* of the previous squared gradients\n", " (second moment) rather than just the immediately preceding squared\n", " gradient for its `previous_update` value.\n", "\n", " Equations::\n", "\n", " cache[t] = decay * cache[t-1] + (1 - decay) * grad[t] ** 2\n", " update[t] = lr * grad[t] / (np.sqrt(cache[t]) + eps)\n", " param[t+1] = param[t] - update[t]\n", "\n", " Note that the ``**`` and ``/`` operations are elementwise.\n", "\n", " Parameters\n", " ----------\n", " lr : float\n", " Learning rate for update. Default is 0.001.\n", " decay : float in [0, 1]\n", " Rate of decay for the moving average. Typical values are [0.9,\n", " 0.99, 0.999]. Default is 0.9.\n", " eps : float\n", " Constant term to avoid divide-by-zero errors during the update calc. 
Default is 1e-7.\n", " clip_norm : float or None\n", " If not None, all param gradients are scaled to have maximum l2 norm of\n", " `clip_norm` before computing update. Default is None.\n", " lr_scheduler : str or :doc:`Scheduler ` object or None\n", " The learning rate scheduler. If None, use a constant learning\n", " rate equal to `lr`. Default is None.\n", " \"\"\"\n", " super().__init__(lr, lr_scheduler)\n", "\n", " self.cache = {}\n", " self.hyperparameters = {\n", " \"id\": \"RMSProp\",\n", " \"lr\": lr,\n", " \"eps\": eps,\n", " \"decay\": decay,\n", " \"clip_norm\": clip_norm,\n", " \"lr_scheduler\": str(self.lr_scheduler),\n", " }\n", "\n", " def __str__(self):\n", " H = self.hyperparameters\n", " sc = H[\"lr_scheduler\"]\n", " lr, eps, dc, cn = H[\"lr\"], H[\"eps\"], H[\"decay\"], H[\"clip_norm\"]\n", " return \"RMSProp(lr={}, eps={}, decay={}, clip_norm={}, lr_scheduler={})\".format(\n", " lr, eps, dc, cn, sc\n", " )\n", "\n", " def update(self, param, param_grad, param_name, cur_loss=None):\n", " \"\"\"\n", " Compute the RMSProp update for a given parameter.\n", "\n", " Parameters\n", " ----------\n", " param : :py:class:`ndarray ` of shape (n, m)\n", " The value of the parameter to be updated\n", " param_grad : :py:class:`ndarray ` of shape (n, m)\n", " The gradient of the loss function with respect to `param_name`\n", " param_name : str\n", " The name of the parameter\n", " cur_loss : float or None\n", " The training or validation loss for the current minibatch. Used for\n", " learning rate scheduling e.g., by\n", " :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " updated_params : :py:class:`ndarray ` of shape (n, m)\n", " The value of `param` after applying the RMSProp update.\n", " \"\"\"\n", " C = self.cache\n", " H = self.hyperparameters\n", " eps, decay, clip_norm = H[\"eps\"], H[\"decay\"], H[\"clip_norm\"]\n", " lr = self.lr_scheduler(self.cur_step, cur_loss)\n", "\n", " if param_name not in C:\n", " C[param_name] = np.zeros_like(param_grad)\n", "\n", " # scale gradient to avoid explosion\n", " t = np.inf if clip_norm is None else clip_norm\n", " if norm(param_grad) > t:\n", " param_grad = param_grad * t / norm(param_grad)\n", "\n", " C[param_name] = decay * C[param_name] + (1 - decay) * param_grad ** 2\n", " update = lr * param_grad / (np.sqrt(C[param_name]) + eps)\n", " self.cache = C\n", " return param - update\n", "\n", "\n", "class Adam(OptimizerBase):\n", " def __init__(\n", " self,\n", " lr=0.001,\n", " decay1=0.9,\n", " decay2=0.999,\n", " eps=1e-7,\n", " clip_norm=None,\n", " lr_scheduler=None,\n", " **kwargs\n", " ):\n", " \"\"\"\n", " Adam (adaptive moment estimation) optimization algorithm.\n", "\n", " Notes\n", " -----\n", " Designed to combine the advantages of :class:`AdaGrad`, which works\n", " well with sparse gradients, and :class:`RMSProp`, which works well in\n", " online and non-stationary settings.\n", "\n", " Parameters\n", " ----------\n", " lr : float\n", " Learning rate for update. This parameter is ignored if using\n", " :class:`~numpy_ml.neural_nets.schedulers.NoamScheduler`.\n", " Default is 0.001.\n", " decay1 : float\n", " The rate of decay to use for in running estimate of the first\n", " moment (mean) of the gradient. Default is 0.9.\n", " decay2 : float\n", " The rate of decay to use for in running estimate of the second\n", " moment (variance) of the gradient. 
Default is 0.999.\n", " eps : float\n", " Constant term to avoid divide-by-zero errors during the update\n", " calc. Default is 1e-7.\n", " clip_norm : float\n", " If not None, all param gradients are scaled to have maximum l2 norm of\n", " `clip_norm` before computing update. Default is None.\n", " lr_scheduler : str, or :doc:`Scheduler ` object, or None\n", " The learning rate scheduler. If None, use a constant learning rate\n", " equal to `lr`. Default is None.\n", " \"\"\"\n", " super().__init__(lr, lr_scheduler)\n", "\n", " self.cache = {}\n", " self.hyperparameters = {\n", " \"id\": \"Adam\",\n", " \"lr\": lr,\n", " \"eps\": eps,\n", " \"decay1\": decay1,\n", " \"decay2\": decay2,\n", " \"clip_norm\": clip_norm,\n", " \"lr_scheduler\": str(self.lr_scheduler),\n", " }\n", "\n", " def __str__(self):\n", " H = self.hyperparameters\n", " lr, d1, d2 = H[\"lr\"], H[\"decay1\"], H[\"decay2\"]\n", " eps, cn, sc = H[\"eps\"], H[\"clip_norm\"], H[\"lr_scheduler\"]\n", " return \"Adam(lr={}, decay1={}, decay2={}, eps={}, clip_norm={}, lr_scheduler={})\".format(\n", " lr, d1, d2, eps, cn, sc\n", " )\n", "\n", " def update(self, param, param_grad, param_name, cur_loss=None):\n", " \"\"\"\n", " Compute the Adam update for a given parameter.\n", "\n", " Parameters\n", " ----------\n", " param : :py:class:`ndarray ` of shape (n, m)\n", " The value of the parameter to be updated.\n", " param_grad : :py:class:`ndarray ` of shape (n, m)\n", " The gradient of the loss function with respect to `param_name`.\n", " param_name : str\n", " The name of the parameter.\n", " cur_loss : float\n", " The training or validation loss for the current minibatch. Used for\n", " learning rate scheduling e.g., by\n", " :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is\n", " None.\n", "\n", " Returns\n", " -------\n", " updated_params : :py:class:`ndarray ` of shape (n, m)\n", " The value of `param` after applying the Adam update.\n", " \"\"\"\n", " C = self.cache\n", " H = self.hyperparameters\n", " d1, d2 = H[\"decay1\"], H[\"decay2\"]\n", " eps, clip_norm = H[\"eps\"], H[\"clip_norm\"]\n", " lr = self.lr_scheduler(self.cur_step, cur_loss)\n", "\n", " if param_name not in C:\n", " C[param_name] = {\n", " \"t\": 0,\n", " \"mean\": np.zeros_like(param_grad),\n", " \"var\": np.zeros_like(param_grad),\n", " }\n", "\n", " # scale gradient to avoid explosion\n", " t = np.inf if clip_norm is None else clip_norm\n", " if norm(param_grad) > t:\n", " param_grad = param_grad * t / norm(param_grad)\n", "\n", " t = C[param_name][\"t\"] + 1\n", " var = C[param_name][\"var\"]\n", " mean = C[param_name][\"mean\"]\n", "\n", " # update cache\n", " C[param_name][\"t\"] = t\n", " C[param_name][\"var\"] = d2 * var + (1 - d2) * param_grad ** 2\n", " C[param_name][\"mean\"] = d1 * mean + (1 - d1) * param_grad\n", " self.cache = C\n", "\n", " # calc unbiased moment estimates and Adam update\n", " v_hat = C[param_name][\"var\"] / (1 - d2 ** t)\n", " m_hat = C[param_name][\"mean\"] / (1 - d1 ** t)\n", " update = lr * m_hat / (np.sqrt(v_hat) + eps)\n", " return param - update\n"]} {"path": "numpy_ml/neural_nets/modules/__init__.py", "content": ["from .modules import *\n"]} {"path": "numpy_ml/neural_nets/modules/modules.py", "content": ["from abc import ABC, abstractmethod\n", "\n", "import re\n", "import numpy as np\n", "\n", "from ..wrappers import Dropout\n", "from ..utils import calc_pad_dims_2D\n", "from ..activations import Tanh, Sigmoid, ReLU, LeakyReLU, Affine\n", "from ..layers import (\n", " DotProductAttention,\n", " 
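For reference, the update implemented by `Adam.update` above, written in the same elementwise notation used by the AdaGrad and RMSProp docstrings::

    mean[t]    = decay1 * mean[t-1] + (1 - decay1) * grad[t]
    var[t]     = decay2 * var[t-1]  + (1 - decay2) * grad[t] ** 2
    m_hat[t]   = mean[t] / (1 - decay1 ** t)
    v_hat[t]   = var[t]  / (1 - decay2 ** t)
    update[t]  = lr * m_hat[t] / (np.sqrt(v_hat[t]) + eps)
    param[t+1] = param[t] - update[t]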
FullyConnected,\n", " BatchNorm2D,\n", " Conv1D,\n", " Conv2D,\n", " Multiply,\n", " LSTMCell,\n", " Add,\n", ")\n", "\n", "\n", "class ModuleBase(ABC):\n", " def __init__(self):\n", " self.X = None\n", " self.trainable = True\n", "\n", " super().__init__()\n", "\n", " @abstractmethod\n", " def _init_params(self, **kwargs):\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def forward(self, z, **kwargs):\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def backward(self, out, **kwargs):\n", " raise NotImplementedError\n", "\n", " @property\n", " def components(self):\n", " comps = []\n", " for c in self.hyperparameters[\"component_ids\"]:\n", " if hasattr(self, c):\n", " comps.append(getattr(self, c))\n", " return comps\n", "\n", " def freeze(self):\n", " self.trainable = False\n", " for c in self.components:\n", " c.freeze()\n", "\n", " def unfreeze(self):\n", " self.trainable = True\n", " for c in self.components:\n", " c.unfreeze()\n", "\n", " def update(self, cur_loss=None):\n", " assert self.trainable, \"Layer is frozen\"\n", " for c in self.components:\n", " c.update(cur_loss)\n", " self.flush_gradients()\n", "\n", " def flush_gradients(self):\n", " assert self.trainable, \"Layer is frozen\"\n", "\n", " self.X = []\n", " self._dv = {}\n", " for c in self.components:\n", " for k, v in c.derived_variables.items():\n", " c.derived_variables[k] = None\n", "\n", " for k, v in c.gradients.items():\n", " c.gradients[k] = np.zeros_like(v)\n", "\n", " def set_params(self, summary_dict):\n", " cids = self.hyperparameters[\"component_ids\"]\n", " for k, v in summary_dict[\"parameters\"].items():\n", " if k == \"components\":\n", " for c, cd in summary_dict[\"parameters\"][k].items():\n", " if c in cids:\n", " getattr(self, c).set_params(cd)\n", "\n", " elif k in self.parameters:\n", " self.parameters[k] = v\n", "\n", " for k, v in summary_dict[\"hyperparameters\"].items():\n", " if k == \"components\":\n", " for c, cd in summary_dict[\"hyperparameters\"][k].items():\n", " if c in cids:\n", " getattr(self, c).set_params(cd)\n", "\n", " if k in self.hyperparameters:\n", " if k == \"act_fn\" and v == \"ReLU\":\n", " self.hyperparameters[k] = ReLU()\n", " elif v == \"act_fn\" and v == \"Sigmoid\":\n", " self.hyperparameters[k] = Sigmoid()\n", " elif v == \"act_fn\" and v == \"Tanh\":\n", " self.hyperparameters[k] = Tanh()\n", " elif v == \"act_fn\" and \"Affine\" in v:\n", " r = r\"Affine\\(slope=(.*), intercept=(.*)\\)\"\n", " slope, intercept = re.match(r, v).groups()\n", " self.hyperparameters[k] = Affine(float(slope), float(intercept))\n", " elif v == \"act_fn\" and \"Leaky ReLU\" in v:\n", " r = r\"Leaky ReLU\\(alpha=(.*)\\)\"\n", " alpha = re.match(r, v).groups()[0]\n", " self.hyperparameters[k] = LeakyReLU(float(alpha))\n", " else:\n", " self.hyperparameters[k] = v\n", "\n", " def summary(self):\n", " return {\n", " \"parameters\": self.parameters,\n", " \"layer\": self.hyperparameters[\"layer\"],\n", " \"hyperparameters\": self.hyperparameters,\n", " }\n", "\n", "\n", "class WavenetResidualModule(ModuleBase):\n", " def __init__(\n", " self,\n", " ch_residual,\n", " ch_dilation,\n", " dilation,\n", " kernel_width,\n", " optimizer=None,\n", " init=\"glorot_uniform\",\n", " ):\n", " \"\"\"\n", " A WaveNet-like residual block with causal dilated convolutions.\n", "\n", " .. 
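A standalone sketch of the activation-string parsing that `set_params` performs when restoring hyperparameters: when the hyperparameter key is ``act_fn``, the serialized value is mapped back to an activation object. The helper name is hypothetical; the regexes are the same ones used above.

```python
import re
from numpy_ml.neural_nets.activations import Tanh, Sigmoid, ReLU, LeakyReLU, Affine

def parse_act_fn(v):
    """Map a serialized activation string back to an activation object."""
    if v == "ReLU":
        return ReLU()
    elif v == "Sigmoid":
        return Sigmoid()
    elif v == "Tanh":
        return Tanh()
    elif "Affine" in v:
        slope, intercept = re.match(
            r"Affine\(slope=(.*), intercept=(.*)\)", v).groups()
        return Affine(float(slope), float(intercept))
    elif "Leaky ReLU" in v:
        alpha = re.match(r"Leaky ReLU\(alpha=(.*)\)", v).groups()[0]
        return LeakyReLU(float(alpha))
    return v  # fall through: keep the raw value unchanged
```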
code-block:: text\n", "\n", " *Skip path in* >-------------------------------------------> + ---> *Skip path out*\n", " Causal |--> Tanh --| |\n", " *Main |--> Dilated Conv1D -| * --> 1x1 Conv1D --|\n", " path >--| |--> Sigm --| |\n", " in* |-------------------------------------------------> + ---> *Main path out*\n", " *Residual path*\n", "\n", " On the final block, the output of the skip path is further processed to\n", " produce the network predictions.\n", "\n", " References\n", " ----------\n", " .. [1] van den Oord et al. (2016). \"Wavenet: a generative model for raw\n", " audio\". https://arxiv.org/pdf/1609.03499.pdf\n", "\n", " Parameters\n", " ----------\n", " ch_residual : int\n", " The number of output channels for the 1x1\n", " :class:`~numpy_ml.neural_nets.layers.Conv1D` layer in the main path.\n", " ch_dilation : int\n", " The number of output channels for the causal dilated\n", " :class:`~numpy_ml.neural_nets.layers.Conv1D` layer in the main path.\n", " dilation : int\n", " The dilation rate for the causal dilated\n", " :class:`~numpy_ml.neural_nets.layers.Conv1D` layer in the main path.\n", " kernel_width : int\n", " The width of the causal dilated\n", " :class:`~numpy_ml.neural_nets.layers.Conv1D` kernel in the main\n", " path.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is 'glorot_uniform'.\n", " optimizer : str or :doc:`Optimizer ` object or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the\n", " :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with default\n", " parameters. Default is None.\n", " \"\"\"\n", " super().__init__()\n", "\n", " self.init = init\n", " self.dilation = dilation\n", " self.optimizer = optimizer\n", " self.ch_residual = ch_residual\n", " self.ch_dilation = ch_dilation\n", " self.kernel_width = kernel_width\n", "\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self._dv = {}\n", "\n", " self.conv_dilation = Conv1D(\n", " stride=1,\n", " pad=\"causal\",\n", " init=self.init,\n", " kernel_width=2,\n", " dilation=self.dilation,\n", " out_ch=self.ch_dilation,\n", " optimizer=self.optimizer,\n", " act_fn=Affine(slope=1, intercept=0),\n", " )\n", "\n", " self.tanh = Tanh()\n", " self.sigm = Sigmoid()\n", " self.multiply_gate = Multiply(act_fn=Affine(slope=1, intercept=0))\n", "\n", " self.conv_1x1 = Conv1D(\n", " stride=1,\n", " pad=\"same\",\n", " dilation=0,\n", " init=self.init,\n", " kernel_width=1,\n", " out_ch=self.ch_residual,\n", " optimizer=self.optimizer,\n", " act_fn=Affine(slope=1, intercept=0),\n", " )\n", "\n", " self.add_residual = Add(act_fn=Affine(slope=1, intercept=0))\n", " self.add_skip = Add(act_fn=Affine(slope=1, intercept=0))\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary of the module parameters.\"\"\"\n", " return {\n", " \"components\": {\n", " \"conv_1x1\": self.conv_1x1.parameters,\n", " \"add_skip\": self.add_skip.parameters,\n", " \"add_residual\": self.add_residual.parameters,\n", " \"conv_dilation\": self.conv_dilation.parameters,\n", " \"multiply_gate\": self.multiply_gate.parameters,\n", " }\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the module hyperparameters\"\"\"\n", " return {\n", " \"layer\": \"WavenetResidualModule\",\n", " \"init\": self.init,\n", " \"dilation\": self.dilation,\n", " \"optimizer\": self.optimizer,\n", " \"ch_residual\": 
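# --- Illustrative sketch (not part of the library source) ------------------
# Back-of-the-envelope receptive-field arithmetic for stacks of the causal
# dilated convolutions described above. The dilation schedule (1, 2, 4, ...)
# is a common WaveNet choice assumed here purely for illustration.
kernel_width = 2
dilations = [1, 2, 4, 8, 16]

receptive_field = 1
for d in dilations:
    # each causal dilated conv extends the receptive field by (k - 1) * dilation
    receptive_field += (kernel_width - 1) * d
print(receptive_field)  # -> 32 timesteps for this five-block schedule
# ---------------------------------------------------------------------------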
self.ch_residual,\n",
"            \"ch_dilation\": self.ch_dilation,\n",
"            \"kernel_width\": self.kernel_width,\n",
"            \"component_ids\": [\n",
"                \"conv_1x1\",\n",
"                \"add_skip\",\n",
"                \"add_residual\",\n",
"                \"conv_dilation\",\n",
"                \"multiply_gate\",\n",
"            ],\n",
"            \"components\": {\n",
"                \"conv_1x1\": self.conv_1x1.hyperparameters,\n",
"                \"add_skip\": self.add_skip.hyperparameters,\n",
"                \"add_residual\": self.add_residual.hyperparameters,\n",
"                \"conv_dilation\": self.conv_dilation.hyperparameters,\n",
"                \"multiply_gate\": self.multiply_gate.hyperparameters,\n",
"            },\n",
"        }\n",
"\n",
"    @property\n",
"    def derived_variables(self):\n",
"        \"\"\"A dictionary of intermediate values computed during the\n",
"        forward/backward passes.\"\"\"\n",
"        dv = {\n",
"            \"conv_1x1_out\": None,\n",
"            \"conv_dilation_out\": None,\n",
"            \"multiply_gate_out\": None,\n",
"            \"components\": {\n",
"                \"conv_1x1\": self.conv_1x1.derived_variables,\n",
"                \"add_skip\": self.add_skip.derived_variables,\n",
"                \"add_residual\": self.add_residual.derived_variables,\n",
"                \"conv_dilation\": self.conv_dilation.derived_variables,\n",
"                \"multiply_gate\": self.multiply_gate.derived_variables,\n",
"            },\n",
"        }\n",
"        dv.update(self._dv)\n",
"        return dv\n",
"\n",
"    @property\n",
"    def gradients(self):\n",
"        \"\"\"A dictionary of the module parameter gradients.\"\"\"\n",
"        return {\n",
"            \"components\": {\n",
"                \"conv_1x1\": self.conv_1x1.gradients,\n",
"                \"add_skip\": self.add_skip.gradients,\n",
"                \"add_residual\": self.add_residual.gradients,\n",
"                \"conv_dilation\": self.conv_dilation.gradients,\n",
"                \"multiply_gate\": self.multiply_gate.gradients,\n",
"            }\n",
"        }\n",
"\n",
"    def forward(self, X_main, X_skip=None):\n",
"        \"\"\"\n",
"        Compute the module output on a single minibatch.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        X_main : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n",
"            The input volume consisting of `n_ex` examples, each with dimension\n",
"            (`in_rows`, `in_cols`, `in_ch`).\n",
"        X_skip : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`, or None\n",
"            The output of the preceding skip-connection if this is not the\n",
"            first module in the network.\n",
"\n",
"        Returns\n",
"        -------\n",
"        Y_main : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n",
"            The output of the main pathway.\n",
"        Y_skip : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n",
"            The output of the skip-connection pathway.\n",
"        \"\"\"\n",
"        self.X_main, self.X_skip = X_main, X_skip\n",
"        conv_dilation_out = self.conv_dilation.forward(X_main)\n",
"\n",
"        tanh_gate = self.tanh.fn(conv_dilation_out)\n",
"        sigm_gate = self.sigm.fn(conv_dilation_out)\n",
"\n",
"        multiply_gate_out = self.multiply_gate.forward([tanh_gate, sigm_gate])\n",
"        conv_1x1_out = self.conv_1x1.forward(multiply_gate_out)\n",
"\n",
"        # if this is the first wavenet block, initialize the \"previous\" skip\n",
"        # connection sum to 0\n",
"        self.X_skip = np.zeros_like(conv_1x1_out) if X_skip is None else X_skip\n",
"\n",
"        Y_skip = self.add_skip.forward([self.X_skip, conv_1x1_out])\n",
"        Y_main = self.add_residual.forward([X_main, conv_1x1_out])\n",
"\n",
"        self._dv[\"tanh_out\"] = tanh_gate\n",
"        self._dv[\"sigm_out\"] = sigm_gate\n",
"        self._dv[\"conv_dilation_out\"] = conv_dilation_out\n",
"        self._dv[\"multiply_gate_out\"] = multiply_gate_out\n",
"        self._dv[\"conv_1x1_out\"] = conv_1x1_out\n",
"        return Y_main, Y_skip\n",
"\n",
"    def backward(self, dY_skip, dY_main=None):\n",
"        dX_skip, dConv_1x1_out = 
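# --- Illustrative sketch (not part of the library source) ------------------
# The heart of `forward` above: WaveNet's gated activation unit followed by
# the skip and residual sums. Shapes and arrays below are made up; the layer
# outputs are replaced by random stand-ins.
import numpy as np

rng = np.random.RandomState(0)
X_main = rng.randn(4, 10, 16)   # (n_ex, timesteps, channels)
dilated = rng.randn(4, 10, 16)  # stand-in for the causal dilated conv output

gated = np.tanh(dilated) * (1.0 / (1.0 + np.exp(-dilated)))  # tanh(x) * sigmoid(x)
conv_1x1_out = gated                  # stand-in for the 1x1 convolution
X_skip = np.zeros_like(conv_1x1_out)  # first block: previous skip sum is zero
Y_skip = X_skip + conv_1x1_out        # skip-path output
Y_main = X_main + conv_1x1_out        # residual (main-path) output
# ---------------------------------------------------------------------------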
self.add_skip.backward(dY_skip)\n", "\n", " # if this is the last wavenet block, dY_main will be None. if not,\n", " # calculate the error contribution from dY_main and add it to the\n", " # contribution from the skip path\n", " dX_main = np.zeros_like(self.X_main)\n", " if dY_main is not None:\n", " dX_main, dConv_1x1_main = self.add_residual.backward(dY_main)\n", " dConv_1x1_out += dConv_1x1_main\n", "\n", " dMultiply_out = self.conv_1x1.backward(dConv_1x1_out)\n", " dTanh_out, dSigm_out = self.multiply_gate.backward(dMultiply_out)\n", "\n", " conv_dilation_out = self.derived_variables[\"conv_dilation_out\"]\n", " dTanh_in = dTanh_out * self.tanh.grad(conv_dilation_out)\n", " dSigm_in = dSigm_out * self.sigm.grad(conv_dilation_out)\n", " dDilation_out = dTanh_in + dSigm_in\n", "\n", " conv_back = self.conv_dilation.backward(dDilation_out)\n", " dX_main += conv_back\n", "\n", " self._dv[\"dLdTanh\"] = dTanh_out\n", " self._dv[\"dLdSigmoid\"] = dSigm_out\n", " self._dv[\"dLdConv_1x1\"] = dConv_1x1_out\n", " self._dv[\"dLdMultiply\"] = dMultiply_out\n", " self._dv[\"dLdConv_dilation\"] = dDilation_out\n", " return dX_main, dX_skip\n", "\n", "\n", "class SkipConnectionIdentityModule(ModuleBase):\n", " def __init__(\n", " self,\n", " out_ch,\n", " kernel_shape1,\n", " kernel_shape2,\n", " stride1=1,\n", " stride2=1,\n", " act_fn=None,\n", " epsilon=1e-5,\n", " momentum=0.9,\n", " optimizer=None,\n", " init=\"glorot_uniform\",\n", " ):\n", " \"\"\"\n", " A ResNet-like \"identity\" shortcut module.\n", "\n", " Notes\n", " -----\n", " The identity module enforces `same` padding during each convolution to\n", " ensure module output has same dims as its input.\n", "\n", " .. code-block:: text\n", "\n", " X -> Conv2D -> Act_fn -> BatchNorm2D -> Conv2D -> BatchNorm2D -> + -> Act_fn\n", " \\______________________________________________________________/\n", "\n", " References\n", " ----------\n", " .. [1] He et al. (2015). \"Deep residual learning for image\n", " recognition.\" https://arxiv.org/pdf/1512.03385.pdf\n", "\n", " Parameters\n", " ----------\n", " out_ch : int\n", " The number of filters/kernels to compute in the first convolutional\n", " layer.\n", " kernel_shape1 : 2-tuple\n", " The dimension of a single 2D filter/kernel in the first\n", " convolutional layer.\n", " kernel_shape2 : 2-tuple\n", " The dimension of a single 2D filter/kernel in the second\n", " convolutional layer.\n", " stride1 : int\n", " The stride/hop of the convolution kernels in the first\n", " convolutional layer. Default is 1.\n", " stride2 : int\n", " The stride/hop of the convolution kernels in the second\n", " convolutional layer. Default is 1.\n", " act_fn : :doc:`Activation ` object or None\n", " The activation function for computing Y[t]. If None, use the\n", " identity :math:`f(x) = x` by default. Default is None.\n", " epsilon : float\n", " A small smoothing constant to use during\n", " :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` computation to\n", " avoid divide-by-zero errors. Default is 1e-5.\n", " momentum : float\n", " The momentum term for the running mean/running std calculations in\n", " the :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` layers. The\n", " closer this is to 1, the less weight will be given to the mean/std\n", " of the current batch (i.e., higher smoothing). Default is 0.9.\n", " optimizer : str or :doc:`Optimizer ` object or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. 
If None, use the\n", " :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with\n", " default parameters. Default is None.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is 'glorot_uniform'.\n", " \"\"\"\n", " super().__init__()\n", "\n", " self.init = init\n", " self.in_ch = None\n", " self.out_ch = out_ch\n", " self.epsilon = epsilon\n", " self.stride1 = stride1\n", " self.stride2 = stride2\n", " self.optimizer = optimizer\n", " self.momentum = momentum\n", " self.kernel_shape1 = kernel_shape1\n", " self.kernel_shape2 = kernel_shape2\n", " self.act_fn = Affine(slope=1, intercept=0) if act_fn is None else act_fn\n", "\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " self._dv = {}\n", "\n", " self.conv1 = Conv2D(\n", " pad=\"same\",\n", " init=self.init,\n", " out_ch=self.out_ch,\n", " act_fn=self.act_fn,\n", " stride=self.stride1,\n", " optimizer=self.optimizer,\n", " kernel_shape=self.kernel_shape1,\n", " )\n", " # we can't initialize `conv2` without X's dimensions; see `forward`\n", " # for further details\n", " self.batchnorm1 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)\n", " self.batchnorm2 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)\n", " self.add3 = Add(self.act_fn)\n", "\n", " def _init_conv2(self):\n", " self.conv2 = Conv2D(\n", " pad=\"same\",\n", " init=self.init,\n", " out_ch=self.in_ch,\n", " stride=self.stride2,\n", " optimizer=self.optimizer,\n", " kernel_shape=self.kernel_shape2,\n", " act_fn=Affine(slope=1, intercept=0),\n", " )\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary of the module parameters.\"\"\"\n", " return {\n", " \"components\": {\n", " \"add3\": self.add3.parameters,\n", " \"conv1\": self.conv1.parameters,\n", " \"conv2\": self.conv2.parameters,\n", " \"batchnorm1\": self.batchnorm1.parameters,\n", " \"batchnorm2\": self.batchnorm2.parameters,\n", " }\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the module hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"SkipConnectionIdentityModule\",\n", " \"init\": self.init,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch\": self.out_ch,\n", " \"epsilon\": self.epsilon,\n", " \"stride1\": self.stride1,\n", " \"stride2\": self.stride2,\n", " \"momentum\": self.momentum,\n", " \"optimizer\": self.optimizer,\n", " \"act_fn\": str(self.act_fn),\n", " \"kernel_shape1\": self.kernel_shape1,\n", " \"kernel_shape2\": self.kernel_shape2,\n", " \"component_ids\": [\"conv1\", \"batchnorm1\", \"conv2\", \"batchnorm2\", \"add3\"],\n", " \"components\": {\n", " \"add3\": self.add3.hyperparameters,\n", " \"conv1\": self.conv1.hyperparameters,\n", " \"conv2\": self.conv2.hyperparameters,\n", " \"batchnorm1\": self.batchnorm1.hyperparameters,\n", " \"batchnorm2\": self.batchnorm2.hyperparameters,\n", " },\n", " }\n", "\n", " @property\n", " def derived_variables(self):\n", " \"\"\"A dictionary of intermediate values computed during the\n", " forward/backward passes.\"\"\"\n", " dv = {\n", " \"conv1_out\": None,\n", " \"conv2_out\": None,\n", " \"batchnorm1_out\": None,\n", " \"batchnorm2_out\": None,\n", " \"components\": {\n", " \"add3\": self.add3.derived_variables,\n", " \"conv1\": self.conv1.derived_variables,\n", " \"conv2\": self.conv2.derived_variables,\n", " \"batchnorm1\": self.batchnorm1.derived_variables,\n", " \"batchnorm2\": self.batchnorm2.derived_variables,\n", " },\n", " }\n", " dv.update(self._dv)\n", " return 
dv\n", "\n", " @property\n", " def gradients(self):\n", " \"\"\"A dictionary of the accumulated module parameter gradients.\"\"\"\n", " return {\n", " \"components\": {\n", " \"add3\": self.add3.gradients,\n", " \"conv1\": self.conv1.gradients,\n", " \"conv2\": self.conv2.gradients,\n", " \"batchnorm1\": self.batchnorm1.gradients,\n", " \"batchnorm2\": self.batchnorm2.gradients,\n", " }\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the module output given input volume `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape (n_ex, in_rows, in_cols, in_ch)\n", " The input volume consisting of `n_ex` examples, each with dimension\n", " (`in_rows`, `in_cols`, `in_ch`).\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape (n_ex, out_rows, out_cols, out_ch)\n", " The module output volume.\n", " \"\"\"\n", " if not hasattr(self, \"conv2\"):\n", " self.in_ch = X.shape[3]\n", " self._init_conv2()\n", "\n", " conv1_out = self.conv1.forward(X, retain_derived)\n", " bn1_out = self.batchnorm1.forward(conv1_out, retain_derived)\n", " conv2_out = self.conv2.forward(bn1_out, retain_derived)\n", " bn2_out = self.batchnorm2.forward(conv2_out, retain_derived)\n", " Y = self.add3.forward([X, bn2_out], retain_derived)\n", "\n", " if retain_derived:\n", " self._dv[\"conv1_out\"] = conv1_out\n", " self._dv[\"conv2_out\"] = conv2_out\n", " self._dv[\"batchnorm1_out\"] = bn1_out\n", " self._dv[\"batchnorm2_out\"] = bn2_out\n", " return Y\n", "\n", " def backward(self, dLdY, retain_grads=True):\n", " \"\"\"\n", " Compute the gradient of the loss with respect to the layer parameters.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape (`n_ex, out_rows, out_cols, out_ch`) or list of arrays\n", " The gradient(s) of the loss with respect to the module output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. 
Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape (n_ex, in_rows, in_cols, in_ch)\n", " The gradient of the loss with respect to the module input volume.\n", " \"\"\"\n", " dX, dBn2_out = self.add3.backward(dLdY, retain_grads)\n", " dConv2_out = self.batchnorm2.backward(dBn2_out, retain_grads)\n", " dBn1_out = self.conv2.backward(dConv2_out, retain_grads)\n", " dConv1_out = self.batchnorm1.backward(dBn1_out, retain_grads)\n", " dX += self.conv1.backward(dConv1_out, retain_grads)\n", "\n", " self._dv[\"dLdAdd3_X\"] = dX\n", " self._dv[\"dLdBn2\"] = dBn2_out\n", " self._dv[\"dLdBn1\"] = dBn1_out\n", " self._dv[\"dLdConv2\"] = dConv2_out\n", " self._dv[\"dLdConv1\"] = dConv1_out\n", " return dX\n", "\n", "\n", "class SkipConnectionConvModule(ModuleBase):\n", " def __init__(\n", " self,\n", " out_ch1,\n", " out_ch2,\n", " kernel_shape1,\n", " kernel_shape2,\n", " kernel_shape_skip,\n", " pad1=0,\n", " pad2=0,\n", " stride1=1,\n", " stride2=1,\n", " act_fn=None,\n", " epsilon=1e-5,\n", " momentum=0.9,\n", " stride_skip=1,\n", " optimizer=None,\n", " init=\"glorot_uniform\",\n", " ):\n", " \"\"\"\n", " A ResNet-like \"convolution\" shortcut module.\n", "\n", " Notes\n", " -----\n", " In contrast to :class:`SkipConnectionIdentityModule`, the additional\n", " `conv2d_skip` and `batchnorm_skip` layers in the shortcut path allow\n", " adjusting the dimensions of `X` to match the output of the main set of\n", " convolutions.\n", "\n", " .. code-block:: text\n", "\n", " X -> Conv2D -> Act_fn -> BatchNorm2D -> Conv2D -> BatchNorm2D -> + -> Act_fn\n", " \\_____________________ Conv2D -> Batchnorm2D __________________/\n", "\n", " References\n", " ----------\n", " .. [1] He et al. (2015). \"Deep residual learning for image\n", " recognition.\" https://arxiv.org/pdf/1512.03385.pdf\n", "\n", " Parameters\n", " ----------\n", " out_ch1 : int\n", " The number of filters/kernels to compute in the first convolutional\n", " layer.\n", " out_ch2 : int\n", " The number of filters/kernels to compute in the second\n", " convolutional layer.\n", " kernel_shape1 : 2-tuple\n", " The dimension of a single 2D filter/kernel in the first\n", " convolutional layer.\n", " kernel_shape2 : 2-tuple\n", " The dimension of a single 2D filter/kernel in the second\n", " convolutional layer.\n", " kernel_shape_skip : 2-tuple\n", " The dimension of a single 2D filter/kernel in the \"skip\"\n", " convolutional layer.\n", " stride1 : int\n", " The stride/hop of the convolution kernels in the first\n", " convolutional layer. Default is 1.\n", " stride2 : int\n", " The stride/hop of the convolution kernels in the second\n", " convolutional layer. Default is 1.\n", " stride_skip : int\n", " The stride/hop of the convolution kernels in the \"skip\"\n", " convolutional layer. Default is 1.\n", " pad1 : int, tuple, or 'same'\n", " The number of rows/columns of 0's to pad the input to the first\n", " convolutional layer with. Default is 0.\n", " pad2 : int, tuple, or 'same'\n", " The number of rows/columns of 0's to pad the input to the second\n", " convolutional layer with. Default is 0.\n", " act_fn : :doc:`Activation ` object or None\n", " The activation function for computing ``Y[t]``. If None, use the\n", " identity :math:`f(x) = x` by default. Default is None.\n", " epsilon : float\n", " A small smoothing constant to use during\n", " :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` computation to\n", " avoid divide-by-zero errors. 
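# --- Illustrative sketch (not part of the library source) ------------------
# Why `backward` above *adds* the shortcut gradient to the branch gradient:
# for an identity shortcut y = x + F(x), the chain rule gives
# dL/dx = dL/dy + F'(x) * dL/dy. An elementwise F (tanh) stands in for the
# Conv/BatchNorm branch so the claim can be checked numerically.
import numpy as np

x = np.array([0.3, -1.0, 2.0])
dLdY = np.array([1.0, -0.5, 0.25])  # made-up upstream gradient
F = np.tanh

dLdX = dLdY + (1 - np.tanh(x) ** 2) * dLdY  # shortcut term + branch term

h = 1e-6  # finite-difference check of d/dx sum(dLdY * (x + F(x)))
numeric = np.array(
    [(np.sum(dLdY * ((x + dx) + F(x + dx))) - np.sum(dLdY * (x + F(x)))) / h
     for dx in h * np.eye(3)]
)
assert np.allclose(dLdX, numeric, atol=1e-4)
# ---------------------------------------------------------------------------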
Default is 1e-5.\n", " momentum : float\n", " The momentum term for the running mean/running std calculations in\n", " the :class:`~numpy_ml.neural_nets.layers.BatchNorm2D` layers. The\n", " closer this is to 1, the less weight will be given to the mean/std\n", " of the current batch (i.e., higher smoothing). Default is 0.9.\n", " init : str\n", " The weight initialization strategy. Valid entries are\n", " {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}.\n", " optimizer : str or :doc:`Optimizer ` object\n", " The optimization strategy to use when performing gradient updates\n", " within the :class:`update` method. If None, use the\n", " :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with\n", " default parameters. Default is None.\n", " \"\"\"\n", " super().__init__()\n", "\n", " self.init = init\n", " self.pad1 = pad1\n", " self.pad2 = pad2\n", " self.in_ch = None\n", " self.out_ch1 = out_ch1\n", " self.out_ch2 = out_ch2\n", " self.epsilon = epsilon\n", " self.stride1 = stride1\n", " self.stride2 = stride2\n", " self.momentum = momentum\n", " self.optimizer = optimizer\n", " self.stride_skip = stride_skip\n", " self.kernel_shape1 = kernel_shape1\n", " self.kernel_shape2 = kernel_shape2\n", " self.kernel_shape_skip = kernel_shape_skip\n", " self.act_fn = Affine(slope=1, intercept=0) if act_fn is None else act_fn\n", "\n", " self._init_params()\n", "\n", " def _init_params(self, X=None):\n", " self._dv = {}\n", " self.conv1 = Conv2D(\n", " pad=self.pad1,\n", " init=self.init,\n", " act_fn=self.act_fn,\n", " out_ch=self.out_ch1,\n", " stride=self.stride1,\n", " optimizer=self.optimizer,\n", " kernel_shape=self.kernel_shape1,\n", " )\n", " self.conv2 = Conv2D(\n", " pad=self.pad2,\n", " init=self.init,\n", " out_ch=self.out_ch2,\n", " stride=self.stride2,\n", " optimizer=self.optimizer,\n", " kernel_shape=self.kernel_shape2,\n", " act_fn=Affine(slope=1, intercept=0),\n", " )\n", " # we can't initialize `conv_skip` without X's dimensions; see `forward`\n", " # for further details\n", " self.batchnorm1 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)\n", " self.batchnorm2 = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)\n", " self.batchnorm_skip = BatchNorm2D(epsilon=self.epsilon, momentum=self.momentum)\n", " self.add3 = Add(self.act_fn)\n", "\n", " def _calc_skip_padding(self, X):\n", " pads = []\n", " for p in [self.pad1, self.pad2]:\n", " if isinstance(p, int):\n", " pads.append((p, p, p, p))\n", " elif isinstance(p, tuple) and len(p) == 2:\n", " pads.append((p[0], p[0], p[1], p[1]))\n", " self.pad1, self.pad2 = pads\n", "\n", " # compute the dimensions of the convolution1 output\n", " s1 = self.stride1\n", " fr1, fc1 = self.kernel_shape1\n", " _, in_rows, in_cols, _ = X.shape\n", " pr11, pr12, pc11, pc12 = self.pad1\n", "\n", " out_rows1 = np.floor(1 + (in_rows + pr11 + pr12 - fr1) / s1).astype(int)\n", " out_cols1 = np.floor(1 + (in_cols + pc11 + pc12 - fc1) / s1).astype(int)\n", "\n", " # compute the dimensions of the convolution2 output\n", " s2 = self.stride2\n", " fr2, fc2 = self.kernel_shape2\n", " pr21, pr22, pc21, pc22 = self.pad2\n", "\n", " out_rows2 = np.floor(1 + (out_rows1 + pr21 + pr22 - fr2) / s2).astype(int)\n", " out_cols2 = np.floor(1 + (out_cols1 + pc21 + pc22 - fc2) / s2).astype(int)\n", "\n", " # finally, compute the appropriate padding dims for the skip convolution\n", " desired_dims = (out_rows2, out_cols2)\n", " self.pad_skip = calc_pad_dims_2D(\n", " X.shape,\n", " desired_dims,\n", " stride=self.stride_skip,\n", " 
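# --- Illustrative sketch (not part of the library source) ------------------
# The output-size rule used by `_calc_skip_padding` above, on made-up numbers:
# out = floor(1 + (in + pad_before + pad_after - kernel) / stride).
import numpy as np

in_rows, kernel, stride = 32, 3, 2
pad_before, pad_after = 1, 1

out_rows = int(np.floor(1 + (in_rows + pad_before + pad_after - kernel) / stride))
print(out_rows)  # -> 16
# ---------------------------------------------------------------------------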
kernel_shape=self.kernel_shape_skip,\n", " )\n", "\n", " def _init_conv_skip(self, X):\n", " self._calc_skip_padding(X)\n", " self.conv_skip = Conv2D(\n", " init=self.init,\n", " pad=self.pad_skip,\n", " out_ch=self.out_ch2,\n", " stride=self.stride_skip,\n", " kernel_shape=self.kernel_shape_skip,\n", " act_fn=Affine(slope=1, intercept=0),\n", " optimizer=self.optimizer,\n", " )\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary of the module parameters.\"\"\"\n", " return {\n", " \"components\": {\n", " \"add3\": self.add3.parameters,\n", " \"conv1\": self.conv1.parameters,\n", " \"conv2\": self.conv2.parameters,\n", " \"conv_skip\": self.conv_skip.parameters\n", " if hasattr(self, \"conv_skip\")\n", " else None,\n", " \"batchnorm1\": self.batchnorm1.parameters,\n", " \"batchnorm2\": self.batchnorm2.parameters,\n", " \"batchnorm_skip\": self.batchnorm_skip.parameters,\n", " }\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the module hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"SkipConnectionConvModule\",\n", " \"init\": self.init,\n", " \"pad1\": self.pad1,\n", " \"pad2\": self.pad2,\n", " \"in_ch\": self.in_ch,\n", " \"out_ch1\": self.out_ch1,\n", " \"out_ch2\": self.out_ch2,\n", " \"epsilon\": self.epsilon,\n", " \"stride1\": self.stride1,\n", " \"stride2\": self.stride2,\n", " \"momentum\": self.momentum,\n", " \"act_fn\": str(self.act_fn),\n", " \"stride_skip\": self.stride_skip,\n", " \"kernel_shape1\": self.kernel_shape1,\n", " \"kernel_shape2\": self.kernel_shape2,\n", " \"kernel_shape_skip\": self.kernel_shape_skip,\n", " \"pad_skip\": self.pad_skip if hasattr(self, \"pad_skip\") else None,\n", " \"component_ids\": [\n", " \"add3\",\n", " \"conv1\",\n", " \"conv2\",\n", " \"conv_skip\",\n", " \"batchnorm1\",\n", " \"batchnorm2\",\n", " \"batchnorm_skip\",\n", " ],\n", " \"components\": {\n", " \"add3\": self.add3.hyperparameters,\n", " \"conv1\": self.conv1.hyperparameters,\n", " \"conv2\": self.conv2.hyperparameters,\n", " \"conv_skip\": self.conv_skip.hyperparameters\n", " if hasattr(self, \"conv_skip\")\n", " else None,\n", " \"batchnorm1\": self.batchnorm1.hyperparameters,\n", " \"batchnorm2\": self.batchnorm2.hyperparameters,\n", " \"batchnorm_skip\": self.batchnorm_skip.hyperparameters,\n", " },\n", " }\n", "\n", " @property\n", " def derived_variables(self):\n", " \"\"\"A dictionary of intermediate values computed during the\n", " forward/backward passes.\"\"\"\n", " dv = {\n", " \"conv1_out\": None,\n", " \"conv2_out\": None,\n", " \"conv_skip_out\": None,\n", " \"batchnorm1_out\": None,\n", " \"batchnorm2_out\": None,\n", " \"batchnorm_skip_out\": None,\n", " \"components\": {\n", " \"add3\": self.add3.derived_variables,\n", " \"conv1\": self.conv1.derived_variables,\n", " \"conv2\": self.conv2.derived_variables,\n", " \"conv_skip\": self.conv_skip.derived_variables\n", " if hasattr(self, \"conv_skip\")\n", " else None,\n", " \"batchnorm1\": self.batchnorm1.derived_variables,\n", " \"batchnorm2\": self.batchnorm2.derived_variables,\n", " \"batchnorm_skip\": self.batchnorm_skip.derived_variables,\n", " },\n", " }\n", " dv.update(self._dv)\n", " return dv\n", "\n", " @property\n", " def gradients(self):\n", " \"\"\"A dictionary of the accumulated module parameter gradients.\"\"\"\n", " return {\n", " \"components\": {\n", " \"add3\": self.add3.gradients,\n", " \"conv1\": self.conv1.gradients,\n", " \"conv2\": self.conv2.gradients,\n", " \"conv_skip\": self.conv_skip.gradients\n", " if 
hasattr(self, \"conv_skip\")\n", " else None,\n", " \"batchnorm1\": self.batchnorm1.gradients,\n", " \"batchnorm2\": self.batchnorm2.gradients,\n", " \"batchnorm_skip\": self.batchnorm_skip.gradients,\n", " }\n", " }\n", "\n", " def forward(self, X, retain_derived=True):\n", " \"\"\"\n", " Compute the layer output given input volume `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The input volume consisting of `n_ex` examples, each with dimension\n", " (`in_rows`, `in_cols`, `in_ch`).\n", " retain_derived : bool\n", " Whether to retain the variables calculated during the forward pass\n", " for use later during backprop. If False, this suggests the layer\n", " will not be expected to backprop through wrt. this input. Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " The module output volume.\n", " \"\"\"\n", " # now that we have the input dims for X we can initialize the proper\n", " # padding in the `conv_skip` layer\n", " if not hasattr(self, \"conv_skip\"):\n", " self._init_conv_skip(X)\n", " self.in_ch = X.shape[3]\n", "\n", " conv1_out = self.conv1.forward(X, retain_derived)\n", " bn1_out = self.batchnorm1.forward(conv1_out, retain_derived)\n", " conv2_out = self.conv2.forward(bn1_out, retain_derived)\n", " bn2_out = self.batchnorm2.forward(conv2_out, retain_derived)\n", " conv_skip_out = self.conv_skip.forward(X, retain_derived)\n", " bn_skip_out = self.batchnorm_skip.forward(conv_skip_out, retain_derived)\n", " Y = self.add3.forward([bn_skip_out, bn2_out], retain_derived)\n", "\n", " if retain_derived:\n", " self._dv[\"conv1_out\"] = conv1_out\n", " self._dv[\"conv2_out\"] = conv2_out\n", " self._dv[\"batchnorm1_out\"] = bn1_out\n", " self._dv[\"batchnorm2_out\"] = bn2_out\n", " self._dv[\"conv_skip_out\"] = conv_skip_out\n", " self._dv[\"batchnorm_skip_out\"] = bn_skip_out\n", " return Y\n", "\n", " def backward(self, dLdY, retain_grads=True):\n", " \"\"\"\n", " Compute the gradient of the loss with respect to the module parameters.\n", "\n", " Parameters\n", " ----------\n", " dLdy : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, out_ch)`\n", " or list of arrays\n", " The gradient(s) of the loss with respect to the module output(s).\n", " retain_grads : bool\n", " Whether to include the intermediate parameter gradients computed\n", " during the backward pass in the final parameter update. 
Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " dX : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_ch)`\n", " The gradient of the loss with respect to the module input volume.\n", " \"\"\"\n", " dBnskip_out, dBn2_out = self.add3.backward(dLdY)\n", " dConvskip_out = self.batchnorm_skip.backward(dBnskip_out)\n", " dX = self.conv_skip.backward(dConvskip_out)\n", "\n", " dConv2_out = self.batchnorm2.backward(dBn2_out)\n", " dBn1_out = self.conv2.backward(dConv2_out)\n", " dConv1_out = self.batchnorm1.backward(dBn1_out)\n", " dX += self.conv1.backward(dConv1_out)\n", "\n", " if retain_grads:\n", " self._dv[\"dLdAdd3_X\"] = dX\n", " self._dv[\"dLdBn1\"] = dBn1_out\n", " self._dv[\"dLdBn2\"] = dBn2_out\n", " self._dv[\"dLdConv1\"] = dConv1_out\n", " self._dv[\"dLdConv2\"] = dConv2_out\n", " self._dv[\"dLdBnSkip\"] = dBnskip_out\n", " self._dv[\"dLdConvSkip\"] = dConvskip_out\n", " return dX\n", "\n", "\n", "class BidirectionalLSTM(ModuleBase):\n", " def __init__(\n", " self,\n", " n_out,\n", " act_fn=None,\n", " gate_fn=None,\n", " merge_mode=\"concat\",\n", " init=\"glorot_uniform\",\n", " optimizer=None,\n", " ):\n", " \"\"\"\n", " A single bidirectional long short-term memory (LSTM) layer.\n", "\n", " Parameters\n", " ----------\n", " n_out : int\n", " The dimension of a single hidden state / output on a given timestep\n", " act_fn : :doc:`Activation ` object or None\n", " The activation function for computing ``A[t]``. If not specified,\n", " use :class:`~numpy_ml.neural_nets.activations.Tanh` by default.\n", " gate_fn : :doc:`Activation ` object or None\n", " The gate function for computing the update, forget, and output\n", " gates. If not specified, use\n", " :class:`~numpy_ml.neural_nets.activations.Sigmoid` by default.\n", " merge_mode : {\"sum\", \"multiply\", \"concat\", \"average\"}\n", " Mode by which outputs of the forward and backward LSTMs will be\n", " combined. Default is 'concat'.\n", " optimizer : str or :doc:`Optimizer ` object or None\n", " The optimization strategy to use when performing gradient updates\n", " within the `update` method. If None, use the\n", " :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with\n", " default parameters. Default is None.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. 
Default is 'glorot_uniform'.\n",
"        \"\"\"\n",
"        super().__init__()\n",
"\n",
"        self.init = init\n",
"        self.n_in = None\n",
"        self.n_out = n_out\n",
"        self.optimizer = optimizer\n",
"        self.merge_mode = merge_mode\n",
"        self.act_fn = Tanh() if act_fn is None else act_fn\n",
"        self.gate_fn = Sigmoid() if gate_fn is None else gate_fn\n",
"        self._init_params()\n",
"\n",
"    def _init_params(self):\n",
"        self.cell_fwd = LSTMCell(\n",
"            init=self.init,\n",
"            n_out=self.n_out,\n",
"            act_fn=self.act_fn,\n",
"            gate_fn=self.gate_fn,\n",
"            optimizer=self.optimizer,\n",
"        )\n",
"        self.cell_bwd = LSTMCell(\n",
"            init=self.init,\n",
"            n_out=self.n_out,\n",
"            act_fn=self.act_fn,\n",
"            gate_fn=self.gate_fn,\n",
"            optimizer=self.optimizer,\n",
"        )\n",
"\n",
"    def forward(self, X):\n",
"        \"\"\"\n",
"        Run a forward pass across all timesteps in the input.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        X : :py:class:`ndarray ` of shape `(n_ex, n_in, n_t)`\n",
"            Input consisting of `n_ex` examples each of dimensionality `n_in`\n",
"            and extending for `n_t` timesteps.\n",
"\n",
"        Returns\n",
"        -------\n",
"        Y : :py:class:`ndarray ` of shape `(n_ex, n_out, n_t)`\n",
"            The value of the hidden state for each of the `n_ex` examples\n",
"            across each of the `n_t` timesteps.\n",
"        \"\"\"\n",
"        Y_fwd, Y_bwd, Y = [], [], []\n",
"        n_ex, self.n_in, n_t = X.shape\n",
"\n",
"        # forward LSTM\n",
"        for t in range(n_t):\n",
"            yt, ct = self.cell_fwd.forward(X[:, :, t])\n",
"            Y_fwd.append(yt)\n",
"\n",
"        # backward LSTM\n",
"        for t in reversed(range(n_t)):\n",
"            yt, ct = self.cell_bwd.forward(X[:, :, t])\n",
"            Y_bwd.insert(0, yt)\n",
"\n",
"        # merge forward and backward states\n",
"        for t in range(n_t):\n",
"            if self.merge_mode == \"concat\":\n",
"                Y.append(np.concatenate([Y_fwd[t], Y_bwd[t]], axis=1))\n",
"            elif self.merge_mode == \"sum\":\n",
"                Y.append(Y_fwd[t] + Y_bwd[t])\n",
"            elif self.merge_mode == \"average\":\n",
"                Y.append((Y_fwd[t] + Y_bwd[t]) / 2)\n",
"            elif self.merge_mode == \"multiply\":\n",
"                Y.append(Y_fwd[t] * Y_bwd[t])\n",
"\n",
"        self.Y_fwd, self.Y_bwd = Y_fwd, Y_bwd\n",
"        return np.dstack(Y)\n",
"\n",
"    def backward(self, dLdA):\n",
"        \"\"\"\n",
"        Run a backward pass across all timesteps in the input.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        dLdA : :py:class:`ndarray ` of shape `(n_ex, n_out, n_t)`\n",
"            The gradient of the loss with respect to the layer output for each\n",
"            of the `n_ex` examples across all `n_t` timesteps.\n",
"\n",
"        Returns\n",
"        -------\n",
"        dLdX : :py:class:`ndarray ` of shape `(n_ex, n_in, n_t)`\n",
"            The gradient of the loss with respect to the layer input for each\n",
"            of the `n_ex` examples across each of the `n_t` timesteps.\n",
"        \"\"\"\n",
"        assert self.trainable, \"Layer is frozen\"\n",
"\n",
"        n_ex, n_out, n_t = dLdA.shape\n",
"        dLdX_f, dLdX_b, dLdX = [], [], []\n",
"\n",
"        # forward LSTM\n",
"        for t in reversed(range(n_t)):\n",
"            if self.merge_mode == \"concat\":\n",
"                dLdXt_f = self.cell_fwd.backward(dLdA[:, : self.n_out, t])\n",
"            elif self.merge_mode == \"sum\":\n",
"                dLdXt_f = self.cell_fwd.backward(dLdA[:, :, t])\n",
"            elif self.merge_mode == \"multiply\":\n",
"                dLdXt_f = self.cell_fwd.backward(dLdA[:, :, t] * self.Y_bwd[t])\n",
"            elif self.merge_mode == \"average\":\n",
"                dLdXt_f = self.cell_fwd.backward(dLdA[:, :, t] * 0.5)\n",
"            dLdX_f.insert(0, dLdXt_f)\n",
"\n",
"        # backward LSTM\n",
"        for t in range(n_t):\n",
"            if self.merge_mode == \"concat\":\n",
"                dLdXt_b = self.cell_bwd.backward(dLdA[:, self.n_out :, t])\n",
"            elif self.merge_mode == \"sum\":\n",
"                dLdXt_b = 
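# --- Illustrative sketch (not part of the library source) ------------------
# How the per-timestep forward and backward hidden states are merged above.
# Shapes are made up; each state is (n_ex, n_out) at a single timestep.
import numpy as np

n_ex, n_out = 4, 8
yf = np.random.randn(n_ex, n_out)  # forward-cell state at time t
yb = np.random.randn(n_ex, n_out)  # backward-cell state at time t

concat = np.concatenate([yf, yb], axis=1)  # (n_ex, 2 * n_out)
summed = yf + yb                           # (n_ex, n_out)
average = (yf + yb) / 2                    # (n_ex, n_out)
product = yf * yb                          # (n_ex, n_out)
# stacking the merged states over n_t timesteps with np.dstack then yields
# (n_ex, 2 * n_out, n_t) for "concat" and (n_ex, n_out, n_t) otherwise
# ---------------------------------------------------------------------------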
self.cell_bwd.backward(dLdA[:, :, t])\n",
"            elif self.merge_mode == \"multiply\":\n",
"                dLdXt_b = self.cell_bwd.backward(dLdA[:, :, t] * self.Y_fwd[t])\n",
"            elif self.merge_mode == \"average\":\n",
"                dLdXt_b = self.cell_bwd.backward(dLdA[:, :, t] * 0.5)\n",
"            dLdX_b.append(dLdXt_b)\n",
"\n",
"        for t in range(n_t):\n",
"            dLdX.append(dLdX_f[t] + dLdX_b[t])\n",
"\n",
"        return np.dstack(dLdX)\n",
"\n",
"    @property\n",
"    def derived_variables(self):\n",
"        \"\"\"A dictionary of intermediate values computed during the\n",
"        forward/backward passes.\"\"\"\n",
"        return {\n",
"            \"components\": {\n",
"                \"cell_fwd\": self.cell_fwd.derived_variables,\n",
"                \"cell_bwd\": self.cell_bwd.derived_variables,\n",
"            }\n",
"        }\n",
"\n",
"    @property\n",
"    def gradients(self):\n",
"        \"\"\"A dictionary of the accumulated module parameter gradients.\"\"\"\n",
"        return {\n",
"            \"components\": {\n",
"                \"cell_fwd\": self.cell_fwd.gradients,\n",
"                \"cell_bwd\": self.cell_bwd.gradients,\n",
"            }\n",
"        }\n",
"\n",
"    @property\n",
"    def parameters(self):\n",
"        \"\"\"A dictionary of the module parameters.\"\"\"\n",
"        return {\n",
"            \"components\": {\n",
"                \"cell_fwd\": self.cell_fwd.parameters,\n",
"                \"cell_bwd\": self.cell_bwd.parameters,\n",
"            }\n",
"        }\n",
"\n",
"    @property\n",
"    def hyperparameters(self):\n",
"        \"\"\"A dictionary of the module hyperparameters.\"\"\"\n",
"        return {\n",
"            \"layer\": \"BidirectionalLSTM\",\n",
"            \"init\": self.init,\n",
"            \"n_in\": self.n_in,\n",
"            \"n_out\": self.n_out,\n",
"            \"act_fn\": str(self.act_fn),\n",
"            \"optimizer\": self.optimizer,\n",
"            \"merge_mode\": self.merge_mode,\n",
"            \"component_ids\": [\"cell_fwd\", \"cell_bwd\"],\n",
"            \"components\": {\n",
"                \"cell_fwd\": self.cell_fwd.hyperparameters,\n",
"                \"cell_bwd\": self.cell_bwd.hyperparameters,\n",
"            },\n",
"        }\n",
"\n",
"\n",
"class MultiHeadedAttentionModule(ModuleBase):\n",
"    def __init__(self, n_heads=8, dropout_p=0, init=\"glorot_uniform\", optimizer=None):\n",
"        \"\"\"\n",
"        A multi-headed attention module.\n",
"\n",
"        Notes\n",
"        -----\n",
"        Multi-head attention allows a model to jointly attend to information from\n",
"        different representation subspaces at different positions. With a\n",
"        single head, this information would get averaged away when the\n",
"        attention weights are combined with the value\n",
"\n",
"        .. math::\n",
"\n",
"            \\\\text{MultiHead}(\\mathbf{Q}, \\mathbf{K}, \\mathbf{V})\n",
"                = [\\\\text{head}_1; ...; \\\\text{head}_h] \\\\mathbf{W}^{(O)}\n",
"\n",
"        where\n",
"\n",
"        .. math::\n",
"\n",
"            \\\\text{head}_i = \\\\text{SDP_attention}(\n",
"                \\mathbf{Q W}_i^{(Q)}, \\mathbf{K W}_i^{(K)}, \\mathbf{V W}_i^{(V)})\n",
"\n",
"        and the projection weights are parameter matrices:\n",
"\n",
"        .. math::\n",
"\n",
"            \\mathbf{W}_i^{(Q)} &\\in\n",
"                \\mathbb{R}^{(\\\\text{kqv_dim} \\ \\\\times \\ \\\\text{latent_dim})} \\\\\\\\\n",
"            \\mathbf{W}_i^{(K)} &\\in\n",
"                \\mathbb{R}^{(\\\\text{kqv_dim} \\ \\\\times \\ \\\\text{latent_dim})} \\\\\\\\\n",
"            \\mathbf{W}_i^{(V)} &\\in\n",
"                \\mathbb{R}^{(\\\\text{kqv_dim} \\ \\\\times \\ \\\\text{latent_dim})} \\\\\\\\\n",
"            \\mathbf{W}^{(O)} &\\in\n",
"                \\mathbb{R}^{(\\\\text{n_heads} \\cdot \\\\text{latent_dim} \\ \\\\times \\ \\\\text{kqv_dim})}\n",
"\n",
"        Importantly, the current module explicitly assumes that\n",
"\n",
"        .. math::\n",
"\n",
"            \\\\text{kqv_dim} = \\\\text{dim(query)} = \\\\text{dim(keys)} = \\\\text{dim(values)}\n",
"\n",
"        and that\n",
"\n",
"        .. 
math::\n", "\n", " \\\\text{latent_dim} = \\\\text{kqv_dim / n_heads}\n", "\n", " **[MH Attention Head h]**:\n", "\n", " .. code-block:: text\n", "\n", " K --> W_h^(K) ------\\\\\n", " V --> W_h^(V) ------- > DP_Attention --> head_h\n", " Q --> W_h^(Q) ------/\n", "\n", " The full **[MultiHeadedAttentionModule]** then becomes\n", "\n", " .. code-block:: text\n", "\n", " -----------------\n", " K --> | [Attn Head 1] | --> head_1 --\\\\\n", " V --> | [Attn Head 2] | --> head_2 --\\\\\n", " Q --> | ... | ... --> Concat --> W^(O) --> MH_out\n", " | [Attn Head Z] | --> head_Z --/\n", " -----------------\n", "\n", " Due to the reduced dimension of each head, the total computational cost\n", " is similar to that of a single attention head with full (i.e., kqv_dim)\n", " dimensionality.\n", "\n", " Parameters\n", " ----------\n", " n_heads : int\n", " The number of simultaneous attention heads to use. Note that the\n", " larger `n_heads`, the smaller the dimensionality of any single\n", " head, since ``latent_dim = kqv_dim / n_heads``. Default is 8.\n", " dropout_p : float in [0, 1)\n", " The dropout propbability during training, applied to the output of\n", " the softmax in each dot-product attention head. If 0, no dropout is\n", " applied. Default is 0.\n", " init : {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}\n", " The weight initialization strategy. Default is 'glorot_uniform'.\n", " optimizer : str, :doc:`Optimizer ` object, or None\n", " The optimization strategy to use when performing gradient updates\n", " within the :meth:`update` method. If None, use the\n", " :class:`~numpy_ml.neural_nets.optimizers.SGD` optimizer with default\n", " parameters. Default is None.\n", " \"\"\"\n", " self.init = init\n", " self.kqv_dim = None\n", " self.projections = {}\n", " self.n_heads = n_heads\n", " self.optimizer = optimizer\n", " self.dropout_p = dropout_p\n", " self.is_initialized = False\n", "\n", " def _init_params(self):\n", " self._dv = {}\n", "\n", " # assume dim(keys) = dim(query) = dim(values)\n", " assert self.kqv_dim % self.n_heads == 0\n", " self.latent_dim = self.kqv_dim // self.n_heads\n", "\n", " self.attention = DotProductAttention(scale=True, dropout_p=self.dropout_p)\n", " self.projections = {\n", " k: Dropout(\n", " FullyConnected(\n", " init=self.init,\n", " n_out=self.kqv_dim,\n", " optimizer=self.optimizer,\n", " act_fn=\"Affine(slope=1, intercept=0)\",\n", " ),\n", " self.dropout_p,\n", " )\n", " for k in [\"Q\", \"K\", \"V\", \"O\"]\n", " }\n", "\n", " self.is_initialized = True\n", "\n", " def forward(self, Q, K, V):\n", " if not self.is_initialized:\n", " self.kqv_dim = Q.shape[-1]\n", " self._init_params()\n", "\n", " # project queries, keys, and values into the `latent_dim`-dimensional subspace\n", " n_ex = Q.shape[0]\n", " for k, x in zip([\"Q\", \"K\", \"V\"], [Q, K, V]):\n", " proj = self.projections[k].forward(x)\n", " proj = proj.reshape(n_ex, -1, self.n_heads, self.latent_dim).swapaxes(1, 2)\n", " self._dv[\"{}_proj\".format(k)] = proj\n", "\n", " dv = self.derived_variables\n", " Q_proj, K_proj, V_proj = dv[\"Q_proj\"], dv[\"K_proj\"], dv[\"V_proj\"]\n", "\n", " # apply scaled dot-product attention to the projected vectors\n", " attn = self.attention\n", " attn_out = attn.forward(Q_proj, K_proj, V_proj)\n", " self._dv[\"attention_weights\"] = attn.derived_variables[\"attention_weights\"]\n", "\n", " # concatenate the different heads using `reshape` to create an\n", " # `kqv_dim`-dim vector\n", " attn_out = attn_out.swapaxes(1, 2).reshape(n_ex, 
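# --- Illustrative sketch (not part of the library source) ------------------
# The head split/merge used in `forward` above, with made-up sizes: a
# projection of width kqv_dim is viewed as n_heads blocks of latent_dim,
# moved to their own axis for per-head attention, then merged back.
import numpy as np

n_ex, seq_len, kqv_dim, n_heads = 2, 5, 16, 4
latent_dim = kqv_dim // n_heads

proj = np.random.randn(n_ex, seq_len, kqv_dim)
heads = proj.reshape(n_ex, seq_len, n_heads, latent_dim).swapaxes(1, 2)
# heads.shape == (n_ex, n_heads, seq_len, latent_dim)

merged = heads.swapaxes(1, 2).reshape(n_ex, seq_len, kqv_dim)
assert np.allclose(merged, proj)  # the round trip is lossless
# ---------------------------------------------------------------------------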
self.kqv_dim)\n", " self._dv[\"attention_out\"] = attn_out.reshape(n_ex, -1, self.kqv_dim)\n", "\n", " # apply the final output projection\n", " Y = self.projections[\"O\"].forward(attn_out)\n", " Y = Y.reshape(n_ex, -1, self.kqv_dim)\n", " return Y\n", "\n", " def backward(self, dLdy):\n", " n_ex = dLdy.shape[0]\n", " dLdy = dLdy.reshape(n_ex, self.kqv_dim)\n", " dLdX = self.projections[\"O\"].backward(dLdy)\n", " dLdX = dLdX.reshape(n_ex, self.n_heads, -1, self.latent_dim)\n", "\n", " dLdQ_proj, dLdK_proj, dLdV_proj = self.attention.backward(dLdX)\n", "\n", " self._dv[\"dQ_proj\"] = dLdQ_proj\n", " self._dv[\"dK_proj\"] = dLdK_proj\n", " self._dv[\"dV_proj\"] = dLdV_proj\n", "\n", " dLdQ_proj = dLdQ_proj.reshape(n_ex, self.kqv_dim)\n", " dLdK_proj = dLdK_proj.reshape(n_ex, self.kqv_dim)\n", " dLdV_proj = dLdV_proj.reshape(n_ex, self.kqv_dim)\n", "\n", " dLdQ = self.projections[\"Q\"].backward(dLdQ_proj)\n", " dLdK = self.projections[\"K\"].backward(dLdK_proj)\n", " dLdV = self.projections[\"V\"].backward(dLdV_proj)\n", " return dLdQ, dLdK, dLdV\n", "\n", " @property\n", " def derived_variables(self):\n", " \"\"\"A dictionary of intermediate values computed during the\n", " forward/backward passes.\"\"\"\n", " dv = {\n", " \"Q_proj\": None,\n", " \"K_proj\": None,\n", " \"V_proj\": None,\n", " \"components\": {\n", " \"Q\": self.projections[\"Q\"].derived_variables,\n", " \"K\": self.projections[\"K\"].derived_variables,\n", " \"V\": self.projections[\"V\"].derived_variables,\n", " \"O\": self.projections[\"O\"].derived_variables,\n", " \"attention\": self.attention.derived_variables,\n", " },\n", " }\n", " dv.update(self._dv)\n", " return dv\n", "\n", " @property\n", " def gradients(self):\n", " \"\"\"A dictionary of the accumulated module parameter gradients.\"\"\"\n", " return {\n", " \"components\": {\n", " \"Q\": self.projections[\"Q\"].gradients,\n", " \"K\": self.projections[\"K\"].gradients,\n", " \"V\": self.projections[\"V\"].gradients,\n", " \"O\": self.projections[\"O\"].gradients,\n", " \"attention\": self.attention.gradients,\n", " }\n", " }\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary of the module parameters.\"\"\"\n", " return {\n", " \"components\": {\n", " \"Q\": self.projections[\"Q\"].parameters,\n", " \"K\": self.projections[\"K\"].parameters,\n", " \"V\": self.projections[\"V\"].parameters,\n", " \"O\": self.projections[\"O\"].parameters,\n", " \"attention\": self.attention.parameters,\n", " }\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the module hyperparameters.\"\"\"\n", " return {\n", " \"layer\": \"MultiHeadedAttentionModule\",\n", " \"init\": self.init,\n", " \"kqv_dim\": self.kqv_dim,\n", " \"latent_dim\": self.latent_dim,\n", " \"n_heads\": self.n_heads,\n", " \"dropout_p\": self.dropout_p,\n", " \"component_ids\": [\"attention\", \"Q\", \"K\", \"V\", \"O\"],\n", " \"components\": {\n", " \"Q\": self.projections[\"Q\"].hyperparameters,\n", " \"K\": self.projections[\"K\"].hyperparameters,\n", " \"V\": self.projections[\"V\"].hyperparameters,\n", " \"O\": self.projections[\"O\"].hyperparameters,\n", " \"attention\": self.attention.hyperparameters,\n", " },\n", " }\n"]} {"path": "numpy_ml/neural_nets/schedulers/__init__.py", "content": ["from .schedulers import *\n"]} {"path": "numpy_ml/neural_nets/schedulers/schedulers.py", "content": ["from copy import deepcopy\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "from math import erf\n", "\n", "\n", "def 
gaussian_cdf(x, mean, var):\n", " \"\"\"\n", " Compute the probability that a random draw from a 1D Gaussian with mean\n", " `mean` and variance `var` is less than or equal to `x`.\n", " \"\"\"\n", " eps = np.finfo(float).eps\n", " x_scaled = (x - mean) / np.sqrt(var + eps)\n", " return (1 + erf(x_scaled / np.sqrt(2))) / 2\n", "\n", "\n", "class SchedulerBase(ABC):\n", " def __init__(self):\n", " \"\"\"Abstract base class for all Scheduler objects.\"\"\"\n", " self.hyperparameters = {}\n", "\n", " def __call__(self, step=None, cur_loss=None):\n", " return self.learning_rate(step=step, cur_loss=cur_loss)\n", "\n", " def copy(self):\n", " \"\"\"Return a copy of the current object.\"\"\"\n", " return deepcopy(self)\n", "\n", " def set_params(self, hparam_dict):\n", " \"\"\"Set the scheduler hyperparameters from a dictionary.\"\"\"\n", " if hparam_dict is not None:\n", " for k, v in hparam_dict.items():\n", " if k in self.hyperparameters:\n", " self.hyperparameters[k] = v\n", "\n", " @abstractmethod\n", " def learning_rate(self, step=None):\n", " raise NotImplementedError\n", "\n", "\n", "class ConstantScheduler(SchedulerBase):\n", " def __init__(self, lr=0.01, **kwargs):\n", " \"\"\"\n", " Returns a fixed learning rate, regardless of the current step.\n", "\n", " Parameters\n", " ----------\n", " initial_lr : float\n", " The learning rate. Default is 0.01\n", " \"\"\"\n", " super().__init__()\n", " self.lr = lr\n", " self.hyperparameters = {\"id\": \"ConstantScheduler\", \"lr\": self.lr}\n", "\n", " def __str__(self):\n", " return \"ConstantScheduler(lr={})\".format(self.lr)\n", "\n", " def learning_rate(self, **kwargs):\n", " \"\"\"\n", " Return the current learning rate.\n", "\n", " Returns\n", " -------\n", " lr : float\n", " The learning rate\n", " \"\"\"\n", " return self.lr\n", "\n", "\n", "class ExponentialScheduler(SchedulerBase):\n", " def __init__(\n", " self, initial_lr=0.01, stage_length=500, staircase=False, decay=0.1, **kwargs\n", " ):\n", " \"\"\"\n", " An exponential learning rate scheduler.\n", "\n", " Notes\n", " -----\n", " The exponential scheduler decays the learning rate by `decay` every\n", " `stage_length` steps, starting from `initial_lr`::\n", "\n", " learning_rate = initial_lr * decay ** curr_stage\n", "\n", " where::\n", "\n", " curr_stage = step / stage_length if staircase = False\n", " curr_stage = floor(step / stage_length) if staircase = True\n", "\n", " Parameters\n", " ----------\n", " initial_lr : float\n", " The learning rate at the first step. Default is 0.01.\n", " stage_length : int\n", " The length of each stage, in steps. Default is 500.\n", " staircase : bool\n", " If True, only adjusts the learning rate at the stage transitions,\n", " producing a step-like decay schedule. If False, adjusts the\n", " learning rate after each step, creating a smooth decay schedule.\n", " Default is False.\n", " decay : float\n", " The amount to decay the learning rate at each new stage. 
Default is\n", " 0.1.\n", " \"\"\"\n", " super().__init__()\n", " self.decay = decay\n", " self.staircase = staircase\n", " self.initial_lr = initial_lr\n", " self.stage_length = stage_length\n", " self.hyperparameters = {\n", " \"id\": \"StepScheduler\",\n", " \"decay\": self.decay,\n", " \"staircase\": self.staircase,\n", " \"initial_lr\": self.initial_lr,\n", " \"stage_length\": self.stage_length,\n", " }\n", "\n", " def __str__(self):\n", " return \"ExponentialScheduler(initial_lr={}, stage_length={}, staircase={}, decay={})\".format(\n", " self.initial_lr, self.stage_length, self.staircase, self.decay\n", " )\n", "\n", " def learning_rate(self, step, **kwargs):\n", " \"\"\"\n", " Return the current learning rate as a function of `step`.\n", "\n", " Parameters\n", " ----------\n", " step : int\n", " The current step number.\n", "\n", " Returns\n", " -------\n", " lr : float\n", " The learning rate for the current step.\n", " \"\"\"\n", " cur_stage = step / self.stage_length\n", " if self.staircase:\n", " cur_stage = np.floor(cur_stage)\n", " return self.initial_lr * self.decay ** cur_stage\n", "\n", "\n", "class NoamScheduler(SchedulerBase):\n", " def __init__(self, model_dim=512, scale_factor=1, warmup_steps=4000, **kwargs):\n", " \"\"\"\n", " The Noam learning rate scheduler, originally used in conjunction with\n", " the Adam optimizer in [1].\n", "\n", " Notes\n", " -----\n", " The Noam scheduler increases the learning rate linearly for the first\n", " `warmup_steps` steps, and decreases it thereafter proportionally to the\n", " inverse square root of the step number::\n", "\n", " lr = scale_factor * ( (model_dim ** (-0.5)) * adj_step )\n", " adj_step = min(step_num ** (-0.5), step_num * warmup_steps ** (-1.5))\n", "\n", " References\n", " ----------\n", " .. [1] Vaswani et al. (2017) \"Attention is all you need\". *31st\n", " Conference on Neural Information Processing Systems*,\n", " https://arxiv.org/pdf/1706.03762.pdf\n", "\n", " Parameters\n", " ----------\n", " model_dim : int\n", " The number of units in the layer output. Default is 512.\n", " scale_factor : float\n", " A fixed coefficient for rescaling the final learning rate. Default\n", " is 1.\n", " warmup_steps : int\n", " The number of steps in the warmup stage of training. Default is\n", " 4000.\n", " \"\"\"\n", " super().__init__()\n", " self.model_dim = model_dim\n", " self.scale_factor = scale_factor\n", " self.warmup_steps = warmup_steps\n", " self.hyperparameters = {\n", " \"id\": \"NoamScheduler\",\n", " \"model_dim\": self.model_dim,\n", " \"scale_factor\": self.scale_factor,\n", " \"warmup_steps\": self.warmup_steps,\n", " }\n", "\n", " def __str__(self):\n", " return \"NoamScheduler(model_dim={}, scale_factor={}, warmup_steps={})\".format(\n", " self.model_dim, self.scale_factor, self.warmup_steps\n", " )\n", "\n", " def learning_rate(self, step, **kwargs):\n", " warmup, d_model = self.warmup_steps, self.model_dim\n", " new_lr = d_model ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))\n", " return self.scale_factor * new_lr\n", "\n", "\n", "class KingScheduler(SchedulerBase):\n", " def __init__(self, initial_lr=0.01, patience=1000, decay=0.99, **kwargs):\n", " \"\"\"\n", " The Davis King / DLib learning rate scheduler.\n", "\n", " Notes\n", " -----\n", " The KingScheduler computes the probability that the slope of the OLS\n", " fit to the loss history is negative. 
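# --- Illustrative sketch (not part of the library source) ------------------
# The two closed-form schedules above evaluated directly from their formulas,
# on made-up settings.
import numpy as np

# ExponentialScheduler: lr = initial_lr * decay ** (step / stage_length)
initial_lr, decay, stage_length, step = 0.01, 0.1, 500, 1250
smooth_lr = initial_lr * decay ** (step / stage_length)             # staircase=False
staircase_lr = initial_lr * decay ** np.floor(step / stage_length)  # staircase=True

# NoamScheduler: lr = scale * model_dim**-0.5 * min(step**-0.5, step * warmup**-1.5)
model_dim, warmup, scale = 512, 4000, 1
noam_lrs = [
    scale * model_dim ** (-0.5) * min(s ** (-0.5), s * warmup ** (-1.5))
    for s in (1, 1000, 4000, 10000)
]  # rises linearly during warmup, then decays as 1 / sqrt(step)
# ---------------------------------------------------------------------------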
If the probability that it is\n", " negative is less than 51% over the last `patience` steps, the scheduler\n", " exponentially decreases the current learning rate by `decay`.\n", "\n", " References\n", " ----------\n", " .. [1] King, D. (2018). \"Automatic learning rate scheduling that really\n", " works\". http://blog.dlib.net/2018/02/automatic-learning-rate-scheduling-that.html\n", "\n", " Parameters\n", " ----------\n", " initial_lr : float\n", " The learning rate to begin at. Default is 0.01.\n", " patience : int\n", " Amount of time to maintain the current learning rate without a\n", " decrease in loss before adjustment. Default is 1000.\n", " decay : float\n", " The amount to decay the learning rate at each new stage. Default is\n", " 0.99.\n", " \"\"\"\n", " super().__init__()\n", " self.decay = decay\n", " self.patience = patience\n", " self.initial_lr = initial_lr\n", " self.current_lr = initial_lr\n", " self.max_history = np.ceil(1.1 * (patience + 1)).astype(int)\n", "\n", " self.loss_history = []\n", " self.hyperparameters = {\n", " \"id\": \"KingScheduler\",\n", " \"decay\": self.decay,\n", " \"patience\": self.patience,\n", " \"initial_lr\": self.initial_lr,\n", " }\n", "\n", " def __str__(self):\n", " return \"KingScheduler(initial_lr={}, patience={}, decay={})\".format(\n", " self.initial_lr, self.patience, self.decay\n", " )\n", "\n", " def _steps_without_decrease(self, robust=False, check_all=False):\n", " \"\"\"\n", " Returns the maximum number of timesteps for which `P(loss is decreasing)\n", " < 0.51`.\n", "\n", " Parameters\n", " ----------\n", " robust : bool\n", " If `robust=True`, first filter out the largest 10% of the loss\n", " values to remove transient spikes in the loss due to, e.g., a few\n", " bad minibatches. Default is False.\n", " check_all : bool\n", " If False, returns the maximum number of timesteps for which P(loss\n", " is decreasing) < 0.51. If True, only checks whether the number of\n", " timesteps for which P(loss is decreasing) < 0.51 is equal to\n", " ``self.patience``. The former provides more information but is\n", " significantly more computationally expensive. 
Default is False.\n", "\n", " Returns\n", " -------\n", " steps_without_decrease: int\n", " The maximum number of steps back in loss_history for which P(loss\n", " is decreasing) < 0.51.\n", " \"\"\"\n", " lh = np.array(self.loss_history)\n", "\n", " # drop top 10% of loss values to filter out large loss spikes\n", " if robust:\n", " thresh = np.quantile(lh, 0.9)\n", " lh = np.array([i for i in lh if i <= thresh])\n", "\n", " N = len(lh)\n", " steps_without_decrease = 0\n", " if check_all:\n", " for i in reversed(range(N - 2)):\n", " if self._p_decreasing(lh, i) < 0.51:\n", " steps_without_decrease = N - i\n", " else:\n", " i = max(0, N - self.patience - 1)\n", " if self._p_decreasing(lh, i) < 0.51:\n", " steps_without_decrease = N - i\n", " return steps_without_decrease\n", "\n", " def _p_decreasing(self, loss_history, i):\n", " \"\"\"\n", " Compute the probability that the slope of the OLS fit to the loss\n", " history is negative.\n", "\n", " Parameters\n", " ----------\n", " loss_history : numpy array of shape (N,)\n", " The sequence of loss values for the previous `N` minibatches.\n", " i : int\n", " Compute P(Slope < 0) beginning at index i in `history`.\n", "\n", " Returns\n", " ------\n", " p_decreasing : float\n", " The probability that the slope of the OLS fit to loss_history is\n", " less than or equal to 0.\n", " \"\"\"\n", " loss = loss_history[i:]\n", " N = len(loss)\n", "\n", " # perform OLS on the loss entries to calc the slope mean\n", " X = np.c_[np.ones(N), np.arange(i, len(loss_history))]\n", " intercept, s_mean = np.linalg.inv(X.T @ X) @ X.T @ loss\n", " loss_pred = s_mean * X[:, 1] + intercept\n", "\n", " # compute the variance of our loss predictions and use this to compute\n", " # the (unbiased) estimate of the slope variance\n", " loss_var = 1 / (N - 2) * np.sum((loss - loss_pred) ** 2)\n", " s_var = (12 * loss_var) / (N ** 3 - N)\n", "\n", " # compute the probability that a random sample from a Gaussian\n", " # parameterized by s_mean and s_var is less than or equal to 0\n", " p_decreasing = gaussian_cdf(0, s_mean, s_var)\n", " return p_decreasing\n", "\n", " def learning_rate(self, step, cur_loss):\n", " \"\"\"\n", " Compute the updated learning rate for the current step and loss.\n", "\n", " Parameters\n", " ----------\n", " step : int\n", " The current step number. 
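# --- Illustrative sketch (not part of the library source) ------------------
# The slope test behind `_p_decreasing` above, on a made-up loss history:
# fit a line by OLS, estimate the variance of the slope, and ask how likely
# the true slope is to be <= 0.
import numpy as np
from math import erf

loss = np.array([1.00, 0.95, 0.96, 0.90, 0.88, 0.85, 0.86, 0.80])
N = len(loss)

X = np.c_[np.ones(N), np.arange(N)]
intercept, s_mean = np.linalg.inv(X.T @ X) @ X.T @ loss  # OLS fit
resid = loss - (s_mean * X[:, 1] + intercept)

loss_var = np.sum(resid ** 2) / (N - 2)  # unbiased residual variance
s_var = (12 * loss_var) / (N ** 3 - N)   # variance of the slope estimate

z = (0 - s_mean) / np.sqrt(s_var + np.finfo(float).eps)
p_decreasing = (1 + erf(z / np.sqrt(2))) / 2  # P(slope <= 0); near 1 here
print(round(p_decreasing, 3))
# ---------------------------------------------------------------------------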
Unused.\n", " cur_loss : float\n", " The loss at the current step.\n", "\n", " Returns\n", " -------\n", " lr : float\n", " The learning rate for the current step.\n", " \"\"\"\n", " if cur_loss is None:\n", " raise ValueError(\"cur_loss must be a float, but got {}\".format(cur_loss))\n", "\n", " # this happens if we initialize the scheduler from a string / dict\n", " if not hasattr(self, \"max_history\"):\n", " self.max_history = np.ceil(1.1 * (self.patience + 1)).astype(int)\n", " patience, max_history = self.patience, self.max_history\n", "\n", " self.loss_history.append(cur_loss)\n", " if len(self.loss_history) < patience:\n", " return self.current_lr\n", " self.loss_history = self.loss_history[-max_history:]\n", "\n", " # if the loss has not decreased for `patience` timesteps, drop the\n", " # learning rate\n", " if (\n", " self._steps_without_decrease() > patience\n", " and self._steps_without_decrease(robust=True) > patience\n", " ):\n", " self.current_lr *= self.decay\n", "\n", " return self.current_lr\n"]} {"path": "numpy_ml/neural_nets/initializers/__init__.py", "content": ["from .initializers import *\n"]} {"path": "numpy_ml/neural_nets/initializers/initializers.py", "content": ["\"\"\"A module containing objects to instantiate various neural network components.\"\"\"\n", "import re\n", "from functools import partial\n", "from ast import literal_eval as _eval\n", "\n", "import numpy as np\n", "\n", "from ..optimizers import OptimizerBase, SGD, AdaGrad, RMSProp, Adam\n", "from ..activations import (\n", " ELU,\n", " GELU,\n", " SELU,\n", " ReLU,\n", " Tanh,\n", " Affine,\n", " Sigmoid,\n", " Identity,\n", " SoftPlus,\n", " LeakyReLU,\n", " Exponential,\n", " HardSigmoid,\n", " ActivationBase,\n", ")\n", "from ..schedulers import (\n", " SchedulerBase,\n", " ConstantScheduler,\n", " ExponentialScheduler,\n", " NoamScheduler,\n", " KingScheduler,\n", ")\n", "\n", "from ..utils import (\n", " he_normal,\n", " he_uniform,\n", " glorot_normal,\n", " glorot_uniform,\n", " truncated_normal,\n", ")\n", "\n", "\n", "class ActivationInitializer(object):\n", " def __init__(self, param=None):\n", " \"\"\"\n", " A class for initializing activation functions. 
Valid `param` values\n", " are:\n", " (a) ``__str__`` representations of an `ActivationBase` instance\n", " (b) `ActivationBase` instance\n", "\n", " If `param` is `None`, return the identity function: f(X) = X\n", " \"\"\"\n", " self.param = param\n", "\n", " def __call__(self):\n", " \"\"\"Initialize activation function\"\"\"\n", " param = self.param\n", " if param is None:\n", " act = Identity()\n", " elif isinstance(param, ActivationBase):\n", " act = param\n", " elif isinstance(param, str):\n", " act = self.init_from_str(param)\n", " else:\n", " raise ValueError(\"Unknown activation: {}\".format(param))\n", " return act\n", "\n", " def init_from_str(self, act_str):\n", " \"\"\"Initialize activation function from the `param` string\"\"\"\n", " act_str = act_str.lower()\n", " if act_str == \"relu\":\n", " act_fn = ReLU()\n", " elif act_str == \"tanh\":\n", " act_fn = Tanh()\n", " elif act_str == \"selu\":\n", " act_fn = SELU()\n", " elif act_str == \"sigmoid\":\n", " act_fn = Sigmoid()\n", " elif act_str == \"identity\":\n", " act_fn = Identity()\n", " elif act_str == \"hardsigmoid\":\n", " act_fn = HardSigmoid()\n", " elif act_str == \"softplus\":\n", " act_fn = SoftPlus()\n", " elif act_str == \"exponential\":\n", " act_fn = Exponential()\n", " elif \"affine\" in act_str:\n", " r = r\"affine\\(slope=(.*), intercept=(.*)\\)\"\n", " slope, intercept = re.match(r, act_str).groups()\n", " act_fn = Affine(float(slope), float(intercept))\n", " elif \"leaky relu\" in act_str:\n", " r = r\"leaky relu\\(alpha=(.*)\\)\"\n", " alpha = re.match(r, act_str).groups()[0]\n", " act_fn = LeakyReLU(float(alpha))\n", " elif \"gelu\" in act_str:\n", " r = r\"gelu\\(approximate=(.*)\\)\"\n", " approx = re.match(r, act_str).groups()[0] == \"true\"\n", " act_fn = GELU(approximation=approx)\n", " elif \"elu\" in act_str:\n", " r = r\"elu\\(alpha=(.*)\\)\"\n", " alpha = re.match(r, act_str).groups()[0]\n", " act_fn = ELU(alpha=float(alpha))\n", " else:\n", " raise ValueError(\"Unknown activation: {}\".format(act_str))\n", " return act_fn\n", "\n", "\n", "class SchedulerInitializer(object):\n", " def __init__(self, param=None, lr=None):\n", " \"\"\"\n", " A class for initializing learning rate schedulers. 
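# Minimal usage sketch for ActivationInitializer (illustrative): build
# activation objects from strings, including parameterized variants.
from numpy_ml.neural_nets.initializers import ActivationInitializer

relu = ActivationInitializer("relu")()                          # ReLU instance
leaky = ActivationInitializer("leaky relu(alpha=0.3)")()        # LeakyReLU(0.3)
affine = ActivationInitializer("affine(slope=2.0, intercept=1.0)")()
identity = ActivationInitializer()()                            # param=None -> Identity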
Valid `param` values\n", " are:\n", " (a) __str__ representations of `SchedulerBase` instances\n", " (b) `SchedulerBase` instances\n", " (c) Parameter dicts (e.g., as produced via the `summary` method in\n", " `LayerBase` instances)\n", "\n", " If `param` is `None`, return the ConstantScheduler with learning rate\n", " equal to `lr`.\n", " \"\"\"\n", " if all([lr is None, param is None]):\n", " raise ValueError(\"lr and param cannot both be `None`\")\n", "\n", " self.lr = lr\n", " self.param = param\n", "\n", " def __call__(self):\n", " \"\"\"Initialize scheduler\"\"\"\n", " param = self.param\n", " if param is None:\n", " scheduler = ConstantScheduler(self.lr)\n", " elif isinstance(param, SchedulerBase):\n", " scheduler = param\n", " elif isinstance(param, str):\n", " scheduler = self.init_from_str()\n", " elif isinstance(param, dict):\n", " scheduler = self.init_from_dict()\n", " return scheduler\n", "\n", " def init_from_str(self):\n", " \"\"\"Initialize scheduler from the param string\"\"\"\n", " r = r\"([a-zA-Z]*)=([^,)]*)\"\n", " sch_str = self.param.lower()\n", " kwargs = {i: _eval(j) for i, j in re.findall(r, sch_str)}\n", "\n", " if \"constant\" in sch_str:\n", " scheduler = ConstantScheduler(**kwargs)\n", " elif \"exponential\" in sch_str:\n", " scheduler = ExponentialScheduler(**kwargs)\n", " elif \"noam\" in sch_str:\n", " scheduler = NoamScheduler(**kwargs)\n", " elif \"king\" in sch_str:\n", " scheduler = KingScheduler(**kwargs)\n", " else:\n", " raise NotImplementedError(\"{}\".format(sch_str))\n", " return scheduler\n", "\n", " def init_from_dict(self):\n", " \"\"\"Initialize scheduler from the param dictionary\"\"\"\n", " S = self.param\n", " sc = S[\"hyperparameters\"] if \"hyperparameters\" in S else None\n", "\n", " if sc is None:\n", " raise ValueError(\"Must have `hyperparameters` key: {}\".format(S))\n", "\n", " if sc and sc[\"id\"] == \"ConstantScheduler\":\n", " scheduler = ConstantScheduler()\n", " elif sc and sc[\"id\"] == \"ExponentialScheduler\":\n", " scheduler = ExponentialScheduler()\n", " elif sc and sc[\"id\"] == \"NoamScheduler\":\n", " scheduler = NoamScheduler()\n", " elif sc:\n", " raise NotImplementedError(\"{}\".format(sc[\"id\"]))\n", " scheduler.set_params(sc)\n", " return scheduler\n", "\n", "\n", "class OptimizerInitializer(object):\n", " def __init__(self, param=None):\n", " \"\"\"\n", " A class for initializing optimizers. 
Valid `param` values are:\n", " (a) __str__ representations of `OptimizerBase` instances\n", " (b) `OptimizerBase` instances\n", " (c) Parameter dicts (e.g., as produced via the `summary` method in\n", " `LayerBase` instances)\n", "\n", " If `param` is `None`, return the SGD optimizer with default parameters.\n", " \"\"\"\n", " self.param = param\n", "\n", " def __call__(self):\n", " \"\"\"Initialize the optimizer\"\"\"\n", " param = self.param\n", " if param is None:\n", " opt = SGD()\n", " elif isinstance(param, OptimizerBase):\n", " opt = param\n", " elif isinstance(param, str):\n", " opt = self.init_from_str()\n", " elif isinstance(param, dict):\n", " opt = self.init_from_dict()\n", " return opt\n", "\n", " def init_from_str(self):\n", " \"\"\"Initialize optimizer from the `param` string\"\"\"\n", " r = r\"([a-zA-Z]*)=([^,)]*)\"\n", " opt_str = self.param.lower()\n", " kwargs = {i: _eval(j) for i, j in re.findall(r, opt_str)}\n", " if \"sgd\" in opt_str:\n", " optimizer = SGD(**kwargs)\n", " elif \"adagrad\" in opt_str:\n", " optimizer = AdaGrad(**kwargs)\n", " elif \"rmsprop\" in opt_str:\n", " optimizer = RMSProp(**kwargs)\n", " elif \"adam\" in opt_str:\n", " optimizer = Adam(**kwargs)\n", " else:\n", " raise NotImplementedError(\"{}\".format(opt_str))\n", " return optimizer\n", "\n", " def init_from_dict(self):\n", " \"\"\"Initialize optimizer from the `param` dictonary\"\"\"\n", " D = self.param\n", " cc = D[\"cache\"] if \"cache\" in D else None\n", " op = D[\"hyperparameters\"] if \"hyperparameters\" in D else None\n", "\n", " if op is None:\n", " raise ValueError(\"`param` dictionary has no `hyperparemeters` key\")\n", "\n", " if op and op[\"id\"] == \"SGD\":\n", " optimizer = SGD()\n", " elif op and op[\"id\"] == \"RMSProp\":\n", " optimizer = RMSProp()\n", " elif op and op[\"id\"] == \"AdaGrad\":\n", " optimizer = AdaGrad()\n", " elif op and op[\"id\"] == \"Adam\":\n", " optimizer = Adam()\n", " elif op:\n", " raise NotImplementedError(\"{}\".format(op[\"id\"]))\n", " optimizer.set_params(op, cc)\n", " return optimizer\n", "\n", "\n", "class WeightInitializer(object):\n", " def __init__(self, act_fn_str, mode=\"glorot_uniform\"):\n", " \"\"\"\n", " A factory for weight initializers.\n", "\n", " Parameters\n", " ----------\n", " act_fn_str : str\n", " The string representation for the layer activation function\n", " mode : str (default: 'glorot_uniform')\n", " The weight initialization strategy. 
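# Minimal sketch of the scheduler / optimizer factories defined above
# (illustrative settings).
from numpy_ml.neural_nets.initializers import (
    OptimizerInitializer,
    SchedulerInitializer,
)

lr_sched = SchedulerInitializer(param=None, lr=0.01)()   # ConstantScheduler at lr=0.01
opt = OptimizerInitializer()()                           # SGD with default settings

# String specs are parsed with a small regex plus ast.literal_eval; the kwarg
# names must match the target class (using `lr` for SGD here is an assumption).
opt_from_str = OptimizerInitializer("sgd(lr=0.01)")()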
Valid entries are {\"he_normal\",\n", " \"he_uniform\", \"glorot_normal\", \"glorot_uniform\", \"std_normal\",\n", " \"trunc_normal\"}\n", " \"\"\"\n", " if mode not in [\n", " \"he_normal\",\n", " \"he_uniform\",\n", " \"glorot_normal\",\n", " \"glorot_uniform\",\n", " \"std_normal\",\n", " \"trunc_normal\",\n", " ]:\n", " raise ValueError(\"Unrecognized initialization mode: {}\".format(mode))\n", "\n", " self.mode = mode\n", " self.act_fn = act_fn_str\n", "\n", " if mode == \"glorot_uniform\":\n", " self._fn = glorot_uniform\n", " elif mode == \"glorot_normal\":\n", " self._fn = glorot_normal\n", " elif mode == \"he_uniform\":\n", " self._fn = he_uniform\n", " elif mode == \"he_normal\":\n", " self._fn = he_normal\n", " elif mode == \"std_normal\":\n", " self._fn = np.random.randn\n", " elif mode == \"trunc_normal\":\n", " self._fn = partial(truncated_normal, mean=0, std=1)\n", "\n", " def __call__(self, weight_shape):\n", " \"\"\"Initialize weights according to the specified strategy\"\"\"\n", " if \"glorot\" in self.mode:\n", " gain = self._calc_glorot_gain()\n", " W = self._fn(weight_shape, gain)\n", " elif self.mode == \"std_normal\":\n", " W = self._fn(*weight_shape)\n", " else:\n", " W = self._fn(weight_shape)\n", " return W\n", "\n", " def _calc_glorot_gain(self):\n", " \"\"\"\n", " Values from:\n", " https://pytorch.org/docs/stable/nn.html?#torch.nn.init.calculate_gain\n", " \"\"\"\n", " gain = 1.0\n", " act_str = self.act_fn.lower()\n", " if act_str == \"tanh\":\n", " gain = 5.0 / 3.0\n", " elif act_str == \"relu\":\n", " gain = np.sqrt(2)\n", " elif \"leaky relu\" in act_str:\n", " r = r\"leaky relu\\(alpha=(.*)\\)\"\n", " alpha = re.match(r, act_str).groups()[0]\n", " gain = np.sqrt(2 / (1 + float(alpha) ** 2))\n", " return gain\n"]} {"path": "numpy_ml/trees/rf.py", "content": ["import numpy as np\n", "from .dt import DecisionTree\n", "\n", "\n", "def bootstrap_sample(X, Y):\n", " N, M = X.shape\n", " idxs = np.random.choice(N, N, replace=True)\n", " return X[idxs], Y[idxs]\n", "\n", "\n", "class RandomForest:\n", " def __init__(\n", " self, n_trees, max_depth, n_feats, classifier=True, criterion=\"entropy\"\n", " ):\n", " \"\"\"\n", " An ensemble (forest) of decision trees where each split is calculated\n", " using a random subset of the features in the input.\n", "\n", " Parameters\n", " ----------\n", " n_trees : int\n", " The number of individual decision trees to use within the ensemble.\n", " max_depth: int or None\n", " The depth at which to stop growing each decision tree. If None,\n", " grow each tree until the leaf nodes are pure.\n", " n_feats : int\n", " The number of features to sample on each split.\n", " classifier : bool\n", " Whether `Y` contains class labels or real-valued targets. Default\n", " is True.\n", " criterion : {'entropy', 'gini', 'mse'}\n", " The error criterion to use when calculating splits for each weak\n", " learner. 
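# Minimal usage sketch for WeightInitializer (illustrative): the activation
# string is only consulted when computing the gain for the Glorot modes.
from numpy_ml.neural_nets.initializers import WeightInitializer

init = WeightInitializer("relu", mode="glorot_uniform")
W = init((784, 128))        # weights for a 784 -> 128 layer, shape (784, 128)

he = WeightInitializer("leaky relu(alpha=0.2)", mode="he_normal")
W2 = he((64, 64))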
When ``classifier = False``, valid entries are {'mse'}.\n", " When ``classifier = True``, valid entries are {'entropy', 'gini'}.\n", " Default is 'entropy'.\n", " \"\"\"\n", " self.trees = []\n", " self.n_trees = n_trees\n", " self.n_feats = n_feats\n", " self.max_depth = max_depth\n", " self.criterion = criterion\n", " self.classifier = classifier\n", "\n", " def fit(self, X, Y):\n", " \"\"\"\n", " Create `n_trees`-worth of bootstrapped samples from the training data\n", " and use each to fit a separate decision tree.\n", " \"\"\"\n", " self.trees = []\n", " for _ in range(self.n_trees):\n", " X_samp, Y_samp = bootstrap_sample(X, Y)\n", " tree = DecisionTree(\n", " n_feats=self.n_feats,\n", " max_depth=self.max_depth,\n", " criterion=self.criterion,\n", " classifier=self.classifier,\n", " )\n", " tree.fit(X_samp, Y_samp)\n", " self.trees.append(tree)\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Predict the target value for each entry in `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " The training data of `N` examples, each with `M` features.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(N,)`\n", " Model predictions for each entry in `X`.\n", " \"\"\"\n", " tree_preds = np.array([[t._traverse(x, t.root) for x in X] for t in self.trees])\n", " return self._vote(tree_preds)\n", "\n", " def _vote(self, predictions):\n", " \"\"\"\n", " Return the aggregated prediction across all trees in the RF for each problem.\n", "\n", " Parameters\n", " ----------\n", " predictions : :py:class:`ndarray ` of shape `(n_trees, N)`\n", " The array of predictions from each decision tree in the RF for each\n", " of the `N` problems in `X`.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(N,)`\n", " If classifier is True, the class label predicted by the majority of\n", " the decision trees for each problem in `X`. If classifier is False,\n", " the average prediction across decision trees on each problem.\n", " \"\"\"\n", " if self.classifier:\n", " out = [np.bincount(x).argmax() for x in predictions.T]\n", " else:\n", " out = [np.mean(x) for x in predictions.T]\n", " return np.array(out)\n"]} {"path": "numpy_ml/trees/dt.py", "content": ["import numpy as np\n", "\n", "\n", "class Node:\n", " def __init__(self, left, right, rule):\n", " self.left = left\n", " self.right = right\n", " self.feature = rule[0]\n", " self.threshold = rule[1]\n", "\n", "\n", "class Leaf:\n", " def __init__(self, value):\n", " \"\"\"\n", " `value` is an array of class probabilities if classifier is True, else\n", " the mean of the region\n", " \"\"\"\n", " self.value = value\n", "\n", "\n", "class DecisionTree:\n", " def __init__(\n", " self,\n", " classifier=True,\n", " max_depth=None,\n", " n_feats=None,\n", " criterion=\"entropy\",\n", " seed=None,\n", " ):\n", " \"\"\"\n", " A decision tree model for regression and classification problems.\n", "\n", " Parameters\n", " ----------\n", " classifier : bool\n", " Whether to treat target values as categorical (classifier =\n", " True) or continuous (classifier = False). Default is True.\n", " max_depth: int or None\n", " The depth at which to stop growing the tree. If None, grow the tree\n", " until all leaves are pure. Default is None.\n", " n_feats : int\n", " Specifies the number of features to sample on each split. If None,\n", " use all features on each split. 
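# Minimal sketch of fitting the RandomForest defined above on toy data
# (all values are illustrative).
import numpy as np
from numpy_ml.trees import RandomForest

X = np.random.uniform(-1, 1, size=(100, 5))
y = (X[:, 0] + X[:, 1] > 0).astype(int)      # binary labels

rf = RandomForest(n_trees=10, max_depth=3, n_feats=3, classifier=True)
rf.fit(X, y)
print((rf.predict(X) == y).mean())           # training accuracy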
Default is None.\n", " criterion : {'mse', 'entropy', 'gini'}\n", " The error criterion to use when calculating splits. When\n", " `classifier` is False, valid entries are {'mse'}. When `classifier`\n", " is True, valid entries are {'entropy', 'gini'}. Default is\n", " 'entropy'.\n", " seed : int or None\n", " Seed for the random number generator. Default is None.\n", " \"\"\"\n", " if seed:\n", " np.random.seed(seed)\n", "\n", " self.depth = 0\n", " self.root = None\n", "\n", " self.n_feats = n_feats\n", " self.criterion = criterion\n", " self.classifier = classifier\n", " self.max_depth = max_depth if max_depth else np.inf\n", "\n", " if not classifier and criterion in [\"gini\", \"entropy\"]:\n", " raise ValueError(\n", " \"{} is a valid criterion only when classifier = True.\".format(criterion)\n", " )\n", " if classifier and criterion == \"mse\":\n", " raise ValueError(\"`mse` is a valid criterion only when classifier = False.\")\n", "\n", " def fit(self, X, Y):\n", " \"\"\"\n", " Fit a binary decision tree to a dataset.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " The training data of `N` examples, each with `M` features\n", " Y : :py:class:`ndarray ` of shape `(N,)`\n", " An array of integer class labels for each example in `X` if\n", " self.classifier = True, otherwise the set of target values for\n", " each example in `X`.\n", " \"\"\"\n", " self.n_classes = max(Y) + 1 if self.classifier else None\n", " self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])\n", " self.root = self._grow(X, Y)\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Use the trained decision tree to classify or predict the examples in `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " The training data of `N` examples, each with `M` features\n", "\n", " Returns\n", " -------\n", " preds : :py:class:`ndarray ` of shape `(N,)`\n", " The integer class labels predicted for each example in `X` if\n", " self.classifier = True, otherwise the predicted target values.\n", " \"\"\"\n", " return np.array([self._traverse(x, self.root) for x in X])\n", "\n", " def predict_class_probs(self, X):\n", " \"\"\"\n", " Use the trained decision tree to return the class probabilities for the\n", " examples in `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " The training data of `N` examples, each with `M` features\n", "\n", " Returns\n", " -------\n", " preds : :py:class:`ndarray ` of shape `(N, n_classes)`\n", " The class probabilities predicted for each example in `X`.\n", " \"\"\"\n", " assert self.classifier, \"`predict_class_probs` undefined for classifier = False\"\n", " return np.array([self._traverse(x, self.root, prob=True) for x in X])\n", "\n", " def _grow(self, X, Y, cur_depth=0):\n", " # if all labels are the same, return a leaf\n", " if len(set(Y)) == 1:\n", " if self.classifier:\n", " prob = np.zeros(self.n_classes)\n", " prob[Y[0]] = 1.0\n", " return Leaf(prob) if self.classifier else Leaf(Y[0])\n", "\n", " # if we have reached max_depth, return a leaf\n", " if cur_depth >= self.max_depth:\n", " v = np.mean(Y, axis=0)\n", " if self.classifier:\n", " v = np.bincount(Y, minlength=self.n_classes) / len(Y)\n", " return Leaf(v)\n", "\n", " cur_depth += 1\n", " self.depth = max(self.depth, cur_depth)\n", "\n", " N, M = X.shape\n", " feat_idxs = np.random.choice(M, self.n_feats, replace=False)\n", "\n", " # greedily select the best split 
according to `criterion`\n", " feat, thresh = self._segment(X, Y, feat_idxs)\n", " l = np.argwhere(X[:, feat] <= thresh).flatten()\n", " r = np.argwhere(X[:, feat] > thresh).flatten()\n", "\n", " # grow the children that result from the split\n", " left = self._grow(X[l, :], Y[l], cur_depth)\n", " right = self._grow(X[r, :], Y[r], cur_depth)\n", " return Node(left, right, (feat, thresh))\n", "\n", " def _segment(self, X, Y, feat_idxs):\n", " \"\"\"\n", " Find the optimal split rule (feature index and split threshold) for the\n", " data according to `self.criterion`.\n", " \"\"\"\n", " best_gain = -np.inf\n", " split_idx, split_thresh = None, None\n", " for i in feat_idxs:\n", " vals = X[:, i]\n", " levels = np.unique(vals)\n", " thresholds = (levels[:-1] + levels[1:]) / 2 if len(levels) > 1 else levels\n", " gains = np.array([self._impurity_gain(Y, t, vals) for t in thresholds])\n", "\n", " if gains.max() > best_gain:\n", " split_idx = i\n", " best_gain = gains.max()\n", " split_thresh = thresholds[gains.argmax()]\n", "\n", " return split_idx, split_thresh\n", "\n", " def _impurity_gain(self, Y, split_thresh, feat_values):\n", " \"\"\"\n", " Compute the impurity gain associated with a given split.\n", "\n", " IG(split) = loss(parent) - weighted_avg[loss(left_child), loss(right_child)]\n", " \"\"\"\n", " if self.criterion == \"entropy\":\n", " loss = entropy\n", " elif self.criterion == \"gini\":\n", " loss = gini\n", " elif self.criterion == \"mse\":\n", " loss = mse\n", "\n", " parent_loss = loss(Y)\n", "\n", " # generate split\n", " left = np.argwhere(feat_values <= split_thresh).flatten()\n", " right = np.argwhere(feat_values > split_thresh).flatten()\n", "\n", " if len(left) == 0 or len(right) == 0:\n", " return 0\n", "\n", " # compute the weighted avg. of the loss for the children\n", " n = len(Y)\n", " n_l, n_r = len(left), len(right)\n", " e_l, e_r = loss(Y[left]), loss(Y[right])\n", " child_loss = (n_l / n) * e_l + (n_r / n) * e_r\n", "\n", " # impurity gain is difference in loss before vs. after split\n", " ig = parent_loss - child_loss\n", " return ig\n", "\n", " def _traverse(self, X, node, prob=False):\n", " if isinstance(node, Leaf):\n", " if self.classifier:\n", " return node.value if prob else node.value.argmax()\n", " return node.value\n", " if X[node.feature] <= node.threshold:\n", " return self._traverse(X, node.left, prob)\n", " return self._traverse(X, node.right, prob)\n", "\n", "\n", "def mse(y):\n", " \"\"\"\n", " Mean squared error for decision tree (ie., mean) predictions\n", " \"\"\"\n", " return np.mean((y - np.mean(y)) ** 2)\n", "\n", "\n", "def entropy(y):\n", " \"\"\"\n", " Entropy of a label sequence\n", " \"\"\"\n", " hist = np.bincount(y)\n", " ps = hist / np.sum(hist)\n", " return -np.sum([p * np.log2(p) for p in ps if p > 0])\n", "\n", "\n", "def gini(y):\n", " \"\"\"\n", " Gini impurity (local entropy) of a label sequence\n", " \"\"\"\n", " hist = np.bincount(y)\n", " N = np.sum(hist)\n", " return 1 - sum([(i / N) ** 2 for i in hist])\n"]} {"path": "numpy_ml/trees/__init__.py", "content": ["from . 
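# Minimal sketch of the DecisionTree and the impurity helpers defined above
# (illustrative data).
import numpy as np
from numpy_ml.trees.dt import DecisionTree, entropy, gini

labels = np.array([0, 0, 1, 1, 1])
print(entropy(labels), gini(labels))         # impurity of a label sequence

X = np.random.uniform(-1, 1, size=(50, 3))
y = (X @ np.array([1.0, -1.0, 0.5]) > 0).astype(int)

dt = DecisionTree(classifier=True, max_depth=4, criterion="gini")
dt.fit(X, y)
probs = dt.predict_class_probs(X)            # shape (50, 2)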
import losses\n", "from .dt import *\n", "from .rf import *\n", "from .gbdt import *\n"]} {"path": "numpy_ml/trees/losses.py", "content": ["import numpy as np\n", "\n", "#######################################################################\n", "# Base Estimators #\n", "#######################################################################\n", "\n", "\n", "class ClassProbEstimator:\n", " def fit(self, X, y):\n", " self.class_prob = y.sum() / len(y)\n", "\n", " def predict(self, X):\n", " pred = np.empty(X.shape[0], dtype=np.float64)\n", " pred.fill(self.class_prob)\n", " return pred\n", "\n", "\n", "class MeanBaseEstimator:\n", " def fit(self, X, y):\n", " self.avg = np.mean(y)\n", "\n", " def predict(self, X):\n", " pred = np.empty(X.shape[0], dtype=np.float64)\n", " pred.fill(self.avg)\n", " return pred\n", "\n", "\n", "#######################################################################\n", "# Loss Functions #\n", "#######################################################################\n", "\n", "\n", "class MSELoss:\n", " def __call__(self, y, y_pred):\n", " return np.mean((y - y_pred) ** 2)\n", "\n", " def base_estimator(self):\n", " return MeanBaseEstimator()\n", "\n", " def grad(self, y, y_pred):\n", " return -2 / len(y) * (y - y_pred)\n", "\n", " def line_search(self, y, y_pred, h_pred):\n", " # TODO: revise this\n", " Lp = np.sum((y - y_pred) * h_pred)\n", " Lpp = np.sum(h_pred * h_pred)\n", "\n", " # if we perfectly fit the residuals, use max step size\n", " return 1 if np.sum(Lpp) == 0 else Lp / Lpp\n", "\n", "\n", "class CrossEntropyLoss:\n", " def __call__(self, y, y_pred):\n", " eps = np.finfo(float).eps\n", " return -np.sum(y * np.log(y_pred + eps))\n", "\n", " def base_estimator(self):\n", " return ClassProbEstimator()\n", "\n", " def grad(self, y, y_pred):\n", " eps = np.finfo(float).eps\n", " return -y * 1 / (y_pred + eps)\n", "\n", " def line_search(self, y, y_pred, h_pred):\n", " raise NotImplementedError\n"]} {"path": "numpy_ml/trees/gbdt.py", "content": ["import numpy as np\n", "\n", "from .dt import DecisionTree\n", "from .losses import MSELoss, CrossEntropyLoss\n", "\n", "\n", "def to_one_hot(labels, n_classes=None):\n", " if labels.ndim > 1:\n", " raise ValueError(\"labels must have dimension 1, but got {}\".format(labels.ndim))\n", "\n", " N = labels.size\n", " n_cols = np.max(labels) + 1 if n_classes is None else n_classes\n", " one_hot = np.zeros((N, n_cols))\n", " one_hot[np.arange(N), labels] = 1.0\n", " return one_hot\n", "\n", "\n", "class GradientBoostedDecisionTree:\n", " def __init__(\n", " self,\n", " n_iter,\n", " max_depth=None,\n", " classifier=True,\n", " learning_rate=1,\n", " loss=\"crossentropy\",\n", " step_size=\"constant\",\n", " ):\n", " \"\"\"\n", " A gradient boosted ensemble of decision trees.\n", "\n", " Notes\n", " -----\n", " Gradient boosted machines (GBMs) fit an ensemble of `m` weak learners such that:\n", "\n", " .. math::\n", "\n", " f_m(X) = b(X) + \\eta w_1 g_1 + \\ldots + \\eta w_m g_m\n", "\n", " where `b` is a fixed initial estimate for the targets, :math:`\\eta` is\n", " a learning rate parameter, and :math:`w_{\\cdot}` and :math:`g_{\\cdot}`\n", " denote the weights and learner predictions for subsequent fits.\n", "\n", " We fit each `w` and `g` iteratively using a greedy strategy so that at each\n", " iteration `i`,\n", "\n", " .. 
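# Quick sketch of the base estimator and MSE loss used by the gradient booster
# defined below (illustrative values).
import numpy as np
from numpy_ml.trees.losses import MSELoss, MeanBaseEstimator

y = np.array([1.0, 2.0, 3.0, 4.0])
base = MeanBaseEstimator()
base.fit(None, y)                        # X is unused by this estimator
y_pred = base.predict(np.empty((4, 1)))  # constant prediction of 2.5

loss = MSELoss()
print(loss(y, y_pred))                   # 1.25
print(loss.grad(y, y_pred))              # -2/N * (y - y_pred)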
math::\n", "\n", " w_i, g_i = \\\\arg \\min_{w_i, g_i} L(Y, f_{i-1}(X) + w_i g_i)\n", "\n", " On each iteration we fit a new weak learner to predict the negative\n", " gradient of the loss with respect to the previous prediction, :math:`f_{i-1}(X)`.\n", " We then use the element-wise product of the predictions of this weak\n", " learner, :math:`g_i`, with a weight, :math:`w_i`, to compute the amount to\n", " adjust the predictions of our model at the previous iteration, :math:`f_{i-1}(X)`:\n", "\n", " .. math::\n", "\n", " f_i(X) := f_{i-1}(X) + w_i g_i\n", "\n", " Parameters\n", " ----------\n", " n_iter : int\n", " The number of iterations / weak estimators to use when fitting each\n", " dimension / class of `Y`.\n", " max_depth : int\n", " The maximum depth of each decision tree weak estimator. Default is\n", " None.\n", " classifier : bool\n", " Whether `Y` contains class labels or real-valued targets. Default\n", " is True.\n", " learning_rate : float\n", " Value in [0, 1] controlling the amount each weak estimator\n", " contributes to the overall model prediction. Sometimes known as the\n", " `shrinkage parameter` in the GBM literature. Default is 1.\n", " loss : {'crossentropy', 'mse'}\n", " The loss to optimize for the GBM. Default is 'crossentropy'.\n", " step_size : {\"constant\", \"adaptive\"}\n", " How to choose the weight for each weak learner. If \"constant\", use\n", " a fixed weight of 1 for each learner. If \"adaptive\", use a step\n", " size computed via line-search on the current iteration's loss.\n", " Default is 'constant'.\n", " \"\"\"\n", " self.loss = loss\n", " self.weights = None\n", " self.learners = None\n", " self.out_dims = None\n", " self.n_iter = n_iter\n", " self.base_estimator = None\n", " self.max_depth = max_depth\n", " self.step_size = step_size\n", " self.classifier = classifier\n", " self.learning_rate = learning_rate\n", "\n", " def fit(self, X, Y):\n", " \"\"\"\n", " Fit the gradient boosted decision trees on a dataset.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape (N, M)\n", " The training data of `N` examples, each with `M` features\n", " Y : :py:class:`ndarray ` of shape (N,)\n", " An array of integer class labels for each example in `X` if\n", " ``self.classifier = True``, otherwise the set of target values for\n", " each example in `X`.\n", " \"\"\"\n", " if self.loss == \"mse\":\n", " loss = MSELoss()\n", " elif self.loss == \"crossentropy\":\n", " loss = CrossEntropyLoss()\n", "\n", " # convert Y to one_hot if not already\n", " if self.classifier:\n", " Y = to_one_hot(Y.flatten())\n", " else:\n", " Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y\n", "\n", " N, M = X.shape\n", " self.out_dims = Y.shape[1]\n", " self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)\n", " self.weights = np.ones((self.n_iter, self.out_dims))\n", " self.weights[1:, :] *= self.learning_rate\n", "\n", " # fit the base estimator\n", " Y_pred = np.zeros((N, self.out_dims))\n", " for k in range(self.out_dims):\n", " t = loss.base_estimator()\n", " t.fit(X, Y[:, k])\n", " Y_pred[:, k] += t.predict(X)\n", " self.learners[0, k] = t\n", "\n", " # incrementally fit each learner on the negative gradient of the loss\n", " # wrt the previous fit (pseudo-residuals)\n", " for i in range(1, self.n_iter):\n", " for k in range(self.out_dims):\n", " y, y_pred = Y[:, k], Y_pred[:, k]\n", " neg_grad = -1 * loss.grad(y, y_pred)\n", "\n", " # use MSE as the surrogate loss when fitting to negative gradients\n", " t = DecisionTree(\n", " 
classifier=False, max_depth=self.max_depth, criterion=\"mse\"\n", " )\n", "\n", " # fit current learner to negative gradients\n", " t.fit(X, neg_grad)\n", " self.learners[i, k] = t\n", "\n", " # compute step size and weight for the current learner\n", " step = 1.0\n", " h_pred = t.predict(X)\n", " if self.step_size == \"adaptive\":\n", " step = loss.line_search(y, y_pred, h_pred)\n", "\n", " # update weights and our overall prediction for Y\n", " self.weights[i, k] *= step\n", " Y_pred[:, k] += self.weights[i, k] * h_pred\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Use the trained model to classify or predict the examples in `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " The training data of `N` examples, each with `M` features\n", "\n", " Returns\n", " -------\n", " preds : :py:class:`ndarray ` of shape `(N,)`\n", " The integer class labels predicted for each example in `X` if\n", " ``self.classifier = True``, otherwise the predicted target values.\n", " \"\"\"\n", " Y_pred = np.zeros((X.shape[0], self.out_dims))\n", " for i in range(self.n_iter):\n", " for k in range(self.out_dims):\n", " Y_pred[:, k] += self.weights[i, k] * self.learners[i, k].predict(X)\n", "\n", " if self.classifier:\n", " Y_pred = Y_pred.argmax(axis=1)\n", "\n", " return Y_pred\n"]} {"path": "numpy_ml/bandits/__init__.py", "content": ["from .bandits import *\n", "from . import policies\n", "from . import trainer\n"]} {"path": "numpy_ml/bandits/bandits.py", "content": ["\"\"\"A module containing different variations on multi-armed bandit environments.\"\"\"\n", "\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "from numpy_ml.utils.testing import random_one_hot_matrix, is_number\n", "\n", "\n", "class Bandit(ABC):\n", " def __init__(self, rewards, reward_probs, context=None):\n", " assert len(rewards) == len(reward_probs)\n", " self.step = 0\n", " self.n_arms = len(rewards)\n", "\n", " super().__init__()\n", "\n", " def __repr__(self):\n", " \"\"\"A string representation for the bandit\"\"\"\n", " HP = self.hyperparameters\n", " params = \", \".join([\"{}={}\".format(k, v) for (k, v) in HP.items() if k != \"id\"])\n", " return \"{}({})\".format(HP[\"id\"], params)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {}\n", "\n", " @abstractmethod\n", " def oracle_payoff(self, context=None):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " The current context matrix for each of the bandit arms, if\n", " applicable. Default is None.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " \"\"\"\n", " pass\n", "\n", " def pull(self, arm_id, context=None):\n", " \"\"\"\n", " \"Pull\" (i.e., sample from) a given arm's payoff distribution.\n", "\n", " Parameters\n", " ----------\n", " arm_id : int\n", " The integer ID of the arm to sample from\n", " context : :py:class:`ndarray ` of shape `(D,)` or None\n", " The context vector for the current timestep if this is a contextual\n", " bandit. 
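# Minimal sketch of fitting the gradient-boosted ensemble defined above on a
# small classification problem (illustrative hyperparameters).
import numpy as np
from numpy_ml.trees import GradientBoostedDecisionTree

X = np.random.uniform(-1, 1, size=(100, 4))
y = (X[:, 0] * X[:, 1] > 0).astype(int)

gbm = GradientBoostedDecisionTree(
    n_iter=25, max_depth=2, classifier=True, learning_rate=0.1,
    loss="crossentropy", step_size="constant",
)
gbm.fit(X, y)
print((gbm.predict(X) == y).mean())      # training accuracy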
Otherwise, this argument is unused and defaults to None.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The reward sampled from the given arm's payoff distribution\n", " \"\"\"\n", " assert arm_id < self.n_arms\n", "\n", " self.step += 1\n", " return self._pull(arm_id, context)\n", "\n", " def reset(self):\n", " \"\"\"Reset the bandit step and action counters to zero.\"\"\"\n", " self.step = 0\n", "\n", " @abstractmethod\n", " def _pull(self, arm_id):\n", " pass\n", "\n", "\n", "class MultinomialBandit(Bandit):\n", " def __init__(self, payoffs, payoff_probs):\n", " \"\"\"\n", " A multi-armed bandit where each arm is associated with a different\n", " multinomial payoff distribution.\n", "\n", " Parameters\n", " ----------\n", " payoffs : ragged list of length `K`\n", " The payoff values for each of the `n` bandits. ``payoffs[k][i]``\n", " holds the `i` th payoff value for arm `k`.\n", " payoff_probs : ragged list of length `K`\n", " A list of the probabilities associated with each of the payoff\n", " values in ``payoffs``. ``payoff_probs[k][i]`` holds the probability\n", " of payoff index `i` for arm `k`.\n", " \"\"\"\n", " super().__init__(payoffs, payoff_probs)\n", "\n", " for r, rp in zip(payoffs, payoff_probs):\n", " assert len(r) == len(rp)\n", " np.testing.assert_almost_equal(sum(rp), 1.0)\n", "\n", " payoffs = np.array([np.array(x) for x in payoffs])\n", " payoff_probs = np.array([np.array(x) for x in payoff_probs])\n", "\n", " self.payoffs = payoffs\n", " self.payoff_probs = payoff_probs\n", " self.arm_evs = np.array([sum(p * v) for p, v in zip(payoff_probs, payoffs)])\n", " self.best_ev = np.max(self.arm_evs)\n", " self.best_arm = np.argmax(self.arm_evs)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {\n", " \"id\": \"MultinomialBandit\",\n", " \"payoffs\": self.payoffs,\n", " \"payoff_probs\": self.payoff_probs,\n", " }\n", "\n", " def oracle_payoff(self, context=None):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " Unused. Default is None.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " optimal_arm : float\n", " The arm ID with the largest expected reward.\n", " \"\"\"\n", " return self.best_ev, self.best_arm\n", "\n", " def _pull(self, arm_id, context):\n", " payoffs = self.payoffs[arm_id]\n", " probs = self.payoff_probs[arm_id]\n", " return np.random.choice(payoffs, p=probs)\n", "\n", "\n", "class BernoulliBandit(Bandit):\n", " def __init__(self, payoff_probs):\n", " \"\"\"\n", " A multi-armed bandit where each arm is associated with an independent\n", " Bernoulli payoff distribution.\n", "\n", " Parameters\n", " ----------\n", " payoff_probs : list of length `K`\n", " A list of the payoff probability for each arm. 
``payoff_probs[k]``\n", " holds the probability of payoff for arm `k`.\n", " \"\"\"\n", " payoffs = [1] * len(payoff_probs)\n", " super().__init__(payoffs, payoff_probs)\n", "\n", " for p in payoff_probs:\n", " assert p >= 0 and p <= 1\n", "\n", " self.payoffs = np.array(payoffs)\n", " self.payoff_probs = np.array(payoff_probs)\n", "\n", " self.arm_evs = self.payoff_probs\n", " self.best_ev = np.max(self.arm_evs)\n", " self.best_arm = np.argmax(self.arm_evs)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {\n", " \"id\": \"BernoulliBandit\",\n", " \"payoff_probs\": self.payoff_probs,\n", " }\n", "\n", " def oracle_payoff(self, context=None):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " Unused. Default is None.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " optimal_arm : float\n", " The arm ID with the largest expected reward.\n", " \"\"\"\n", " return self.best_ev, self.best_arm\n", "\n", " def _pull(self, arm_id, context):\n", " return int(np.random.rand() <= self.payoff_probs[arm_id])\n", "\n", "\n", "class GaussianBandit(Bandit):\n", " def __init__(self, payoff_dists, payoff_probs):\n", " \"\"\"\n", " A multi-armed bandit that is similar to\n", " :class:`BernoulliBandit`, but instead of each arm having\n", " a fixed payout of 1, the payoff values are sampled from independent\n", " Gaussian RVs.\n", "\n", " Parameters\n", " ----------\n", " payoff_dists : list of 2-tuples of length `K`\n", " The parameters the distributions over payoff values for each of the\n", " `n` arms. Specifically, ``payoffs[k]`` is a tuple of (mean, variance)\n", " for the Gaussian distribution over payoffs associated with arm `k`.\n", " payoff_probs : list of length `n`\n", " A list of the probabilities associated with each of the payoff\n", " values in ``payoffs``. ``payoff_probs[k]`` holds the probability of\n", " payoff for arm `k`.\n", " \"\"\"\n", " super().__init__(payoff_dists, payoff_probs)\n", "\n", " for (mean, var), rp in zip(payoff_dists, payoff_probs):\n", " assert var > 0\n", " assert np.testing.assert_almost_equal(sum(rp), 1.0)\n", "\n", " self.payoff_dists = payoff_dists\n", " self.payoff_probs = payoff_probs\n", " self.arm_evs = np.array([mu for (mu, var) in payoff_dists])\n", " self.best_ev = np.max(self.arm_evs)\n", " self.best_arm = np.argmax(self.arm_evs)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {\n", " \"id\": \"GaussianBandit\",\n", " \"payoff_dists\": self.payoff_dists,\n", " \"payoff_probs\": self.payoff_probs,\n", " }\n", "\n", " def _pull(self, arm_id, context):\n", " mean, var = self.payoff_dists[arm_id]\n", "\n", " reward = 0\n", " if np.random.rand() < self.payoff_probs[arm_id]:\n", " reward = np.random.normal(mean, var)\n", "\n", " return reward\n", "\n", " def oracle_payoff(self, context=None):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " Unused. 
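# Minimal sketch of interacting with the BernoulliBandit defined above
# (illustrative payoff probabilities).
from numpy_ml.bandits import BernoulliBandit

bandit = BernoulliBandit(payoff_probs=[0.1, 0.5, 0.8])
reward = bandit.pull(2)                      # 0 or 1, sampled from arm 2
best_ev, best_arm = bandit.oracle_payoff()   # (0.8, 2)
bandit.reset()                               # zero the step counter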
Default is None.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " optimal_arm : float\n", " The arm ID with the largest expected reward.\n", " \"\"\"\n", " return self.best_ev, self.best_arm\n", "\n", "\n", "class ShortestPathBandit(Bandit):\n", " def __init__(self, G, start_vertex, end_vertex):\n", " \"\"\"\n", " A weighted graph shortest path problem formulated as a multi-armed\n", " bandit.\n", "\n", " Notes\n", " -----\n", " Each arm corresponds to a valid path through the graph from start to\n", " end vertex. The agent's goal is to find the path that minimizes the\n", " expected sum of the weights on the edges it traverses.\n", "\n", " Parameters\n", " ----------\n", " G : :class:`Graph ` instance\n", " A weighted graph object. Weights can be fixed or probabilistic.\n", " start_vertex : int\n", " The index of the path's start vertex in the graph\n", " end_vertex : int\n", " The index of the path's end vertex in the graph\n", " \"\"\"\n", " self.G = G\n", " self.end_vertex = end_vertex\n", " self.adj_dict = G.to_adj_dict()\n", " self.start_vertex = start_vertex\n", " self.paths = G.all_paths(start_vertex, end_vertex)\n", "\n", " self.arm_evs = self._calc_arm_evs()\n", " self.best_ev = np.max(self.arm_evs)\n", " self.best_arm = np.argmax(self.arm_evs)\n", "\n", " placeholder = [None] * len(self.paths)\n", " super().__init__(placeholder, placeholder)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {\n", " \"id\": \"ShortestPathBandit\",\n", " \"G\": self.G,\n", " \"end_vertex\": self.end_vertex,\n", " \"start_vertex\": self.start_vertex,\n", " }\n", "\n", " def oracle_payoff(self, context=None):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " Unused. Default is None.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " optimal_arm : float\n", " The arm ID with the largest expected reward.\n", " \"\"\"\n", " return self.best_ev, self.best_arm\n", "\n", " def _calc_arm_evs(self):\n", " I2V = self.G.get_vertex\n", " evs = np.zeros(len(self.paths))\n", " for p_ix, path in enumerate(self.paths):\n", " for ix, v_i in enumerate(path[:-1]):\n", " e = [e for e in self.adj_dict[v_i] if e.to == I2V(path[ix + 1])][0]\n", " evs[p_ix] -= e.weight\n", " return evs\n", "\n", " def _pull(self, arm_id, context):\n", " reward = 0\n", " I2V = self.G.get_vertex\n", " path = self.paths[arm_id]\n", " for ix, v_i in enumerate(path[:-1]):\n", " e = [e for e in self.adj_dict[v_i] if e.to == I2V(path[ix + 1])][0]\n", " reward -= e.weight\n", " return reward\n", "\n", "\n", "class ContextualBernoulliBandit(Bandit):\n", " def __init__(self, context_probs):\n", " \"\"\"\n", " A contextual version of :class:`BernoulliBandit` where each binary\n", " context feature is associated with an independent Bernoulli payoff\n", " distribution.\n", "\n", " Parameters\n", " ----------\n", " context_probs : :py:class:`ndarray ` of shape `(D, K)`\n", " A matrix of the payoff probabilities associated with each of the\n", " `D` context features, for each of the `K` arms. 
Index `(i, j)`\n", " contains the probability of payoff for arm `j` under context `i`.\n", " \"\"\"\n", " D, K = context_probs.shape\n", "\n", " # use a dummy placeholder variable to initialize the Bandit superclass\n", " placeholder = [None] * K\n", " super().__init__(placeholder, placeholder)\n", "\n", " self.context_probs = context_probs\n", " self.arm_evs = self.context_probs\n", " self.best_evs = self.arm_evs.max(axis=1)\n", " self.best_arms = self.arm_evs.argmax(axis=1)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {\n", " \"id\": \"ContextualBernoulliBandit\",\n", " \"context_probs\": self.context_probs,\n", " }\n", "\n", " def get_context(self):\n", " \"\"\"\n", " Sample a random one-hot context vector. This vector will be the same\n", " for all arms.\n", "\n", " Returns\n", " -------\n", " context : :py:class:`ndarray ` of shape `(D, K)`\n", " A random `D`-dimensional one-hot context vector repeated for each\n", " of the `K` bandit arms.\n", " \"\"\"\n", " D, K = self.context_probs.shape\n", " context = np.zeros((D, K))\n", " context[np.random.choice(D), :] = 1\n", " return random_one_hot_matrix(1, D).ravel()\n", "\n", " def oracle_payoff(self, context):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " The current context matrix for each of the bandit arms.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " optimal_arm : float\n", " The arm ID with the largest expected reward.\n", " \"\"\"\n", " context_id = context[:, 0].argmax()\n", " return self.best_evs[context_id], self.best_arms[context_id]\n", "\n", " def _pull(self, arm_id, context):\n", " D, K = self.context_probs.shape\n", " arm_probs = context[:, arm_id] @ self.context_probs\n", " arm_rwds = (np.random.rand(K) <= arm_probs).astype(int)\n", " return arm_rwds[arm_id]\n", "\n", "\n", "class ContextualLinearBandit(Bandit):\n", " def __init__(self, K, D, payoff_variance=1):\n", " r\"\"\"\n", " A contextual linear multi-armed bandit.\n", "\n", " Notes\n", " -----\n", " In a contextual linear bandit the expected payoff of an arm :math:`a\n", " \\in \\mathcal{A}` at time `t` is a linear combination of its context\n", " vector :math:`\\mathbf{x}_{t,a}` with a coefficient vector\n", " :math:`\\theta_a`:\n", "\n", " .. math::\n", "\n", " \\mathbb{E}[r_{t, a} \\mid \\mathbf{x}_{t, a}] = \\mathbf{x}_{t,a}^\\top \\theta_a\n", "\n", " In this implementation, the arm coefficient vectors :math:`\\theta` are\n", " initialized independently from a uniform distribution on the interval\n", " [-1, 1], and the specific reward at timestep `t` is normally\n", " distributed:\n", "\n", " .. math::\n", "\n", " r_{t, a} \\mid \\mathbf{x}_{t, a} \\sim\n", " \\mathcal{N}(\\mathbf{x}_{t,a}^\\top \\theta_a, \\sigma_a^2)\n", "\n", " Parameters\n", " ----------\n", " K : int\n", " The number of bandit arms\n", " D : int\n", " The dimensionality of the context vectors\n", " payoff_variance : float or :py:class:`ndarray ` of shape `(K,)`\n", " The variance of the random noise in the arm payoffs. If a float,\n", " the variance is assumed to be equal for each arm. 
Default is 1.\n", " \"\"\"\n", " if is_number(payoff_variance):\n", " payoff_variance = [payoff_variance] * K\n", "\n", " assert len(payoff_variance) == K\n", " assert all(v > 0 for v in payoff_variance)\n", "\n", " self.K = K\n", " self.D = D\n", " self.payoff_variance = payoff_variance\n", "\n", " # use a dummy placeholder variable to initialize the Bandit superclass\n", " placeholder = [None] * K\n", " super().__init__(placeholder, placeholder)\n", "\n", " # initialize the theta matrix\n", " self.thetas = np.random.uniform(-1, 1, size=(D, K))\n", " self.thetas /= np.linalg.norm(self.thetas, 2)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary of the bandit hyperparameters\"\"\"\n", " return {\n", " \"id\": \"ContextualLinearBandit\",\n", " \"K\": self.K,\n", " \"D\": self.D,\n", " \"payoff_variance\": self.payoff_variance,\n", " }\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary of the current bandit parameters\"\"\"\n", " return {\"thetas\": self.thetas}\n", "\n", " def get_context(self):\n", " \"\"\"\n", " Sample the context vectors for each arm from a multivariate standard\n", " normal distribution.\n", "\n", " Returns\n", " -------\n", " context : :py:class:`ndarray ` of shape `(D, K)`\n", " A `D`-dimensional context vector sampled from a standard normal\n", " distribution for each of the `K` bandit arms.\n", " \"\"\"\n", " return np.random.normal(size=(self.D, self.K))\n", "\n", " def oracle_payoff(self, context):\n", " \"\"\"\n", " Return the expected reward for an optimal agent.\n", "\n", " Parameters\n", " ----------\n", " context : :py:class:`ndarray ` of shape `(D, K)` or None\n", " The current context matrix for each of the bandit arms, if\n", " applicable. Default is None.\n", "\n", " Returns\n", " -------\n", " optimal_rwd : float\n", " The expected reward under an optimal policy.\n", " optimal_arm : float\n", " The arm ID with the largest expected reward.\n", " \"\"\"\n", " best_arm = np.argmax(self.arm_evs)\n", " return self.arm_evs[best_arm], best_arm\n", "\n", " def _pull(self, arm_id, context):\n", " K, thetas = self.K, self.thetas\n", " self._noise = np.random.normal(scale=self.payoff_variance, size=self.K)\n", " self.arm_evs = np.array([context[:, k] @ thetas[:, k] for k in range(K)])\n", " return (self.arm_evs + self._noise)[arm_id]\n"]} {"path": "numpy_ml/bandits/trainer.py", "content": ["\"\"\"A trainer/runner object for executing and comparing MAB policies.\"\"\"\n", "\n", "import warnings\n", "import os.path as op\n", "from collections import defaultdict\n", "\n", "import numpy as np\n", "\n", "from numpy_ml.utils.testing import DependencyWarning\n", "\n", "try:\n", " import matplotlib.pyplot as plt\n", "\n", " _PLOTTING = True\n", "except ImportError:\n", " fstr = \"Cannot import matplotlib. 
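# Minimal sketch of the ContextualLinearBandit defined above: sample a context
# matrix, pull an arm, then query the oracle for that context (illustrative).
from numpy_ml.bandits import ContextualLinearBandit

bandit = ContextualLinearBandit(K=3, D=5, payoff_variance=0.5)
context = bandit.get_context()                      # shape (D, K) = (5, 3)
reward = bandit.pull(0, context)                    # noisy linear payoff for arm 0
best_ev, best_arm = bandit.oracle_payoff(context)   # uses the EVs computed in the pull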
Plotting functionality disabled.\"\n", " warnings.warn(fstr, DependencyWarning)\n", " _PLOTTING = False\n", "\n", "\n", "def get_scriptdir():\n", " \"\"\"Return the directory containing the `trainer.py` script\"\"\"\n", " return op.dirname(op.realpath(__file__))\n", "\n", "\n", "def mse(bandit, policy):\n", " \"\"\"\n", " Computes the mean squared error between a policy's estimates of the\n", " expected arm payouts and the true expected payouts.\n", " \"\"\"\n", " if not hasattr(policy, \"ev_estimates\") or len(policy.ev_estimates) == 0:\n", " return np.nan\n", "\n", " se = []\n", " evs = bandit.arm_evs\n", " ests = sorted(policy.ev_estimates.items(), key=lambda x: x[0])\n", " for ix, (est, ev) in enumerate(zip(ests, evs)):\n", " se.append((est[1] - ev) ** 2)\n", " return np.mean(se)\n", "\n", "\n", "def smooth(prev, cur, weight):\n", " r\"\"\"\n", " Compute a simple weighted average of the previous and current value.\n", "\n", " Notes\n", " -----\n", " The smoothed value at timestep `t`, :math:`\\tilde{X}_t` is calculated as\n", "\n", " .. math::\n", "\n", " \\tilde{X}_t = \\epsilon \\tilde{X}_{t-1} + (1 - \\epsilon) X_t\n", "\n", " where :math:`X_t` is the value at timestep `t`, :math:`\\tilde{X}_{t-1}` is\n", " the value of the smoothed signal at timestep `t-1`, and :math:`\\epsilon` is\n", " the smoothing weight.\n", "\n", " Parameters\n", " ----------\n", " prev : float or :py:class:`ndarray ` of shape `(N,)`\n", " The value of the smoothed signal at the immediately preceding\n", " timestep.\n", " cur : float or :py:class:`ndarray ` of shape `(N,)`\n", " The value of the signal at the current timestep\n", " weight : float or :py:class:`ndarray ` of shape `(N,)`\n", " The smoothing weight. Values closer to 0 result in less smoothing,\n", " values closer to 1 produce more aggressive smoothing. If weight is an\n", " array, each dimension will be interpreted as a separate smoothing\n", " weight the corresponding dimension in `cur`.\n", "\n", " Returns\n", " -------\n", " smoothed : float or :py:class:`ndarray ` of shape `(N,)`\n", " The smoothed signal\n", " \"\"\"\n", " return weight * prev + (1 - weight) * cur\n", "\n", "\n", "class BanditTrainer:\n", " def __init__(self):\n", " \"\"\"\n", " An object to facilitate multi-armed bandit training, comparison, and\n", " evaluation.\n", " \"\"\"\n", " self.logs = {}\n", "\n", " def compare(\n", " self,\n", " policies,\n", " bandit,\n", " n_trials,\n", " n_duplicates,\n", " plot=True,\n", " seed=None,\n", " smooth_weight=0.999,\n", " out_dir=None,\n", " ):\n", " \"\"\"\n", " Compare the performance of multiple policies on the same bandit\n", " environment, generating a plot for each.\n", "\n", " Parameters\n", " ----------\n", " policies : list of :class:`BanditPolicyBase ` instances\n", " The multi-armed bandit policies to compare.\n", " bandit : :class:`Bandit ` instance\n", " The environment to train the policies on.\n", " n_trials : int\n", " The number of trials per run.\n", " n_duplicates: int\n", " The number of times to evaluate each policy on the bandit\n", " environment. Larger values permit a better estimate of the\n", " variance in payoff / cumulative regret for each policy.\n", " plot : bool\n", " Whether to generate a plot of the policy's average reward and\n", " regret across the episodes. Default is True.\n", " seed : int\n", " The seed for the random number generator. Default is None.\n", " smooth_weight : float in [0, 1]\n", " The smoothing weight. 
Values closer to 0 result in less smoothing,\n", " values closer to 1 produce more aggressive smoothing. Default is\n", " 0.999.\n", " out_dir : str or None\n", " Plots will be saved to this directory if `plot` is True. If\n", " `out_dir` is None, plots will not be saved. Default is None.\n", " \"\"\" # noqa: E501\n", " self.init_logs(policies)\n", "\n", " all_axes = [None] * len(policies)\n", " if plot and _PLOTTING:\n", " fig, all_axes = plt.subplots(len(policies), 2, sharex=True)\n", " fig.set_size_inches(10.5, len(policies) * 5.25)\n", "\n", " for policy, axes in zip(policies, all_axes):\n", " if seed:\n", " np.random.seed(seed)\n", "\n", " bandit.reset()\n", " policy.reset()\n", "\n", " self.train(\n", " policy,\n", " bandit,\n", " n_trials,\n", " n_duplicates,\n", " axes=axes,\n", " plot=plot,\n", " verbose=False,\n", " out_dir=out_dir,\n", " smooth_weight=smooth_weight,\n", " )\n", "\n", " # enforce the same y-ranges across plots for straightforward comparison\n", " a1_r, a2_r = zip(*[(a1.get_ylim(), a2.get_ylim()) for (a1, a2) in all_axes])\n", "\n", " a1_min = min(a1_r, key=lambda x: x[0])[0]\n", " a1_max = max(a1_r, key=lambda x: x[1])[1]\n", " a2_min = min(a2_r, key=lambda x: x[0])[0]\n", " a2_max = max(a2_r, key=lambda x: x[1])[1]\n", "\n", " for (a1, a2) in all_axes:\n", " a1.set_ylim(a1_min, a1_max)\n", " a2.set_ylim(a2_min, a2_max)\n", "\n", " if plot and _PLOTTING:\n", " if out_dir is not None:\n", " plt.savefig(op.join(out_dir, \"bandit_comparison.png\"), dpi=300)\n", " plt.show()\n", "\n", " def train(\n", " self,\n", " policy,\n", " bandit,\n", " n_trials,\n", " n_duplicates,\n", " plot=True,\n", " axes=None,\n", " verbose=True,\n", " print_every=100,\n", " smooth_weight=0.999,\n", " out_dir=None,\n", " ):\n", " \"\"\"\n", " Train a MAB policies on a multi-armed bandit problem, logging training\n", " statistics along the way.\n", "\n", " Parameters\n", " ----------\n", " policy : :class:`BanditPolicyBase ` instance\n", " The multi-armed bandit policy to train.\n", " bandit : :class:`Bandit ` instance\n", " The environment to run the policy on.\n", " n_trials : int\n", " The number of trials per run.\n", " n_duplicates: int\n", " The number of runs to evaluate\n", " plot : bool\n", " Whether to generate a plot of the policy's average reward and\n", " regret across the episodes. Default is True.\n", " axes : list of :py:class:`Axis ` instances or None\n", " If not None and ``plot = True``, these are the axes that will be\n", " used to plot the cumulative reward and regret, respectively.\n", " Default is None.\n", " verbose : boolean\n", " Whether to print run statistics during training. Default is True.\n", " print_every : int\n", " The number of episodes to run before printing loss values to\n", " stdout. This is ignored if ``verbose`` is false. Default is 100.\n", " smooth_weight : float in [0, 1]\n", " The smoothing weight. Values closer to 0 result in less smoothing,\n", " values closer to 1 produce more aggressive smoothing. Default is\n", " 0.999.\n", " out_dir : str or None\n", " Plots will be saved to this directory if `plot` is True. If\n", " `out_dir` is None, plots will not be saved. Default is None.\n", "\n", " Returns\n", " -------\n", " policy : :class:`BanditPolicyBase ` instance\n", " The policy trained during the last (i.e. 
most recent) duplicate\n", " run.\n", " \"\"\" # noqa: E501\n", " if not str(policy) in self.logs:\n", " self.init_logs(policy)\n", "\n", " p = str(policy)\n", " D, L = n_duplicates, self.logs\n", "\n", " for d in range(D):\n", " if verbose:\n", " print(\"\\nDUPLICATE {}/{}\\n\".format(d + 1, D))\n", "\n", " bandit.reset()\n", " policy.reset()\n", "\n", " avg_oracle_reward, cregret = 0, 0\n", " for trial_id in range(n_trials):\n", " rwd, arm, orwd, oarm = self._train_step(bandit, policy)\n", "\n", " loss = mse(bandit, policy)\n", " regret = orwd - rwd\n", "\n", " avg_oracle_reward += orwd\n", " cregret += regret\n", "\n", " L[p][\"mse\"][trial_id + 1].append(loss)\n", " L[p][\"reward\"][trial_id + 1].append(rwd)\n", " L[p][\"regret\"][trial_id + 1].append(regret)\n", " L[p][\"cregret\"][trial_id + 1].append(cregret)\n", " L[p][\"optimal_arm\"][trial_id + 1].append(oarm)\n", " L[p][\"selected_arm\"][trial_id + 1].append(arm)\n", " L[p][\"optimal_reward\"][trial_id + 1].append(orwd)\n", "\n", " if (trial_id + 1) % print_every == 0 and verbose:\n", " fstr = \"Trial {}/{}, {}/{}, Regret: {:.4f}\"\n", " print(fstr.format(trial_id + 1, n_trials, d + 1, D, regret))\n", "\n", " avg_oracle_reward /= n_trials\n", "\n", " if verbose:\n", " self._print_run_summary(bandit, policy, regret)\n", "\n", " if plot and _PLOTTING:\n", " self._plot_reward(avg_oracle_reward, policy, smooth_weight, axes, out_dir)\n", "\n", " return policy\n", "\n", " def _train_step(self, bandit, policy):\n", " P, B = policy, bandit\n", " C = B.get_context() if hasattr(B, \"get_context\") else None\n", " rwd, arm = P.act(B, C)\n", " oracle_rwd, oracle_arm = B.oracle_payoff(C)\n", " return rwd, arm, oracle_rwd, oracle_arm\n", "\n", " def init_logs(self, policies):\n", " \"\"\"\n", " Initialize the episode logs.\n", "\n", " Notes\n", " -----\n", " Training logs are represented as a nested set of dictionaries with the\n", " following structure:\n", "\n", " log[model_id][metric][trial_number][duplicate_number]\n", "\n", " For example, ``logs['model1']['regret'][3][1]`` holds the regret value\n", " accrued on the 3rd trial of the 2nd duplicate run for model1.\n", "\n", " Available fields are 'regret', 'cregret' (cumulative regret), 'reward',\n", " 'mse' (mean-squared error between estimated arm EVs and the true EVs),\n", " 'optimal_arm', 'selected_arm', and 'optimal_reward'.\n", " \"\"\"\n", " if not isinstance(policies, list):\n", " policies = [policies]\n", "\n", " self.logs = {\n", " str(p): {\n", " \"mse\": defaultdict(lambda: []),\n", " \"regret\": defaultdict(lambda: []),\n", " \"reward\": defaultdict(lambda: []),\n", " \"cregret\": defaultdict(lambda: []),\n", " \"optimal_arm\": defaultdict(lambda: []),\n", " \"selected_arm\": defaultdict(lambda: []),\n", " \"optimal_reward\": defaultdict(lambda: []),\n", " }\n", " for p in policies\n", " }\n", "\n", " def _print_run_summary(self, bandit, policy, regret):\n", " if not hasattr(policy, \"ev_estimates\") or len(policy.ev_estimates) == 0:\n", " return None\n", "\n", " evs, se = bandit.arm_evs, []\n", " fstr = \"Arm {}: {:.4f} v. {:.4f}\"\n", " ests = sorted(policy.ev_estimates.items(), key=lambda x: x[0])\n", " print(\"\\n\\nEstimated vs. 
Real EV\\n\" + \"-\" * 21)\n", " for ix, (est, ev) in enumerate(zip(ests, evs)):\n", " print(fstr.format(ix + 1, est[1], ev))\n", " se.append((est[1] - ev) ** 2)\n", " fstr = \"\\nFinal MSE: {:.4f}\\nFinal Regret: {:.4f}\\n\\n\"\n", " print(fstr.format(np.mean(se), regret))\n", "\n", " def _plot_reward(self, optimal_rwd, policy, smooth_weight, axes=None, out_dir=None):\n", " L = self.logs[str(policy)]\n", " smds = self._smoothed_metrics(policy, optimal_rwd, smooth_weight)\n", "\n", " if axes is None:\n", " fig, [ax1, ax2] = plt.subplots(1, 2)\n", " else:\n", " assert len(axes) == 2\n", " ax1, ax2 = axes\n", "\n", " e_ids = range(1, len(L[\"reward\"]) + 1)\n", " plot_params = [[ax1, ax2], [\"reward\", \"cregret\"], [\"b\", \"r\"], [optimal_rwd, 0]]\n", "\n", " for (ax, m, c, opt) in zip(*plot_params):\n", " avg, std = \"sm_{}_avg sm_{}_std\".format(m, m).split()\n", " ax.plot(e_ids, smds[avg], color=c)\n", " ax.axhline(opt, 0, 1, color=c, ls=\"--\")\n", " ax.fill_between(\n", " e_ids,\n", " smds[avg] + smds[std],\n", " smds[avg] - smds[std],\n", " color=c,\n", " alpha=0.25,\n", " )\n", " ax.set_xlabel(\"Trial\")\n", " m = \"Cumulative Regret\" if m == \"cregret\" else m\n", " ax.set_ylabel(\"Smoothed Avg. {}\".format(m.title()))\n", "\n", " if axes is None:\n", " ax.set_aspect(np.diff(ax.get_xlim()) / np.diff(ax.get_ylim()))\n", "\n", " if axes is not None:\n", " ax.set_title(str(policy))\n", "\n", " if axes is None:\n", " fig.suptitle(str(policy))\n", " fig.tight_layout()\n", "\n", " if out_dir is not None:\n", " bid = policy.hyperparameters[\"id\"]\n", " plt.savefig(op.join(out_dir, f\"{bid}.png\"), dpi=300)\n", " plt.show()\n", " return ax1, ax2\n", "\n", " def _smoothed_metrics(self, policy, optimal_rwd, smooth_weight):\n", " L = self.logs[str(policy)]\n", "\n", " # pre-allocate smoothed data structure\n", " smds = {}\n", " for m in L.keys():\n", " if m == \"selections\":\n", " continue\n", "\n", " smds[\"sm_{}_avg\".format(m)] = np.zeros(len(L[\"reward\"]))\n", " smds[\"sm_{}_avg\".format(m)][0] = np.mean(L[m][1])\n", "\n", " smds[\"sm_{}_std\".format(m)] = np.zeros(len(L[\"reward\"]))\n", " smds[\"sm_{}_std\".format(m)][0] = np.std(L[m][1])\n", "\n", " smoothed = {m: L[m][1] for m in L.keys()}\n", " for e_id in range(2, len(L[\"reward\"]) + 1):\n", " for m in L.keys():\n", " if m == \"selections\":\n", " continue\n", " prev, cur = smoothed[m], L[m][e_id]\n", " smoothed[m] = [smooth(p, c, smooth_weight) for p, c in zip(prev, cur)]\n", " smds[\"sm_{}_avg\".format(m)][e_id - 1] = np.mean(smoothed[m])\n", " smds[\"sm_{}_std\".format(m)][e_id - 1] = np.std(smoothed[m])\n", " return smds\n"]} {"path": "numpy_ml/bandits/policies.py", "content": ["\"\"\"A module containing exploration policies for various multi-armed bandit problems.\"\"\"\n", "\n", "from abc import ABC, abstractmethod\n", "from collections import defaultdict\n", "\n", "import numpy as np\n", "\n", "from ..utils.testing import is_number\n", "\n", "\n", "class BanditPolicyBase(ABC):\n", " def __init__(self):\n", " \"\"\"A simple base class for multi-armed bandit policies\"\"\"\n", " self.step = 0\n", " self.ev_estimates = {}\n", " self.is_initialized = False\n", " super().__init__()\n", "\n", " def __repr__(self):\n", " \"\"\"Return a string representation of the policy\"\"\"\n", " HP = self.hyperparameters\n", " params = \", \".join([\"{}={}\".format(k, v) for (k, v) in HP.items() if k != \"id\"])\n", " return \"{}({})\".format(HP[\"id\"], params)\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary 
containing the policy hyperparameters\"\"\"\n", " pass\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary containing the current policy parameters\"\"\"\n", " pass\n", "\n", " def act(self, bandit, context=None):\n", " \"\"\"\n", " Select an arm and sample from its payoff distribution.\n", "\n", " Parameters\n", " ----------\n", " bandit : :class:`Bandit ` instance\n", " The multi-armed bandit to act upon\n", " context : :py:class:`ndarray ` of shape `(D,)` or None\n", " The context vector for the current timestep if interacting with a\n", " contextual bandit. Otherwise, this argument is unused. Default is\n", " None.\n", "\n", " Returns\n", " -------\n", " rwd : float\n", " The reward received after pulling ``arm_id``.\n", " arm_id : int\n", " The arm that was pulled to generate ``rwd``.\n", " \"\"\"\n", " if not self.is_initialized:\n", " self._initialize_params(bandit)\n", "\n", " arm_id = self._select_arm(bandit, context)\n", " rwd = self._pull_arm(bandit, arm_id, context)\n", " self._update_params(arm_id, rwd, context)\n", " return rwd, arm_id\n", "\n", " def reset(self):\n", " \"\"\"Reset the policy parameters and counters to their initial states.\"\"\"\n", " self.step = 0\n", " self._reset_params()\n", " self.is_initialized = False\n", "\n", " def _pull_arm(self, bandit, arm_id, context):\n", " \"\"\"Execute a bandit action and return the received reward.\"\"\"\n", " self.step += 1\n", " return bandit.pull(arm_id, context)\n", "\n", " @abstractmethod\n", " def _select_arm(self, bandit, context):\n", " \"\"\"Select an arm based on the current context\"\"\"\n", " pass\n", "\n", " @abstractmethod\n", " def _update_params(self, bandit, context):\n", " \"\"\"Update the policy parameters after an interaction\"\"\"\n", " pass\n", "\n", " @abstractmethod\n", " def _initialize_params(self, bandit):\n", " \"\"\"\n", " Initialize any policy-specific parameters that depend on information\n", " from the bandit environment.\n", " \"\"\"\n", " pass\n", "\n", " @abstractmethod\n", " def _reset_params(self):\n", " \"\"\"\n", " Reset any model-specific parameters. This gets called within the\n", " public `self.reset()` method.\n", " \"\"\"\n", " pass\n", "\n", "\n", "class EpsilonGreedy(BanditPolicyBase):\n", " def __init__(self, epsilon=0.05, ev_prior=0.5):\n", " r\"\"\"\n", " An epsilon-greedy policy for multi-armed bandit problems.\n", "\n", " Notes\n", " -----\n", " Epsilon-greedy policies greedily select the arm with the highest\n", " expected payoff with probability :math:`1-\\epsilon`, and selects an arm\n", " uniformly at random with probability :math:`\\epsilon`:\n", "\n", " .. math::\n", "\n", " P(a) = \\left\\{\n", " \\begin{array}{lr}\n", " \\epsilon / N + (1 - \\epsilon) &\\text{if }\n", " a = \\arg \\max_{a' \\in \\mathcal{A}}\n", " \\mathbb{E}_{q_{\\hat{\\theta}}}[r \\mid a']\\\\\n", " \\epsilon / N &\\text{otherwise}\n", " \\end{array}\n", " \\right.\n", "\n", " where :math:`N = |\\mathcal{A}|` is the number of arms,\n", " :math:`q_{\\hat{\\theta}}` is the estimate of the arm payoff\n", " distribution under current model parameters :math:`\\hat{\\theta}`, and\n", " :math:`\\mathbb{E}_{q_{\\hat{\\theta}}}[r \\mid a']` is the expected\n", " reward under :math:`q_{\\hat{\\theta}}` of receiving reward `r` after\n", " taking action :math:`a'`.\n", "\n", " Parameters\n", " ----------\n", " epsilon : float in [0, 1]\n", " The probability of taking a random action. 
Default is 0.05.\n", " ev_prior : float\n", " The starting expected payoff for each arm before any data has been\n", " observed. Default is 0.5.\n", " \"\"\"\n", " super().__init__()\n", " self.epsilon = epsilon\n", " self.ev_prior = ev_prior\n", " self.pull_counts = defaultdict(lambda: 0)\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary containing the current policy parameters\"\"\"\n", " return {\"ev_estimates\": self.ev_estimates}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary containing the policy hyperparameters\"\"\"\n", " return {\n", " \"id\": \"EpsilonGreedy\",\n", " \"epsilon\": self.epsilon,\n", " \"ev_prior\": self.ev_prior,\n", " }\n", "\n", " def _initialize_params(self, bandit):\n", " \"\"\"\n", " Initialize any policy-specific parameters that depend on information\n", " from the bandit environment.\n", " \"\"\"\n", " self.ev_estimates = {i: self.ev_prior for i in range(bandit.n_arms)}\n", " self.is_initialized = True\n", "\n", " def _select_arm(self, bandit, context=None):\n", " if np.random.rand() < self.epsilon:\n", " arm_id = np.random.choice(bandit.n_arms)\n", " else:\n", " ests = self.ev_estimates\n", " (arm_id, _) = max(ests.items(), key=lambda x: x[1])\n", " return arm_id\n", "\n", " def _update_params(self, arm_id, reward, context=None):\n", " E, C = self.ev_estimates, self.pull_counts\n", " C[arm_id] += 1\n", " E[arm_id] += (reward - E[arm_id]) / (C[arm_id])\n", "\n", " def _reset_params(self):\n", " \"\"\"\n", " Reset any model-specific parameters. This gets called within the\n", " public `self.reset()` method.\n", " \"\"\"\n", " self.ev_estimates = {}\n", " self.pull_counts = defaultdict(lambda: 0)\n", "\n", "\n", "class UCB1(BanditPolicyBase):\n", " def __init__(self, C=1, ev_prior=0.5):\n", " r\"\"\"\n", " A UCB1 policy for multi-armed bandit problems.\n", "\n", " Notes\n", " -----\n", " The UCB1 algorithm [*]_ guarantees the cumulative regret is bounded by log\n", " `t`, where `t` is the current timestep. To make this guarantee UCB1\n", " assumes all arm payoffs are between 0 and 1.\n", "\n", " Under UCB1, the upper confidence bound on the expected value for\n", " pulling arm `a` at timestep `t` is:\n", "\n", " .. math::\n", "\n", " \\text{UCB}(a, t) = \\text{EV}_t(a) + C \\sqrt{\\frac{2 \\log t}{N_t(a)}}\n", "\n", " where :math:`\\text{EV}_t(a)` is the average of the rewards recieved so\n", " far from pulling arm `a`, `C` is a free parameter controlling the\n", " \"optimism\" of the confidence upper bound for :math:`\\text{UCB}(a, t)`\n", " (for logarithmic regret bounds, `C` must equal 1), and :math:`N_t(a)`\n", " is the number of times arm `a` has been pulled during the previous `t -\n", " 1` timesteps.\n", "\n", " References\n", " ----------\n", " .. [*] Auer, P., Cesa-Bianchi, N., & Fischer, P. (2002). Finite-time\n", " analysis of the multiarmed bandit problem. *Machine Learning,\n", " 47(2)*.\n", "\n", " Parameters\n", " ----------\n", " C : float in (0, +infinity)\n", " A confidence/optimisim parameter affecting the degree of\n", " exploration, where larger values encourage greater exploration. The\n", " UCB1 algorithm assumes `C=1`. Default is 1.\n", " ev_prior : float\n", " The starting expected value for each arm before any data has been\n", " observed. 
Default is 0.5.\n", " \"\"\"\n", " self.C = C\n", " self.ev_prior = ev_prior\n", " super().__init__()\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary containing the current policy parameters\"\"\"\n", " return {\"ev_estimates\": self.ev_estimates}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary containing the policy hyperparameters\"\"\"\n", " return {\n", " \"C\": self.C,\n", " \"id\": \"UCB1\",\n", " \"ev_prior\": self.ev_prior,\n", " }\n", "\n", " def _initialize_params(self, bandit):\n", " \"\"\"\n", " Initialize any policy-specific parameters that depend on information\n", " from the bandit environment.\n", " \"\"\"\n", " self.ev_estimates = {i: self.ev_prior for i in range(bandit.n_arms)}\n", " self.is_initialized = True\n", "\n", " def _select_arm(self, bandit, context=None):\n", " # add eps to avoid divide-by-zero errors on the first pull of each arm\n", " eps = np.finfo(float).eps\n", " N, T = bandit.n_arms, self.step + 1\n", " E, C = self.ev_estimates, self.pull_counts\n", " scores = [E[a] + self.C * np.sqrt(np.log(T) / (C[a] + eps)) for a in range(N)]\n", " return np.argmax(scores)\n", "\n", " def _update_params(self, arm_id, reward, context=None):\n", " E, C = self.ev_estimates, self.pull_counts\n", " C[arm_id] += 1\n", " E[arm_id] += (reward - E[arm_id]) / (C[arm_id])\n", "\n", " def _reset_params(self):\n", " \"\"\"\n", " Reset any model-specific parameters. This gets called within the\n", " public :method:`reset` method.\n", " \"\"\"\n", " self.ev_estimates = {}\n", " self.pull_counts = defaultdict(lambda: 0)\n", "\n", "\n", "class ThompsonSamplingBetaBinomial(BanditPolicyBase):\n", " def __init__(self, alpha=1, beta=1):\n", " r\"\"\"\n", " A conjugate Thompson sampling [1]_ [2]_ policy for multi-armed bandits with\n", " Bernoulli likelihoods.\n", "\n", " Notes\n", " -----\n", " The policy assumes independent Beta priors on the Bernoulli arm payoff\n", " probabilities, :math:`\\theta`:\n", "\n", " .. math::\n", "\n", " \\theta_k \\sim \\text{Beta}(\\alpha_k, \\beta_k) \\\\\n", " r \\mid \\theta_k \\sim \\text{Bernoulli}(\\theta_k)\n", "\n", " where :math:`k \\in \\{1,\\ldots,K \\}` indexes arms in the MAB and\n", " :math:`\\theta_k` is the parameter of the Bernoulli likelihood for arm\n", " `k`. The sampler begins by selecting an arm with probability\n", " proportional to its payoff probability under the initial Beta prior.\n", " After pulling the sampled arm and receiving a reward, `r`, the sampler\n", " computes the posterior over the model parameters (arm payoffs) via\n", " Bayes' rule, and then samples a new action in proportion to its payoff\n", " probability under this posterior. This process (i.e., sample action\n", " from posterior, take action and receive reward, compute updated\n", " posterior) is repeated until the number of trials is exhausted.\n", "\n", " Note that due to the conjugacy between the Beta prior and Bernoulli\n", " likelihood the posterior for each arm will also be Beta-distributed and\n", " can computed and sampled from efficiently:\n", "\n", " .. math::\n", "\n", " \\theta_k \\mid r \\sim \\text{Beta}(\\alpha_k + r, \\beta_k + 1 - r)\n", "\n", " References\n", " ----------\n", " .. [1] Thompson, W. (1933). On the likelihood that one unknown\n", " probability exceeds another in view of the evidence of two samples.\n", " *Biometrika, 25(3/4)*, 285-294.\n", " .. [2] Chapelle, O., & Li, L. (2011). An empirical evaluation of\n", " Thompson sampling. 
*Advances in Neural Information Processing\n", " Systems, 24*, 2249-2257.\n", "\n", " Parameters\n", " ----------\n", " alpha : float or list of length `K`\n", " Parameter for the Beta prior on arm payouts. If a float, this value\n", " will be used in the prior for all of the `K` arms.\n", " beta : float or list of length `K`\n", " Parameter for the Beta prior on arm payouts. If a float, this value\n", " will be used in the prior for all of the `K` arms.\n", " \"\"\"\n", " super().__init__()\n", " self.alphas, self.betas = [], []\n", " self.alpha, self.beta = alpha, beta\n", " self.is_initialized = False\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary containing the current policy parameters\"\"\"\n", " return {\n", " \"ev_estimates\": self.ev_estimates,\n", " \"alphas\": self.alphas,\n", " \"betas\": self.betas,\n", " }\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary containing the policy hyperparameters\"\"\"\n", " return {\n", " \"id\": \"ThompsonSamplingBetaBinomial\",\n", " \"alpha\": self.alpha,\n", " \"beta\": self.beta,\n", " }\n", "\n", " def _initialize_params(self, bandit):\n", " bhp = bandit.hyperparameters\n", " fstr = \"ThompsonSamplingBetaBinomial only defined for BernoulliBandit, got: {}\"\n", " assert bhp[\"id\"] == \"BernoulliBandit\", fstr.format(bhp[\"id\"])\n", "\n", " # initialize the model prior\n", " if is_number(self.alpha):\n", " self.alphas = [self.alpha] * bandit.n_arms\n", " if is_number(self.beta):\n", " self.betas = [self.beta] * bandit.n_arms\n", " assert len(self.alphas) == len(self.betas) == bandit.n_arms\n", "\n", " self.ev_estimates = {i: self._map_estimate(i, 1) for i in range(bandit.n_arms)}\n", " self.is_initialized = True\n", "\n", " def _select_arm(self, bandit, context):\n", " if not self.is_initialized:\n", " self._initialize_prior(bandit)\n", "\n", " # draw a sample from the current model posterior\n", " posterior_sample = np.random.beta(self.alphas, self.betas)\n", "\n", " # greedily select an action based on this sample\n", " return np.argmax(posterior_sample)\n", "\n", " def _update_params(self, arm_id, rwd, context):\n", " \"\"\"\n", " Compute the parameters of the Beta posterior, P(payoff prob | rwd),\n", " for arm `arm_id`.\n", " \"\"\"\n", " self.alphas[arm_id] += rwd\n", " self.betas[arm_id] += 1 - rwd\n", " self.ev_estimates[arm_id] = self._map_estimate(arm_id, rwd)\n", "\n", " def _map_estimate(self, arm_id, rwd):\n", " \"\"\"Compute the current MAP estimate for an arm's payoff probability\"\"\"\n", " A, B = self.alphas, self.betas\n", " if A[arm_id] > 1 and B[arm_id] > 1:\n", " map_payoff_prob = (A[arm_id] - 1) / (A[arm_id] + B[arm_id] - 2)\n", " elif A[arm_id] < 1 and B[arm_id] < 1:\n", " map_payoff_prob = rwd # 0 or 1 equally likely, make a guess\n", " elif A[arm_id] <= 1 and B[arm_id] > 1:\n", " map_payoff_prob = 0\n", " elif A[arm_id] > 1 and B[arm_id] <= 1:\n", " map_payoff_prob = 1\n", " else:\n", " map_payoff_prob = 0.5\n", " return map_payoff_prob\n", "\n", " def _reset_params(self):\n", " \"\"\"\n", " Reset any model-specific parameters. 
This gets called within the\n", " public `self.reset()` method.\n", " \"\"\"\n", " self.alphas, self.betas = [], []\n", " self.ev_estimates = {}\n", "\n", "\n", "class LinUCB(BanditPolicyBase):\n", " def __init__(self, alpha=1):\n", " \"\"\"\n", " A disjoint linear UCB policy [*]_ for contextual linear bandits.\n", "\n", " Notes\n", " -----\n", " LinUCB is only defined for :class:`ContextualLinearBandit ` environments.\n", "\n", " References\n", " ----------\n", " .. [*] Li, L., Chu, W., Langford, J., & Schapire, R. (2010). A\n", " contextual-bandit approach to personalized news article\n", " recommendation. In *Proceedings of the 19th International Conference\n", " on World Wide Web*, 661-670.\n", "\n", " Parameters\n", " ----------\n", " alpha : float\n", " A confidence/optimisim parameter affecting the amount of\n", " exploration. Default is 1.\n", " \"\"\" # noqa\n", " super().__init__()\n", "\n", " self.alpha = alpha\n", " self.A, self.b = [], []\n", " self.is_initialized = False\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"A dictionary containing the current policy parameters\"\"\"\n", " return {\"ev_estimates\": self.ev_estimates, \"A\": self.A, \"b\": self.b}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"A dictionary containing the policy hyperparameters\"\"\"\n", " return {\n", " \"id\": \"LinUCB\",\n", " \"alpha\": self.alpha,\n", " }\n", "\n", " def _initialize_params(self, bandit):\n", " \"\"\"\n", " Initialize any policy-specific parameters that depend on information\n", " from the bandit environment.\n", " \"\"\"\n", " bhp = bandit.hyperparameters\n", " fstr = \"LinUCB only defined for contextual linear bandits, got: {}\"\n", " assert bhp[\"id\"] == \"ContextualLinearBandit\", fstr.format(bhp[\"id\"])\n", "\n", " self.A, self.b = [], []\n", " for _ in range(bandit.n_arms):\n", " self.A.append(np.eye(bandit.D))\n", " self.b.append(np.zeros(bandit.D))\n", "\n", " self.is_initialized = True\n", "\n", " def _select_arm(self, bandit, context):\n", " probs = []\n", " for a in range(bandit.n_arms):\n", " C, A, b = context[:, a], self.A[a], self.b[a]\n", " A_inv = np.linalg.inv(A)\n", " theta_hat = A_inv @ b\n", " p = theta_hat @ C + self.alpha * np.sqrt(C.T @ A_inv @ C)\n", "\n", " probs.append(p)\n", " return np.argmax(probs)\n", "\n", " def _update_params(self, arm_id, rwd, context):\n", " \"\"\"Compute the parameters for A and b.\"\"\"\n", " self.A[arm_id] += context[:, arm_id] @ context[:, arm_id].T\n", " self.b[arm_id] += rwd * context[:, arm_id]\n", "\n", " def _reset_params(self):\n", " \"\"\"\n", " Reset any model-specific parameters. This gets called within the\n", " public `self.reset()` method.\n", " \"\"\"\n", " self.A, self.b = [], []\n", " self.ev_estimates = {}\n"]} {"path": "numpy_ml/factorization/factors.py", "content": ["\"\"\"Algorithms for approximate matrix factorization\"\"\"\n", "\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "\n", "\n", "class VanillaALS:\n", " def __init__(self, K, alpha=1, max_iter=200, tol=1e-4):\n", " r\"\"\"\n", " Approximately factor a real-valued matrix using regularized alternating\n", " least-squares (ALS).\n", "\n", " Notes\n", " -----\n", " The regularized ALS minimization problem is\n", "\n", " .. 
math::\n", "\n", " \\min_{\\mathbf{W}, \\mathbf{H}} ||\\mathbf{X} - \\mathbf{WH}||^2 -\n", " \\alpha \\left(\n", " ||\\mathbf{W}||^2 + ||\\mathbf{H}||^2\n", " \\right)\n", "\n", " where :math:`||\\cdot||` denotes the Frobenius norm, **X** is the\n", " :math:`N \\times M` data matrix, :math:`\\mathbf{W}` and\n", " :math:`\\mathbf{H}` are learned factor matrices with dimensions :math:`N\n", " \\times K` and :math:`K \\times M`, respectively, and :math:`\\alpha` is a\n", " user-defined regularization weight.\n", "\n", " ALS proceeds by alternating between fixing **W** and optimizing for\n", " **H** and fixing **H** and optimizing for **W**. Vanilla ALS has no\n", " convergance guarantees and the objective function is prone to\n", " oscillation across updates, particularly for dense input matrices [1]_.\n", "\n", " References\n", " ----------\n", " .. [1] Gillis, N. (2014). The why and how of nonnegative matrix\n", " factorization. *Regularization, optimization, kernels, and support\n", " vector machines, 12(257)*, 257-291.\n", "\n", " Parameters\n", " ----------\n", " K : int\n", " The number of latent factors to include in the factor matrices W\n", " and H.\n", " alpha : float\n", " The L2 regularization weight on the factor matrices. Larger\n", " values result in more aggressive regularization. Default is 1.\n", " max_iter : int\n", " The maximum number of iterations to run before stopping. Default is\n", " 200.\n", " tol : float\n", " The tolerance for the stopping condition. Default is 1e-4.\n", " \"\"\"\n", " self.K = K\n", " self.W = None\n", " self.H = None\n", " self.tol = tol\n", " self.alpha = alpha\n", " self.max_iter = max_iter\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"Return a dictionary of the current model parameters\"\"\"\n", " return {\"W\": self.W, \"H\": self.H}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary of the model hyperparameters\"\"\"\n", " return {\n", " \"id\": \"ALSFactor\",\n", " \"K\": self.K,\n", " \"tol\": self.tol,\n", " \"alpha\": self.alpha,\n", " \"max_iter\": self.max_iter,\n", " }\n", "\n", " def _init_factor_matrices(self, X, W=None, H=None):\n", " \"\"\"Randomly initialize the factor matrices\"\"\"\n", " N, M = X.shape\n", " scale = np.sqrt(X.mean() / self.K)\n", " self.W = np.random.rand(N, self.K) * scale if W is None else W\n", " self.H = np.random.rand(self.K, M) * scale if H is None else H\n", "\n", " assert self.W.shape == (N, self.K)\n", " assert self.H.shape == (self.K, M)\n", "\n", " def _loss(self, X, Xhat):\n", " \"\"\"Regularized Frobenius loss\"\"\"\n", " alpha, W, H = self.alpha, self.W, self.H\n", " sq_fnorm = lambda x: np.sum(x ** 2) # noqa: E731\n", " return sq_fnorm(X - Xhat) + alpha * (sq_fnorm(W) + sq_fnorm(H))\n", "\n", " def _update_factor(self, X, A):\n", " \"\"\"Perform the ALS update\"\"\"\n", " T1 = np.linalg.inv(A.T @ A + self.alpha * np.eye(self.K))\n", " return X @ A @ T1\n", "\n", " def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):\n", " \"\"\"\n", " Factor a data matrix into two low rank factors via ALS.\n", "\n", " Parameters\n", " ----------\n", " X : numpy array of shape `(N, M)`\n", " The data matrix to factor.\n", " W : numpy array of shape `(N, K)` or None\n", " An initial value for the `W` factor matrix. If None, initialize `W`\n", " randomly. Default is None.\n", " H : numpy array of shape `(K, M)` or None\n", " An initial value for the `H` factor matrix. If None, initialize `H`\n", " randomly. 
Default is None.\n", " n_initializations : int\n", " Number of re-initializations of the algorithm to perform before\n", " taking the answer with the lowest reconstruction error. This value\n", " is ignored and set to 1 if both `W` and `H` are not None. Default\n", " is 10.\n", " verbose : bool\n", " Whether to print the loss at each iteration. Default is False.\n", " \"\"\"\n", " if W is not None and H is not None:\n", " n_initializations = 1\n", "\n", " best_loss = np.inf\n", " for f in range(n_initializations):\n", " if verbose:\n", " print(\"\\nINITIALIZATION {}\".format(f + 1))\n", "\n", " new_W, new_H, loss = self._fit(X, W, H, verbose)\n", "\n", " if loss <= best_loss:\n", " best_loss = loss\n", " best_W, best_H = deepcopy(new_W), deepcopy(new_H)\n", "\n", " self.W, self.H = best_W, best_H\n", "\n", " if verbose:\n", " print(\"\\nFINAL LOSS: {}\".format(best_loss))\n", "\n", " def _fit(self, X, W, H, verbose):\n", " self._init_factor_matrices(X, W, H)\n", " W, H = self.W, self.H\n", "\n", " for i in range(self.max_iter):\n", " W = self._update_factor(X, H.T)\n", " H = self._update_factor(X.T, W).T\n", "\n", " loss = self._loss(X, W @ H)\n", "\n", " if verbose:\n", " print(\"[Iter {}] Loss: {:.8f}\".format(i + 1, loss))\n", "\n", " if loss <= self.tol:\n", " break\n", "\n", " return W, H, loss\n", "\n", "\n", "class NMF:\n", " def __init__(self, K, max_iter=200, tol=1e-4):\n", " r\"\"\"\n", " Nonnegative matrix factorization (NMF) performed using fast\n", " hierarchical alternating least squares (HALS) [*]_.\n", "\n", " Notes\n", " -----\n", " The NMF minimization problem is\n", "\n", " .. math::\n", "\n", " \\min_{\\mathbf{W}, \\mathbf{H}} ||\\mathbf{X} - \\mathbf{WH}||^2\n", " \\ \\ \\ \\ \\text{subject to } \\mathbf{W}, \\mathbf{H} \\geq 0\n", "\n", " where :math:`||\\cdot||` denotes the Frobenius norm, and the notation\n", " :math:`\\mathbf{A} \\geq 0` indicates that each element of **A** is\n", " greater than or equal to 0. In the above equation, **X** is the\n", " :math:`N \\times M` data matrix, :math:`\\mathbf{W}` and\n", " :math:`\\mathbf{H}` are learned factor matrices with dimensions :math:`N\n", " \\times K` and :math:`K \\times M`, respectively.\n", "\n", " As with other ALS-based approaches, there is no guarantee that NMF will\n", " converge to a stationary point, let alone a global minimum. As a result\n", " it is generally good practice to run the algorithm multiple times with\n", " different initializations, taking the outcome that achieves the lowest\n", " reconstruction error.\n", "\n", " References\n", " ----------\n", " .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for\n", " large scale nonnegative matrix and tensor factorizations. *IEICE\n", " Transactions on Fundamentals of Electronics, Communications and\n", " Computer Sciences, 92(3)*, 708-721.\n", "\n", " Parameters\n", " ----------\n", " K : int\n", " The number of latent factors to include in the factor matrices **W**\n", " and **H**.\n", " max_iter : int\n", " The maximum number of iterations to run before stopping. Default is\n", " 200.\n", " tol : float\n", " The tolerance for the stopping condition. 
Default is 1e-4.\n", " \"\"\"\n", " self.K = K\n", " self.W = None\n", " self.H = None\n", " self.tol = tol\n", " self.max_iter = max_iter\n", "\n", " @property\n", " def parameters(self):\n", " \"\"\"Return a dictionary of the current model parameters\"\"\"\n", " return {\"W\": self.W, \"H\": self.H}\n", "\n", " @property\n", " def hyperparameters(self):\n", " \"\"\"Return a dictionary of the model hyperparameters\"\"\"\n", " return {\n", " \"id\": \"NMF\",\n", " \"K\": self.K,\n", " \"tol\": self.tol,\n", " \"max_iter\": self.max_iter,\n", " }\n", "\n", " def _init_factor_matrices(self, X, W, H):\n", " \"\"\"Initialize the factor matrices using vanilla ALS\"\"\"\n", " ALS = None\n", " N, M = X.shape\n", "\n", " # initialize factors using ALS if not already defined\n", " if W is None:\n", " ALS = VanillaALS(self.K, alpha=0, max_iter=200)\n", " ALS.fit(X, verbose=False)\n", " W = ALS.W / np.linalg.norm(ALS.W, axis=0)\n", "\n", " if H is None:\n", " H = np.abs(np.random.rand(self.K, M)) if ALS is None else ALS.H\n", "\n", " assert W.shape == (N, self.K)\n", " assert H.shape == (self.K, M)\n", "\n", " self.H = H\n", " self.W = W\n", "\n", " def _loss(self, X, Xhat):\n", " \"\"\"Return the least-squares reconstruction loss between X and Xhat\"\"\"\n", " return np.sum((X - Xhat) ** 2)\n", "\n", " def _update_H(self, X, W, H):\n", " \"\"\"Perform the fast HALS update for H\"\"\"\n", " eps = np.finfo(float).eps\n", " XtW = X.T @ W # dim: (M, K)\n", " WtW = W.T @ W # dim: (K, K)\n", "\n", " for k in range(self.K):\n", " H[k, :] += XtW[:, k] - H.T @ WtW[:, k]\n", " H[k, :] = np.clip(H[k, :], eps, np.inf) # enforce nonnegativity\n", " return H\n", "\n", " def _update_W(self, X, W, H):\n", " \"\"\"Perform the fast HALS update for W\"\"\"\n", " eps = np.finfo(float).eps\n", " XHt = X @ H.T # dim: (N, K)\n", " HHt = H @ H.T # dim: (K, K)\n", "\n", " for k in range(self.K):\n", " W[:, k] = W[:, k] * HHt[k, k] + XHt[:, k] - W @ HHt[:, k]\n", " W[:, k] = np.clip(W[:, k], eps, np.inf) # enforce nonnegativity\n", "\n", " # renormalize the new column\n", " n = np.linalg.norm(W[:, k])\n", " W[:, k] /= n if n > 0 else 1.0\n", " return W\n", "\n", " def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):\n", " r\"\"\"\n", " Factor a data matrix into two nonnegative low rank factor matrices via\n", " fast HALS.\n", "\n", " Notes\n", " -----\n", " This method implements Algorithm 2 from [*]_. In contrast to vanilla\n", " ALS, HALS proceeds by minimizing a *set* of local cost functions with\n", " the same global minima. Each cost function is defined on a \"residue\" of\n", " the factor matrices **W** and **H**:\n", "\n", " .. math::\n", "\n", " \\mathbf{X}^{(j)} :=\n", " \\mathbf{X} - \\mathbf{WH}^\\top + \\mathbf{w}_j \\mathbf{h}_j^\\top\n", "\n", " where :math:`\\mathbf{X}^{(j)}` is the :math:`j^{th}` residue, **X** is\n", " the input data matrix, and :math:`\\mathbf{w}_j` and\n", " :math:`\\mathbf{h}_j` are the :math:`j^{th}` columns of the current\n", " factor matrices **W** and **H**. HALS proceeds by minimizing the cost\n", " for each residue, first with respect to :math:`\\mathbf{w}_j`, and then\n", " with respect to :math:`\\mathbf{h}_j`. In either case, the cost for\n", " residue `j`, :math:`\\mathcal{L}^{(j)}` is simply:\n", "\n", " .. math::\n", "\n", " \\mathcal{L}^{(j)} :=\n", " || \\mathbf{X}^{(j)} - \\mathbf{w}_j \\mathbf{h}_j^\\top ||\n", "\n", " where :math:`||\\cdot||` denotes the Frobenius norm. 
For NMF,\n", " minimization is performed under the constraint that all elements of\n", " both **W** and **H** are nonnegative.\n", "\n", " References\n", " ----------\n", " .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for\n", " large scale nonnegative matrix and tensor factorizations. *IEICE\n", " Transactions on Fundamentals of Electronics, Communications and\n", " Computer Sciences, 92(3)*, 708-721.\n", "\n", " Parameters\n", " ----------\n", " X : numpy array of shape `(N, M)`\n", " The data matrix to factor.\n", " W : numpy array of shape `(N, K)` or None\n", " An initial value for the `W` factor matrix. If None, initialize\n", " **W** using vanilla ALS. Default is None.\n", " H : numpy array of shape `(K, M)` or None\n", " An initial value for the `H` factor matrix. If None, initialize\n", " **H** using vanilla ALS. Default is None.\n", " n_initializations : int\n", " Number of re-initializations of the algorithm to perform before\n", " taking the answer with the lowest reconstruction error. This value\n", " is ignored and set to 1 if both `W` and `H` are not None. Default\n", " is 10.\n", " verbose : bool\n", " Whether to print the loss at each iteration. Default is False.\n", " \"\"\"\n", " if W is not None and H is not None:\n", " n_initializations = 1\n", "\n", " best_loss = np.inf\n", " for f in range(n_initializations):\n", " if verbose:\n", " print(\"\\nINITIALIZATION {}\".format(f + 1))\n", "\n", " new_W, new_H, loss = self._fit(X, W, H, verbose)\n", "\n", " if loss <= best_loss:\n", " best_loss = loss\n", " best_W, best_H = deepcopy(new_W), deepcopy(new_H)\n", "\n", " self.W, self.H = best_W, best_H\n", " if verbose:\n", " print(\"\\nFINAL LOSS: {}\".format(best_loss))\n", "\n", " def _fit(self, X, W, H, verbose):\n", " self._init_factor_matrices(X, W, H)\n", "\n", " W, H = self.W, self.H\n", " for i in range(self.max_iter):\n", " H = self._update_H(X, W, H)\n", " W = self._update_W(X, W, H)\n", " loss = self._loss(X, W @ H)\n", "\n", " if verbose:\n", " print(\"[Iter {}] Loss: {:.8f}\".format(i + 1, loss))\n", "\n", " if loss <= self.tol:\n", " break\n", " return W, H, loss\n"]} {"path": "numpy_ml/factorization/__init__.py", "content": ["\"\"\"Algorithms for approximate matrix factorization\"\"\"\n", "\n", "from .factors import *\n"]} {"path": "numpy_ml/lda/lda.py", "content": ["import numpy as np\n", "from scipy.special import digamma, polygamma, gammaln\n", "\n", "\n", "class LDA(object):\n", " def __init__(self, T=10):\n", " \"\"\"\n", " Vanilla (non-smoothed) LDA model trained using variational EM.\n", " Generates maximum-likelihood estimates for model paramters\n", " `alpha` and `beta`.\n", "\n", " Parameters\n", " ----------\n", " T : int\n", " Number of topics\n", "\n", " Attributes\n", " ----------\n", " D : int\n", " Number of documents\n", " N : list of length `D`\n", " Number of words in each document\n", " V : int\n", " Number of unique word tokens across all documents\n", " phi : :py:class:`ndarray ` of shape `(D, N[d], T)`\n", " Variational approximation to word-topic distribution\n", " gamma : :py:class:`ndarray ` of shape `(D, T)`\n", " Variational approximation to document-topic distribution\n", " alpha : :py:class:`ndarray ` of shape `(1, T)`\n", " Parameter for the Dirichlet prior on the document-topic distribution\n", " beta : :py:class:`ndarray ` of shape `(V, T)`\n", " Word-topic distribution\n", " \"\"\"\n", " self.T = T\n", "\n", " def _maximize_phi(self):\n", " \"\"\"\n", " Optimize variational parameter phi\n", " \u03d5_{t, n} 
\u221d \u03b2_{t, w_n} e^( \u03a8(\u03b3_t) )\n", " \"\"\"\n", " D = self.D\n", " N = self.N\n", " T = self.T\n", "\n", " phi = self.phi\n", " beta = self.beta\n", " gamma = self.gamma\n", " corpus = self.corpus\n", "\n", " for d in range(D):\n", " for n in range(N[d]):\n", " for t in range(T):\n", " w_n = int(corpus[d][n])\n", " phi[d][n, t] = beta[w_n, t] * np.exp(dg(gamma, d, t))\n", "\n", " # Normalize over topics\n", " phi[d][n, :] = phi[d][n, :] / np.sum(phi[d][n, :])\n", " return phi\n", "\n", " def _maximize_gamma(self):\n", " \"\"\"\n", " Optimize variational parameter gamma\n", " \u03b3_t = \u03b1_t + \\sum_{n=1}^{N_d} \u03d5_{t, n}\n", " \"\"\"\n", " D = self.D\n", " phi = self.phi\n", " alpha = self.alpha\n", "\n", " gamma = np.tile(alpha, (D, 1)) + np.array(\n", " list(map(lambda x: np.sum(x, axis=0), phi))\n", " )\n", " return gamma\n", "\n", " def _maximize_beta(self):\n", " \"\"\"\n", " Optimize model parameter beta\n", " \u03b2_{t, n} \u221d \\sum_{d=1}^D \\sum_{i=1}^{N_d} \u03d5_{d, t, n} [ i = n]\n", " \"\"\"\n", " T = self.T\n", " V = self.V\n", "\n", " phi = self.phi\n", " beta = self.beta\n", " corpus = self.corpus\n", "\n", " for n in range(V):\n", " # Construct binary mask [i == n] to be the same shape as phi\n", " mask = [np.tile((doc == n), (T, 1)).T for doc in corpus]\n", " beta[n, :] = np.sum(\n", " np.array(list(map(lambda x: np.sum(x, axis=0), phi * mask))), axis=0\n", " )\n", "\n", " # Normalize over words\n", " for t in range(T):\n", " beta[:, t] = beta[:, t] / np.sum(beta[:, t])\n", "\n", " return beta\n", "\n", " def _maximize_alpha(self, max_iters=1000, tol=0.1):\n", " \"\"\"\n", " Optimize alpha using Blei's O(n) Newton-Raphson modification\n", " for a Hessian with special structure\n", " \"\"\"\n", " D = self.D\n", " T = self.T\n", "\n", " alpha = self.alpha\n", " gamma = self.gamma\n", "\n", " for _ in range(max_iters):\n", " alpha_old = alpha\n", "\n", " # Calculate gradient\n", " g = D * (digamma(np.sum(alpha)) - digamma(alpha)) + np.sum(\n", " digamma(gamma) - np.tile(digamma(np.sum(gamma, axis=1)), (T, 1)).T,\n", " axis=0,\n", " )\n", "\n", " # Calculate Hessian diagonal component\n", " h = -D * polygamma(1, alpha)\n", "\n", " # Calculate Hessian constant component\n", " z = D * polygamma(1, np.sum(alpha))\n", "\n", " # Calculate constant\n", " c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))\n", "\n", " # Update alpha\n", " alpha = alpha - (g - c) / h\n", "\n", " # Check convergence\n", " if np.sqrt(np.mean(np.square(alpha - alpha_old))) < tol:\n", " break\n", "\n", " return alpha\n", "\n", " def _E_step(self):\n", " \"\"\"\n", " Maximize the VLB with respect to the variational parameters, \u03b3 and \u03d5\n", " \"\"\"\n", " self.phi = self._maximize_phi()\n", " self.gamma = self._maximize_gamma()\n", "\n", " def _M_step(self):\n", " \"\"\"\n", " Maximize the VLB with respect to the model parameters, \u03b1 and \u03b2\n", " \"\"\"\n", " self.beta = self._maximize_beta()\n", " self.alpha = self._maximize_alpha()\n", "\n", " def VLB(self):\n", " \"\"\"\n", " Return the variational lower bound associated with the current model\n", " parameters.\n", " \"\"\"\n", " phi = self.phi\n", " alpha = self.alpha\n", " beta = self.beta\n", " gamma = self.gamma\n", " corpus = self.corpus\n", "\n", " D = self.D\n", " T = self.T\n", " N = self.N\n", "\n", " a, b, c, _d = 0, 0, 0, 0\n", " for d in range(D):\n", " a += (\n", " gammaln(np.sum(alpha))\n", " - np.sum(gammaln(alpha))\n", " + np.sum([(alpha[t] - 1) * dg(gamma, d, t) for t in range(T)])\n", " 
)\n", "\n", " _d += (\n", " gammaln(np.sum(gamma[d, :]))\n", " - np.sum(gammaln(gamma[d, :]))\n", " + np.sum([(gamma[d, t] - 1) * dg(gamma, d, t) for t in range(T)])\n", " )\n", "\n", " for n in range(N[d]):\n", " w_n = int(corpus[d][n])\n", "\n", " b += np.sum([phi[d][n, t] * dg(gamma, d, t) for t in range(T)])\n", " c += np.sum([phi[d][n, t] * np.log(beta[w_n, t]) for t in range(T)])\n", " _d += np.sum([phi[d][n, t] * np.log(phi[d][n, t]) for t in range(T)])\n", "\n", " return a + b + c - _d\n", "\n", " def initialize_parameters(self):\n", " \"\"\"\n", " Provide reasonable initializations for model and variational parameters.\n", " \"\"\"\n", " T = self.T\n", " V = self.V\n", " N = self.N\n", " D = self.D\n", "\n", " # initialize model parameters\n", " self.alpha = 100 * np.random.dirichlet(10 * np.ones(T), 1)[0]\n", " self.beta = np.random.dirichlet(np.ones(V), T).T\n", "\n", " # initialize variational parameters\n", " self.phi = np.array([1 / T * np.ones([N[d], T]) for d in range(D)])\n", " self.gamma = np.tile(self.alpha, (D, 1)) + np.tile(N / T, (T, 1)).T\n", "\n", " def train(self, corpus, verbose=False, max_iter=1000, tol=5):\n", " \"\"\"\n", " Train the LDA model on a corpus of documents (bags of words).\n", "\n", " Parameters\n", " ----------\n", " corpus : list of length `D`\n", " A list of lists, with each sublist containing the tokenized text of\n", " a single document.\n", " verbose : bool\n", " Whether to print the VLB at each training iteration. Default is\n", " True.\n", " max_iter : int\n", " The maximum number of training iterations to perform before\n", " breaking. Default is 1000.\n", " tol : int\n", " Break the training loop if the difference betwen the VLB on the\n", " current iteration and the previous iteration is less than `tol`.\n", " Default is 5.\n", " \"\"\"\n", " self.D = len(corpus)\n", " self.V = len(set(np.concatenate(corpus)))\n", " self.N = np.array([len(d) for d in corpus])\n", " self.corpus = corpus\n", "\n", " self.initialize_parameters()\n", " vlb = -np.inf\n", "\n", " for i in range(max_iter):\n", " old_vlb = vlb\n", "\n", " self._E_step()\n", " self._M_step()\n", "\n", " vlb = self.VLB()\n", " delta = vlb - old_vlb\n", "\n", " if verbose:\n", " print(\"Iteration {}: {:.3f} (delta: {:.2f})\".format(i + 1, vlb, delta))\n", "\n", " if delta < tol:\n", " break\n", "\n", "\n", "#######################################################################\n", "# Utils #\n", "#######################################################################\n", "\n", "\n", "def dg(gamma, d, t):\n", " \"\"\"\n", " E[log X_t] where X_t ~ Dir\n", " \"\"\"\n", " return digamma(gamma[d, t]) - digamma(np.sum(gamma[d, :]))\n"]} {"path": "numpy_ml/lda/__init__.py", "content": ["from .lda import *\n", "from .lda_smoothed import *\n"]} {"path": "numpy_ml/lda/lda_smoothed.py", "content": ["import numpy as np\n", "\n", "\n", "class SmoothedLDA(object):\n", " def __init__(self, T, **kwargs):\n", " \"\"\"\n", " A smoothed LDA model trained using collapsed Gibbs sampling. 
Generates\n", " posterior mean estimates for model parameters `phi` and `theta`.\n", "\n", " Parameters\n", " ----------\n", " T : int\n", " Number of topics\n", "\n", " Attributes\n", " ----------\n", " D : int\n", " Number of documents\n", " N : int\n", " Total number of words across all documents\n", " V : int\n", " Number of unique word tokens across all documents\n", " phi : :py:class:`ndarray ` of shape `(N[d], T)`\n", " The word-topic distribution\n", " theta : :py:class:`ndarray ` of shape `(D, T)`\n", " The document-topic distribution\n", " alpha : :py:class:`ndarray ` of shape `(1, T)`\n", " Parameter for the Dirichlet prior on the document-topic distribution\n", " beta : :py:class:`ndarray ` of shape `(V, T)`\n", " Parameter for the Dirichlet prior on the topic-word distribution\n", " \"\"\"\n", " self.T = T\n", "\n", " self.alpha = (50.0 / self.T) * np.ones(self.T)\n", " if \"alpha\" in kwargs.keys():\n", " self.alpha = (kwargs[\"alpha\"]) * np.ones(self.T)\n", "\n", " self.beta = 0.01\n", " if \"beta\" in kwargs.keys():\n", " self.beta = kwargs[\"beta\"]\n", "\n", " def _init_params(self, texts, tokens):\n", " self.tokens = tokens\n", " self.D = len(texts)\n", " self.V = len(np.unique(self.tokens))\n", " self.N = np.sum(np.array([len(doc) for doc in texts]))\n", " self.word_document = np.zeros(self.N)\n", "\n", " # now that we know the number of tokens in our corpus, we can set beta\n", " self.beta = self.beta * np.ones(self.V)\n", "\n", " count = 0\n", " for doc_idx, doc in enumerate(texts):\n", " for word_idx, word in enumerate(doc):\n", " word_idx = word_idx + count\n", " self.word_document[word_idx] = doc_idx\n", " count = count + len(doc)\n", "\n", " def train(self, texts, tokens, n_gibbs=2000):\n", " \"\"\"\n", " Trains a topic model on the documents in texts.\n", "\n", " Parameters\n", " ----------\n", " texts : array of length `(D,)`\n", " The training corpus represented as an array of subarrays, where\n", " each subarray corresponds to the tokenized words of a single\n", " document.\n", " tokens : array of length `(V,)`\n", " The set of unique tokens in the documents in `texts`.\n", " n_gibbs : int\n", " The number of steps to run the collapsed Gibbs sampler during\n", " training. 
Default is 2000.\n", "\n", " Returns\n", " -------\n", " C_wt : :py:class:`ndarray ` of shape (V, T)\n", " The word-topic count matrix\n", " C_dt : :py:class:`ndarray ` of shape (D, T)\n", " The document-topic count matrix\n", " assignments : :py:class:`ndarray ` of shape (N, n_gibbs)\n", " The topic assignments for each word in the corpus on each Gibbs\n", " step.\n", " \"\"\"\n", " self._init_params(texts, tokens)\n", " C_wt, C_dt, assignments = self._gibbs_sampler(n_gibbs, texts)\n", " self.fit_params(C_wt, C_dt)\n", " return C_wt, C_dt, assignments\n", "\n", " def what_did_you_learn(self, top_n=10):\n", " \"\"\"\n", " Print the `top_n` most probable words under each topic\n", " \"\"\"\n", " for tt in range(self.T):\n", " top_idx = np.argsort(self.phi[:, tt])[::-1][:top_n]\n", " top_tokens = self.tokens[top_idx]\n", " print(\"\\nTop Words for Topic %s:\\n\" % (str(tt)))\n", " for token in top_tokens:\n", " print(\"\\t%s\\n\" % (str(token)))\n", "\n", " def fit_params(self, C_wt, C_dt):\n", " \"\"\"\n", " Estimate `phi`, the word-topic distribution, and `theta`, the\n", " topic-document distribution.\n", "\n", " Parameters\n", " ----------\n", " C_wt : :py:class:`ndarray ` of shape (V, T)\n", " The word-topic count matrix\n", " C_dt : :py:class:`ndarray ` of shape (D, T)\n", " The document-topic count matrix\n", "\n", " Returns\n", " -------\n", " phi : :py:class:`ndarray ` of shape `(V, T)`\n", " The word-topic distribution\n", " theta : :py:class:`ndarray ` of shape `(D, T)`\n", " The document-topic distribution\n", " \"\"\"\n", " self.phi = np.zeros([self.V, self.T])\n", " self.theta = np.zeros([self.D, self.T])\n", "\n", " b, a = self.beta[0], self.alpha[0]\n", " for ii in range(self.V):\n", " for jj in range(self.T):\n", " self.phi[ii, jj] = (C_wt[ii, jj] + b) / (\n", " np.sum(C_wt[:, jj]) + self.V * b\n", " )\n", "\n", " for dd in range(self.D):\n", " for jj in range(self.T):\n", " self.theta[dd, jj] = (C_dt[dd, jj] + a) / (\n", " np.sum(C_dt[dd, :]) + self.T * a\n", " )\n", " return self.phi, self.theta\n", "\n", " def _estimate_topic_prob(self, ii, d, C_wt, C_dt):\n", " \"\"\"\n", " Compute an approximation of the conditional probability that token ii\n", " is assigned to topic jj given all previous topic assignments and the\n", " current document d: p(t_i = j | t_{-i}, w_i, d_i)\n", " \"\"\"\n", " p_vec = np.zeros(self.T)\n", " b, a = self.beta[0], self.alpha[0]\n", " for jj in range(self.T):\n", " # prob of word ii under topic jj\n", " frac1 = (C_wt[ii, jj] + b) / (np.sum(C_wt[:, jj]) + self.V * b)\n", " # prob of topic jj under document d\n", " frac2 = (C_dt[d, jj] + a) / (np.sum(C_dt[d, :]) + self.T * a)\n", " p_vec[jj] = frac1 * frac2\n", " return p_vec / np.sum(p_vec)\n", "\n", " def _gibbs_sampler(self, n_gibbs, texts):\n", " \"\"\"\n", " Collapsed Gibbs sampler for estimating the posterior distribution over\n", " topic assignments.\n", " \"\"\"\n", " # Initialize count matrices\n", " C_wt = np.zeros([self.V, self.T])\n", " C_dt = np.zeros([self.D, self.T])\n", " assignments = np.zeros([self.N, n_gibbs + 1])\n", "\n", " # Randomly initialize topic assignments for words\n", " for ii in range(self.N):\n", " token_idx = np.concatenate(texts)[ii]\n", " assignments[ii, 0] = np.random.randint(0, self.T)\n", "\n", " doc = self.word_document[ii]\n", " C_dt[doc, assignments[ii, 0]] += 1\n", " C_wt[token_idx, assignments[ii, 0]] += 1\n", "\n", " # run collapsed Gibbs sampler\n", " for gg in range(n_gibbs):\n", " print(\"Gibbs iteration {} of {}\".format(gg + 1, n_gibbs))\n", " for 
jj in range(self.N):\n", " token_idx = np.concatenate(texts)[jj]\n", "\n", " # Decrement count matrices by 1\n", " doc = self.word_document[jj]\n", " C_wt[token_idx, assignments[jj, gg]] -= 1\n", " C_dt[doc, assignments[jj, gg]] -= 1\n", "\n", " # Draw new topic from our approximation of the conditional dist.\n", " p_topics = self._estimate_topic_prob(token_idx, doc, C_wt, C_dt)\n", " sampled_topic = np.nonzero(np.random.multinomial(1, p_topics))[0][0]\n", "\n", " # Update count matrices\n", " C_wt[token_idx, sampled_topic] += 1\n", " C_dt[doc, sampled_topic] += 1\n", " assignments[jj, gg + 1] = sampled_topic\n", " return C_wt, C_dt, assignments\n"]} {"path": "numpy_ml/tests/test_TFIDFEncoder.py", "content": ["# flake8: noqa\n", "from collections import Counter\n", "\n", "# gold-standard imports\n", "import huffman\n", "import numpy as np\n", "\n", "from scipy.fftpack import dct\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# try:\n", "# from librosa.core.time_frequency import fft_frequencies\n", "# except ImportError:\n", "# # for librosa >= 0.8.0\n", "# from librosa import fft_frequencies\n", "# from librosa.feature import mfcc as lr_mfcc\n", "# from librosa.util import frame\n", "# from librosa.filters import mel\n", "\n", "# numpy-ml implementations\n", "from numpy_ml.preprocessing.general import Standardizer\n", "from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder\n", "from numpy_ml.preprocessing.dsp import (\n", " DCT,\n", " DFT,\n", " mfcc,\n", " to_frames,\n", " mel_filterbank,\n", " dft_bins,\n", ")\n", "from numpy_ml.utils.testing import random_paragraph\n", "\n", "\n", "def test_tfidf(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " docs = []\n", " n_docs = np.random.randint(1, 10)\n", " for d in range(n_docs):\n", " n_lines = np.random.randint(1, 1000)\n", " lines = [random_paragraph(np.random.randint(1, 10)) for _ in range(n_lines)]\n", " docs.append(\"\\n\".join([\" \".join(l) for l in lines]))\n", "\n", " smooth = bool(np.random.randint(2))\n", "\n", " tfidf = TFIDFEncoder(\n", " lowercase=True,\n", " min_count=0,\n", " smooth_idf=smooth,\n", " max_tokens=None,\n", " input_type=\"strings\",\n", " filter_stopwords=False,\n", " )\n", " gold = TfidfVectorizer(\n", " input=\"content\",\n", " norm=None,\n", " use_idf=True,\n", " lowercase=True,\n", " smooth_idf=smooth,\n", " sublinear_tf=False,\n", " )\n", "\n", " tfidf.fit(docs)\n", " mine = tfidf.transform(ignore_special_chars=True)\n", " theirs = gold.fit_transform(docs).toarray()\n", "\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_LSTMCell.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " 
TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "def test_LSTMCell(N=15):\n", " from numpy_ml.neural_nets.layers import LSTMCell\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 10)\n", " n_out = np.random.randint(1, 10)\n", " n_t = np.random.randint(1, 10)\n", " X = random_tensor((n_ex, n_in, n_t), standardize=True)\n", "\n", " # initialize LSTM layer\n", " L1 = LSTMCell(n_out=n_out)\n", "\n", " # forward prop\n", " Cs = []\n", " y_preds = []\n", " for t in range(n_t):\n", " y_pred, Ct = L1.forward(X[:, :, t])\n", " y_preds.append(y_pred)\n", " Cs.append(Ct)\n", "\n", " # backprop\n", " dLdX = []\n", " dLdAt = np.ones_like(y_preds[t])\n", " for t in reversed(range(n_t)):\n", " dLdXt = L1.backward(dLdAt)\n", " dLdX.insert(0, dLdXt)\n", " dLdX = np.dstack(dLdX)\n", " y_preds = np.dstack(y_preds)\n", " Cs = np.array(Cs)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchLSTMCell(n_in, n_out, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (X, \"X\"),\n", " (np.array(Cs), \"C\"),\n", " (y_preds, \"y\"),\n", " (L1.parameters[\"bo\"].T, \"bo\"),\n", " (L1.parameters[\"bu\"].T, \"bu\"),\n", " (L1.parameters[\"bf\"].T, \"bf\"),\n", " (L1.parameters[\"bc\"].T, \"bc\"),\n", " (L1.parameters[\"Wo\"], \"Wo\"),\n", " (L1.parameters[\"Wu\"], \"Wu\"),\n", " (L1.parameters[\"Wf\"], \"Wf\"),\n", " (L1.parameters[\"Wc\"], \"Wc\"),\n", " (L1.gradients[\"bo\"].T, \"dLdBo\"),\n", " (L1.gradients[\"bu\"].T, \"dLdBu\"),\n", " (L1.gradients[\"bf\"].T, \"dLdBf\"),\n", " (L1.gradients[\"bc\"].T, \"dLdBc\"),\n", " (L1.gradients[\"Wo\"], \"dLdWo\"),\n", " (L1.gradients[\"Wu\"], \"dLdWu\"),\n", " (L1.gradients[\"Wf\"], \"dLdWf\"),\n", " (L1.gradients[\"Wc\"], \"dLdWc\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Case {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " atol=1e-4,\n", " rtol=1e-4,\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} 
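{"path": "numpy_ml/tests/example_sketches.py", "content": ["# flake8: noqa\n", "\"\"\"\n", "Hypothetical usage sketches added for exposition: this file, its path, and the\n", "function names below are illustrative assumptions rather than part of the\n", "original package.\n", "\n", "`thompson_beta_bernoulli_sketch` re-implements, with plain NumPy, the conjugate\n", "Beta-Bernoulli update used by `ThompsonSamplingBetaBinomial` in\n", "`numpy_ml/bandits/policies.py`. `factorization_sketch` exercises `VanillaALS`\n", "and `NMF` from `numpy_ml/factorization/factors.py`, relying only on the\n", "constructors and `fit` signatures defined there.\n", "\"\"\"\n", "import numpy as np\n", "\n", "from numpy_ml.factorization import VanillaALS, NMF\n", "\n", "\n", "def thompson_beta_bernoulli_sketch(payoff_probs=(0.2, 0.5, 0.7), n_trials=500, seed=0):\n", "    \"\"\"Simulate conjugate Thompson sampling on a toy Bernoulli bandit.\"\"\"\n", "    np.random.seed(seed)\n", "    K = len(payoff_probs)\n", "    alphas, betas = np.ones(K), np.ones(K)  # uniform Beta(1, 1) priors\n", "\n", "    total_reward = 0.0\n", "    for _ in range(n_trials):\n", "        # sample arm payoffs from the current Beta posteriors and select the\n", "        # arm whose sampled payoff is largest\n", "        theta = np.random.beta(alphas, betas)\n", "        arm = int(np.argmax(theta))\n", "\n", "        # pull the arm and observe a Bernoulli reward\n", "        r = float(np.random.rand() < payoff_probs[arm])\n", "        total_reward += r\n", "\n", "        # conjugate posterior update: Beta(alpha + r, beta + 1 - r)\n", "        alphas[arm] += r\n", "        betas[arm] += 1 - r\n", "\n", "    posterior_means = alphas / (alphas + betas)\n", "    print(\"Average reward: {:.3f}\".format(total_reward / n_trials))\n", "    print(\"Posterior mean payoffs: {}\".format(np.round(posterior_means, 3)))\n", "    return posterior_means\n", "\n", "\n", "def factorization_sketch(N=50, M=30, K=4, seed=12345):\n", "    \"\"\"Factor a random nonnegative rank-K matrix with VanillaALS and NMF.\"\"\"\n", "    np.random.seed(seed)\n", "    X = np.abs(np.random.rand(N, K)) @ np.abs(np.random.rand(K, M))\n", "\n", "    # regularized alternating least squares\n", "    als = VanillaALS(K, alpha=0.1, max_iter=200, tol=1e-4)\n", "    als.fit(X, n_initializations=5)\n", "\n", "    # nonnegative factorization via fast HALS\n", "    nmf = NMF(K, max_iter=200, tol=1e-4)\n", "    nmf.fit(X, n_initializations=5)\n", "\n", "    print(\"ALS reconstruction error: {:.4f}\".format(np.linalg.norm(X - als.W @ als.H)))\n", "    print(\"NMF reconstruction error: {:.4f}\".format(np.linalg.norm(X - nmf.W @ nmf.H)))\n", "    return als, nmf\n", "\n", "\n", "if __name__ == \"__main__\":\n", "    thompson_beta_bernoulli_sketch()\n", "    factorization_sketch()\n"]}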
{"path": "numpy_ml/tests/test_Multiply.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "def test_MultiplyLayer(N=15):\n", " from numpy_ml.neural_nets.layers import Multiply\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " Xs = []\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " n_entries = np.random.randint(2, 5)\n", " for _ in range(n_entries):\n", " Xs.append(random_tensor((n_ex, n_in), standardize=True))\n", "\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Add layer\n", " L1 = Multiply(act_fn)\n", "\n", " # forward prop\n", " y_pred = L1.forward(Xs)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdXs = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchMultiplyLayer(torch_fn)\n", " golds = gold_mod.extract_grads(Xs)\n", "\n", " params = [(Xs, \"Xs\"), (y_pred, \"Y\")]\n", " params.extend(\n", " [(dldxi, \"dLdX{}\".format(i + 1)) for i, dldxi in enumerate(dLdXs)]\n", " )\n", "\n", " print(\"\\nTrial 
{}\".format(i))\n", " print(\"n_ex={}, n_in={}\".format(n_ex, n_in))\n", " print(\"n_entries={}, act_fn={}\".format(n_entries, str(act_fn)))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_chebyshev.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def test_chebyshev(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = chebyshev(x, y)\n", " theirs = scipy.spatial.distance.chebyshev(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_DecisionTree.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n", "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn.datasets import make_regression, make_blobs\n", "from sklearn.model_selection import train_test_split\n", "\n", "from numpy_ml.trees.gbdt import GradientBoostedDecisionTree\n", "from numpy_ml.trees.dt import DecisionTree, Node, Leaf\n", "from numpy_ml.trees.rf import RandomForest\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def clone_tree(dtree):\n", " children_left = dtree.tree_.children_left\n", " children_right = dtree.tree_.children_right\n", " feature = dtree.tree_.feature\n", " threshold = dtree.tree_.threshold\n", " values = dtree.tree_.value\n", "\n", " def grow(node_id):\n", " l, r = children_left[node_id], children_right[node_id]\n", " if l == r:\n", " return Leaf(values[node_id].argmax())\n", " n = Node(None, None, (feature[node_id], threshold[node_id]))\n", " n.left = grow(l)\n", " n.right = grow(r)\n", " return n\n", "\n", " node_id = 0\n", " root = Node(None, None, (feature[node_id], threshold[node_id]))\n", " root.left = grow(children_left[node_id])\n", " root.right = grow(children_right[node_id])\n", " return root\n", "\n", "\n", "def compare_trees(mine, gold):\n", " clone = clone_tree(gold)\n", " mine = mine.root\n", "\n", " def test(mine, clone):\n", " if isinstance(clone, Node) and isinstance(mine, Node):\n", " assert mine.feature == clone.feature, \"Node {} not equal\".format(depth)\n", " np.testing.assert_allclose(mine.threshold, clone.threshold)\n", " test(mine.left, clone.left, depth + 1)\n", " test(mine.right, clone.right, depth + 1)\n", " elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", " 
np.testing.assert_allclose(mine.value, clone.value)\n", " return\n", " else:\n", " raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", " depth = 0\n", " ok = True\n", " while ok:\n", " if isinstance(clone, Node) and isinstance(mine, Node):\n", " assert mine.feature == clone.feature\n", " np.testing.assert_allclose(mine.threshold, clone.threshold)\n", " test(mine.left, clone.left, depth + 1)\n", " test(mine.right, clone.right, depth + 1)\n", " elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", " np.testing.assert_allclose(mine.value, clone.value)\n", " return\n", " else:\n", " raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", "\n", "def test_DecisionTree(N=1):\n", " i = 1\n", " np.random.seed(12345)\n", " while i <= N:\n", " n_ex = np.random.randint(2, 100)\n", " n_feats = np.random.randint(2, 100)\n", " max_depth = np.random.randint(1, 5)\n", "\n", " classifier = np.random.choice([True, False])\n", " if classifier:\n", " # create classification problem\n", " n_classes = np.random.randint(2, 10)\n", " X, Y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i\n", " )\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " def loss(yp, y):\n", " return 1 - accuracy_score(yp, y)\n", "\n", " criterion = np.random.choice([\"entropy\", \"gini\"])\n", " mine = DecisionTree(\n", " classifier=classifier, max_depth=max_depth, criterion=criterion\n", " )\n", " gold = DecisionTreeClassifier(\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " splitter=\"best\",\n", " random_state=i,\n", " )\n", " else:\n", " # create regeression problem\n", " X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " criterion = \"mse\"\n", " loss = mean_squared_error\n", " mine = DecisionTree(\n", " criterion=criterion, max_depth=max_depth, classifier=classifier\n", " )\n", " gold = DecisionTreeRegressor(\n", " criterion=criterion, max_depth=max_depth, splitter=\"best\"\n", " )\n", "\n", " print(\"Trial {}\".format(i))\n", " print(\"\\tClassifier={}, criterion={}\".format(classifier, criterion))\n", " print(\"\\tmax_depth={}, n_feats={}, n_ex={}\".format(max_depth, n_feats, n_ex))\n", " if classifier:\n", " print(\"\\tn_classes: {}\".format(n_classes))\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " gold.fit(X, Y)\n", "\n", " # get preds on training set\n", " y_pred_mine = mine.predict(X)\n", " y_pred_gold = gold.predict(X)\n", "\n", " loss_mine = loss(y_pred_mine, Y)\n", " loss_gold = loss(y_pred_gold, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_gold_test = gold.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_gold_test = loss(y_pred_gold_test, Y_test)\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine, loss_gold)\n", " print(\"\\tLoss on training: {}\".format(loss_mine))\n", " except AssertionError as e:\n", " print(\"\\tTraining losses not equal:\\n{}\".format(e))\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)\n", " print(\"\\tLoss on test: {}\".format(loss_mine_test))\n", " except AssertionError as e:\n", " print(\"\\tTest losses not equal:\\n{}\".format(e))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_Conv1D.py", "content": ["# flake8: noqa\n", "import 
time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "def test_Conv1D(N=15):\n", " from numpy_ml.neural_nets.layers import Conv1D\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " l_in = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_width = min(l_in, np.random.randint(1, 5))\n", " p, s = np.random.randint(0, 5), np.random.randint(1, 3)\n", " d = np.random.randint(0, 5)\n", "\n", " fc = f_width * (d + 1) - d\n", " l_out = int(1 + (l_in + 2 * p - fc) / s)\n", "\n", " if l_out <= 0:\n", " continue\n", "\n", " X = random_tensor((n_ex, l_in, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Conv2D layer\n", " L1 = Conv1D(\n", " out_ch=n_out,\n", " kernel_width=f_width,\n", " act_fn=act_fn,\n", " pad=p,\n", " stride=s,\n", " dilation=d,\n", " )\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", 
"\n", " # get gold standard gradients\n", " gold_mod = TorchConv1DLayer(\n", " n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"pad={}, stride={}, f_width={}, n_ex={}\".format(p, s, f_width, n_ex))\n", " print(\"l_in={}, n_in={}\".format(l_in, n_in))\n", " print(\"l_out={}, n_out={}\".format(l_out, n_out))\n", " print(\"dilation={}\".format(d))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_pad1D(N=15):\n", " from numpy_ml.neural_nets.layers import Conv1D\n", " from .nn_torch_models import TorchCausalConv1d, torchify\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " p = np.random.choice([\"same\", \"causal\"])\n", " n_ex = np.random.randint(1, 10)\n", " l_in = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_width = min(l_in, np.random.randint(1, 5))\n", " s = np.random.randint(1, 3)\n", " d = np.random.randint(0, 5)\n", "\n", " X = random_tensor((n_ex, l_in, n_in), standardize=True)\n", " X_pad, _ = pad1D(X, p, kernel_width=f_width, stride=s, dilation=d)\n", "\n", " # initialize Conv2D layer\n", " L1 = Conv1D(out_ch=n_out, kernel_width=f_width, pad=0, stride=s, dilation=d)\n", "\n", " # forward prop\n", " try:\n", " y_pred = L1.forward(X_pad)\n", " except ValueError:\n", " continue\n", "\n", " # ignore n. output channels\n", " print(\"Trial {}\".format(i))\n", " print(\"p={} d={} s={} l_in={} f_width={}\".format(p, d, s, l_in, f_width))\n", " print(\"n_ex={} n_in={} n_out={}\".format(n_ex, n_in, n_out))\n", " assert y_pred.shape[:2] == X.shape[:2], print(\n", " \"y_pred.shape={} X.shape={}\".format(y_pred.shape, X.shape)\n", " )\n", "\n", " if p == \"causal\":\n", " gold = TorchCausalConv1d(\n", " in_channels=n_in,\n", " out_channels=n_out,\n", " kernel_size=f_width,\n", " stride=s,\n", " dilation=d + 1,\n", " bias=True,\n", " )\n", " if s != 1:\n", " print(\n", " \"TorchCausalConv1D does not do `same` padding for stride > 1. 
Skipping\"\n", " )\n", " continue\n", "\n", " XT = torchify(np.moveaxis(X, [0, 1, 2], [0, -1, -2]))\n", " else:\n", " gold = nn.Conv1d(\n", " in_channels=n_in,\n", " out_channels=n_out,\n", " kernel_size=f_width,\n", " padding=0,\n", " stride=s,\n", " dilation=d + 1,\n", " bias=True,\n", " )\n", " XT = torchify(np.moveaxis(X_pad, [0, 1, 2], [0, -1, -2]))\n", "\n", " # import weights and biases\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " b = L1.parameters[\"b\"]\n", " W = np.moveaxis(L1.parameters[\"W\"], [0, 1, 2], [-1, -2, -3])\n", " assert gold.weight.shape == W.shape\n", " assert gold.bias.shape == b.flatten().shape\n", "\n", " gold.weight = nn.Parameter(torch.FloatTensor(W))\n", " gold.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " outT = gold(XT)\n", " if outT.ndimension() == 2:\n", " import ipdb\n", "\n", " ipdb.set_trace()\n", "\n", " gold_out = np.moveaxis(outT.detach().numpy(), [0, 1, 2], [0, -1, -2])\n", " assert gold_out.shape[:2] == X.shape[:2]\n", "\n", " np.testing.assert_almost_equal(\n", " y_pred,\n", " gold_out,\n", " err_msg=err_fmt(\n", " [(y_pred.shape, \"out.shape\"), (y_pred, \"out\")],\n", " {\"out.shape\": gold_out.shape, \"out\": gold_out},\n", " 1,\n", " ),\n", " decimal=4,\n", " )\n", " print(\"PASSED\\n\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_Pool2D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "def test_Pool2D(N=15):\n", " from 
numpy_ml.neural_nets.layers import Pool2D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 3)\n", " f_shape = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " p, s = np.random.randint(0, max(1, min(f_shape) // 2)), np.random.randint(1, 3)\n", " # mode = [\"max\", \"average\"][np.random.randint(0, 2)]\n", " mode = \"average\"\n", " out_rows = int(1 + (in_rows + 2 * p - f_shape[0]) / s)\n", " out_cols = int(1 + (in_cols + 2 * p - f_shape[1]) / s)\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", " print(\"\\nmode: {}\".format(mode))\n", " print(\"pad={}, stride={}, f_shape={}, n_ex={}\".format(p, s, f_shape, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"out_rows={}, out_cols={}, n_out={}\".format(out_rows, out_cols, n_in))\n", "\n", " # initialize Pool2D layer\n", " L1 = Pool2D(kernel_shape=f_shape, pad=p, stride=s, mode=mode)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchPool2DLayer(n_in, L1.hyperparameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [(L1.X[0], \"X\"), (y_pred, \"y\"), (dLdX, \"dLdX\")]\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_GPRegression.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier\n", "from sklearn.gaussian_process import GaussianProcessRegressor\n", "\n", "from numpy_ml.nonparametric.knn import KNN\n", "from numpy_ml.nonparametric.gp import GPRegression\n", "from numpy_ml.utils.distance_metrics import euclidean\n", "\n", "\n", "\n", "def test_gp_regression(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " alpha = np.random.rand()\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " K = np.random.randint(1, N)\n", " J = np.random.randint(1, 3)\n", "\n", " X = np.random.rand(N, M)\n", " y = np.random.rand(N, J)\n", " X_test = np.random.rand(K, M)\n", "\n", " gp = GPRegression(kernel=\"RBFKernel(sigma=1)\", alpha=alpha)\n", " gold = GaussianProcessRegressor(\n", " kernel=None, alpha=alpha, optimizer=None, normalize_y=False\n", " )\n", "\n", " gp.fit(X, y)\n", " gold.fit(X, y)\n", "\n", " preds, _ = gp.predict(X_test)\n", " gold_preds = gold.predict(X_test)\n", " np.testing.assert_almost_equal(preds.reshape(-1), gold_preds.reshape(-1))\n", "\n", " mll = gp.marginal_log_likelihood().reshape(-1)\n", " gold_mll = gold.log_marginal_likelihood().reshape(-1)\n", " np.testing.assert_almost_equal(mll, gold_mll)\n", "\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_Tanh.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, 
random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "def test_tanh_grad(N=50):\n", " from numpy_ml.neural_nets.activations import Tanh\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Tanh()\n", " gold = torch_gradient_generator(torch.tanh)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_WGAN_GPLoss.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " 
pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Loss Functions #\n", "#######################################################################\n", "\n", "\n", "def test_WGAN_GP_loss(N=5):\n", " from numpy_ml.neural_nets.losses import WGAN_GPLoss\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N:\n", " lambda_ = np.random.randint(0, 10)\n", " n_ex = np.random.randint(1, 10)\n", " n_feats = np.random.randint(2, 10)\n", " Y_real = random_tensor([n_ex], standardize=True)\n", " Y_fake = random_tensor([n_ex], standardize=True)\n", " gradInterp = random_tensor([n_ex, n_feats], standardize=True)\n", "\n", " mine = WGAN_GPLoss(lambda_=lambda_)\n", " C_loss = mine(Y_fake, \"C\", Y_real, gradInterp)\n", " G_loss = mine(Y_fake, \"G\")\n", "\n", " C_dY_fake, dY_real, dGradInterp = mine.grad(Y_fake, \"C\", Y_real, gradInterp)\n", " G_dY_fake = mine.grad(Y_fake, \"G\")\n", "\n", " golds = TorchWGANGPLoss(lambda_).extract_grads(Y_real, Y_fake, gradInterp)\n", " if np.isnan(golds[\"C_dGradInterp\"]).any():\n", " continue\n", "\n", " params = [\n", " (Y_real, \"Y_real\"),\n", " (Y_fake, \"Y_fake\"),\n", " (gradInterp, \"gradInterp\"),\n", " (C_loss, \"C_loss\"),\n", " (G_loss, \"G_loss\"),\n", " (-dY_real, \"C_dY_real\"),\n", " (-C_dY_fake, \"C_dY_fake\"),\n", " (dGradInterp, \"C_dGradInterp\"),\n", " (G_dY_fake, \"G_dY_fake\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " rtol=0.1,\n", " atol=1e-2,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n"]} {"path": "numpy_ml/tests/test_MultiHeadedAttentionModule.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import 
deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):\n", " \"\"\"\n", " Manual gradient calc for vanilla RNN parameters\n", " \"\"\"\n", " if param_name in [\"Ba\", \"Bx\"]:\n", " param_name = param_name.lower()\n", " elif param_name in [\"X\", \"y\"]:\n", " return None\n", "\n", " param_orig = model.parameters[param_name]\n", " model.flush_gradients()\n", " grads = np.zeros_like(param_orig)\n", "\n", " for flat_ix, val in enumerate(param_orig.flat):\n", " param = deepcopy(param_orig)\n", " md_ix = np.unravel_index(flat_ix, param.shape)\n", "\n", " # plus\n", " y_preds_plus = []\n", " param[md_ix] = val + epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_plus = model.forward(X[:, :, t])\n", " y_preds_plus += [y_pred_plus]\n", " loss_plus = loss_func(y_preds_plus)\n", " model.flush_gradients()\n", "\n", " # minus\n", " y_preds_minus = []\n", " param[md_ix] = val - epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_minus = model.forward(X[:, :, t])\n", " y_preds_minus += [y_pred_minus]\n", " loss_minus = loss_func(y_preds_minus)\n", " model.flush_gradients()\n", "\n", " grad = (loss_plus - loss_minus) / (2 * epsilon)\n", " grads[md_ix] = grad\n", " return grads.T\n", "\n", "\n", "#######################################################################\n", "# Modules #\n", 
"#######################################################################\n", "\n", "\n", "def test_MultiHeadedAttentionModule(N=15):\n", " from numpy_ml.neural_nets.modules import MultiHeadedAttentionModule\n", "\n", " N = np.inf if N is None else N\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " latent_dim = np.random.randint(1, 20)\n", " n_heads = np.random.randint(2, 10)\n", " d_k = d_v = n_heads * latent_dim\n", "\n", " Q = random_tensor((n_ex, d_k), standardize=True)\n", " K = random_tensor((n_ex, d_k), standardize=True)\n", " V = random_tensor((n_ex, d_v), standardize=True)\n", "\n", " mine = MultiHeadedAttentionModule(n_heads=n_heads, dropout_p=0)\n", "\n", " # forward prop\n", " y_pred = mine.forward(Q, K, V)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdQ, dLdK, dLdV = mine.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " params = mine.parameters\n", " hparams = mine.hyperparameters\n", " gold_mod = TorchMultiHeadedAttentionModule(params, hparams)\n", " golds = gold_mod.extract_grads(Q, K, V)\n", "\n", " dv = mine.derived_variables\n", " params = mine.parameters[\"components\"]\n", " grads = mine.gradients[\"components\"]\n", " params = [\n", " (Q, \"Q\"),\n", " (K, \"K\"),\n", " (V, \"V\"),\n", " (mine.n_heads, \"n_heads\"),\n", " (mine.latent_dim, \"latent_dim\"),\n", " (params[\"O\"][\"W\"], \"O_W\"),\n", " (params[\"K\"][\"W\"], \"K_W\"),\n", " (params[\"V\"][\"W\"], \"V_W\"),\n", " (params[\"Q\"][\"W\"], \"Q_W\"),\n", " (params[\"O\"][\"b\"], \"O_b\"),\n", " (params[\"K\"][\"b\"], \"K_b\"),\n", " (params[\"V\"][\"b\"], \"V_b\"),\n", " (params[\"Q\"][\"b\"], \"Q_b\"),\n", " (dv[\"Q_proj\"], \"Q_proj\"),\n", " (dv[\"K_proj\"], \"K_proj\"),\n", " (dv[\"V_proj\"], \"V_proj\"),\n", " (dv[\"attention_weights\"][0], \"weights\"),\n", " (dv[\"attention_out\"], \"attn_out\"),\n", " (y_pred, \"Y\"),\n", " (dLdy, \"dLdy\"),\n", " (dv[\"dQ_proj\"], \"dQ_proj\"),\n", " (dv[\"dK_proj\"], \"dK_proj\"),\n", " (dv[\"dV_proj\"], \"dV_proj\"),\n", " (grads[\"O\"][\"W\"], \"dO_W\"),\n", " (grads[\"V\"][\"W\"], \"dV_W\"),\n", " (grads[\"K\"][\"W\"], \"dK_W\"),\n", " (grads[\"Q\"][\"W\"], \"dQ_W\"),\n", " (grads[\"O\"][\"b\"], \"dO_b\"),\n", " (grads[\"V\"][\"b\"], \"dV_b\"),\n", " (grads[\"K\"][\"b\"], \"dK_b\"),\n", " (grads[\"Q\"][\"b\"], \"dQ_b\"),\n", " (dLdQ, \"dQ\"),\n", " (dLdK, \"dK\"),\n", " (dLdV, \"dV\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\n", " \"n_ex={} d_k=d_v={} latent_dim={} n_heads={}\".format(\n", " n_ex, d_k, latent_dim, n_heads\n", " )\n", " )\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_Conv2D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", 
")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "def test_Conv2D(N=15):\n", " from numpy_ml.neural_nets.layers import Conv2D\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_shape = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " p, s = np.random.randint(0, 5), np.random.randint(1, 3)\n", " d = np.random.randint(0, 5)\n", "\n", " fr, fc = f_shape[0] * (d + 1) - d, f_shape[1] * (d + 1) - d\n", " out_rows = int(1 + (in_rows + 2 * p - fr) / s)\n", " out_cols = int(1 + (in_cols + 2 * p - fc) / s)\n", "\n", " if out_rows <= 0 or out_cols <= 0:\n", " continue\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Conv2D layer\n", " L1 = Conv2D(\n", " out_ch=n_out,\n", " kernel_shape=f_shape,\n", " act_fn=act_fn,\n", " pad=p,\n", " stride=s,\n", " dilation=d,\n", " )\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchConv2DLayer(\n", " n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " 
(L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"pad={}, stride={}, f_shape={}, n_ex={}\".format(p, s, f_shape, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"out_rows={}, out_cols={}, n_out={}\".format(out_rows, out_cols, n_out))\n", " print(\"dilation={}\".format(d))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_BatchNorm2D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "def test_BatchNorm2D(N=15):\n", " from numpy_ml.neural_nets.layers import BatchNorm2D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 3)\n", "\n", " # initialize BatchNorm2D layer\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", " L1 = BatchNorm2D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # standard sum loss\n", " dLdy = np.ones_like(X)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod 
= TorchBatchNormLayer(\n", " n_in, L1.parameters, mode=\"2D\", epsilon=L1.epsilon, momentum=L1.momentum\n", " )\n", " golds = gold_mod.extract_grads(X, Y_true=None)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (L1.hyperparameters[\"momentum\"], \"momentum\"),\n", " (L1.hyperparameters[\"epsilon\"], \"epsilon\"),\n", " (L1.parameters[\"scaler\"].T, \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", " (L1.parameters[\"running_mean\"], \"running_mean\"),\n", " # (L1.parameters[\"running_var\"], \"running_var\"),\n", " (y_pred, \"y\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", "\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_MLENGram.py", "content": ["# flake8: noqa\n", "import tempfile\n", "\n", "import nltk\n", "import numpy as np\n", "\n", "from ..preprocessing.nlp import tokenize_words\n", "from ..ngram import AdditiveNGram, MLENGram\n", "from ..utils.testing import random_paragraph\n", "\n", "\n", "class MLEGold:\n", " def __init__(\n", " self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True\n", " ):\n", " self.N = N\n", " self.K = K\n", " self.unk = unk\n", " self.filter_stopwords = filter_stopwords\n", " self.filter_punctuation = filter_punctuation\n", "\n", " self.hyperparameters = {\n", " \"N\": N,\n", " \"K\": K,\n", " \"unk\": unk,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " }\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " N = self.N\n", " H = self.hyperparameters\n", " models, counts = {}, {}\n", " grams = {n: [] for n in range(1, N + 1)}\n", " gg = {n: [] for n in range(1, N + 1)}\n", " filter_punc, filter_stop = H[\"filter_punctuation\"], H[\"filter_stopwords\"]\n", "\n", " n_words = 0\n", " tokens = set([])\n", "\n", " with open(corpus_fp, \"r\", encoding=encoding) as text:\n", " for line in text:\n", " words = tokenize_words(line, filter_punc, filter_stop)\n", "\n", " if vocab is not None:\n", " words = vocab.filter(words, H[\"unk\"])\n", "\n", " if len(words) == 0:\n", " continue\n", "\n", " n_words += len(words)\n", " tokens.update(words)\n", "\n", " # calculate n, n-1, ... 
1-grams\n", " for n in range(1, N + 1):\n", " grams[n].append(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", "\n", " gg[n].extend(\n", " list(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", " )\n", "\n", " for n in range(1, N + 1):\n", " counts[n] = nltk.FreqDist(gg[n])\n", " models[n] = nltk.lm.MLE(order=n)\n", " models[n].fit(grams[n], tokens)\n", "\n", " self.counts = counts\n", " self.n_words = n_words\n", " self._models = models\n", " self.n_tokens = len(vocab) if vocab is not None else len(tokens)\n", "\n", " def log_prob(self, words, N):\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", " if N > len(words):\n", " err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", " raise ValueError(err)\n", "\n", " total_prob = 0\n", " for ngram in nltk.ngrams(words, N):\n", " total_prob += self._log_ngram_prob(ngram)\n", " return total_prob\n", "\n", " def _log_ngram_prob(self, ngram):\n", " N = len(ngram)\n", " return self._models[N].logscore(ngram[-1], ngram[:-1])\n", "\n", "\n", "class AdditiveGold:\n", " def __init__(\n", " self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True\n", " ):\n", " self.N = N\n", " self.K = K\n", " self.unk = unk\n", " self.filter_stopwords = filter_stopwords\n", " self.filter_punctuation = filter_punctuation\n", "\n", " self.hyperparameters = {\n", " \"N\": N,\n", " \"K\": K,\n", " \"unk\": unk,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " }\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " N = self.N\n", " H = self.hyperparameters\n", " models, counts = {}, {}\n", " grams = {n: [] for n in range(1, N + 1)}\n", " gg = {n: [] for n in range(1, N + 1)}\n", " filter_punc, filter_stop = H[\"filter_punctuation\"], H[\"filter_stopwords\"]\n", "\n", " n_words = 0\n", " tokens = set()\n", "\n", " with open(corpus_fp, \"r\", encoding=encoding) as text:\n", " for line in text:\n", " words = tokenize_words(line, filter_punc, filter_stop)\n", "\n", " if vocab is not None:\n", " words = vocab.filter(words, H[\"unk\"])\n", "\n", " if len(words) == 0:\n", " continue\n", "\n", " n_words += len(words)\n", " tokens.update(words)\n", "\n", " # calculate n, n-1, ... 
1-grams\n", " for n in range(1, N + 1):\n", " grams[n].append(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", "\n", " gg[n].extend(\n", " list(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", " )\n", "\n", " for n in range(1, N + 1):\n", " counts[n] = nltk.FreqDist(gg[n])\n", " models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)\n", " models[n].fit(grams[n], tokens)\n", "\n", " self.counts = counts\n", " self._models = models\n", " self.n_words = n_words\n", " self.n_tokens = len(vocab) if vocab is not None else len(tokens)\n", "\n", " def log_prob(self, words, N):\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", " if N > len(words):\n", " err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", " raise ValueError(err)\n", "\n", " total_prob = 0\n", " for ngram in nltk.ngrams(words, N):\n", " total_prob += self._log_ngram_prob(ngram)\n", " return total_prob\n", "\n", " def _log_ngram_prob(self, ngram):\n", " N = len(ngram)\n", " return self._models[N].logscore(ngram[-1], ngram[:-1])\n", "\n", "\n", "def test_mle():\n", " N = np.random.randint(2, 5)\n", " gold = MLEGold(N, unk=True, filter_stopwords=False, filter_punctuation=False)\n", " mine = MLENGram(N, unk=True, filter_stopwords=False, filter_punctuation=False)\n", "\n", " with tempfile.NamedTemporaryFile() as temp:\n", " temp.write(bytes(\" \".join(random_paragraph(1000)), encoding=\"utf-8-sig\"))\n", " gold.train(temp.name, encoding=\"utf-8-sig\")\n", " mine.train(temp.name, encoding=\"utf-8-sig\")\n", "\n", " for k in mine.counts[N].keys():\n", " if k[0] == k[1] and k[0] in (\"\", \"\"):\n", " continue\n", "\n", " err_str = \"{}, mine: {}, gold: {}\"\n", " assert mine.counts[N][k] == gold.counts[N][k], err_str.format(\n", " k, mine.counts[N][k], gold.counts[N][k]\n", " )\n", "\n", " M = mine.log_prob(k, N)\n", " G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e\n", " np.testing.assert_allclose(M, G)\n", " print(\"PASSED\")"]} {"path": "numpy_ml/tests/test_GaussianNBClassifier.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "from sklearn import datasets\n", "from sklearn.model_selection import train_test_split\n", "\n", "from sklearn import naive_bayes\n", "\n", "from numpy_ml.linear_models import GaussianNBClassifier\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def test_GaussianNB(N=10):\n", " np.random.seed(12345)\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " eps = np.finfo(float).eps\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 300)\n", " n_feats = np.random.randint(1, 100)\n", " n_classes = np.random.randint(2, 10)\n", "\n", " X = random_tensor((n_ex, n_feats), standardize=True)\n", " y = np.random.randint(0, n_classes, size=n_ex)\n", "\n", " X_test = random_tensor((n_ex, n_feats), standardize=True)\n", "\n", " NB = GaussianNBClassifier(eps=1e-09)\n", " NB.fit(X, y)\n", "\n", " preds = NB.predict(X_test)\n", "\n", " sklearn_NB = naive_bayes.GaussianNB()\n", " sklearn_NB.fit(X, y)\n", "\n", " sk_preds = sklearn_NB.predict(X_test)\n", "\n", " for j in range(len(NB.labels)):\n", " P = NB.parameters\n", " jointi = np.log(sklearn_NB.class_prior_[j])\n", " jointi_mine = np.log(P[\"prior\"][j])\n", "\n", " np.testing.assert_almost_equal(jointi, jointi_mine)\n", "\n", " n_jk 
= -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[j, :] + eps))\n", " n_jk_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P[\"sigma\"][j] + eps))\n", "\n", " np.testing.assert_almost_equal(n_jk_mine, n_jk)\n", "\n", " n_jk2 = n_jk - 0.5 * np.sum(\n", " ((X_test - sklearn_NB.theta_[j, :]) ** 2) / (sklearn_NB.sigma_[j, :]), 1\n", " )\n", "\n", " n_jk2_mine = n_jk_mine - 0.5 * np.sum(\n", " ((X_test - P[\"mean\"][j]) ** 2) / (P[\"sigma\"][j]), 1\n", " )\n", " np.testing.assert_almost_equal(n_jk2_mine, n_jk2, decimal=4)\n", "\n", " llh = jointi + n_jk2\n", " llh_mine = jointi_mine + n_jk2_mine\n", "\n", " np.testing.assert_almost_equal(llh_mine, llh, decimal=4)\n", "\n", " np.testing.assert_almost_equal(P[\"prior\"], sklearn_NB.class_prior_)\n", " np.testing.assert_almost_equal(P[\"mean\"], sklearn_NB.theta_)\n", " np.testing.assert_almost_equal(P[\"sigma\"], sklearn_NB.sigma_)\n", " np.testing.assert_almost_equal(\n", " sklearn_NB._joint_log_likelihood(X_test),\n", " NB._log_posterior(X_test),\n", " decimal=4,\n", " )\n", " np.testing.assert_almost_equal(preds, sk_preds)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_BatchNorm1D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def test_BatchNorm1D(N=15):\n", " from numpy_ml.neural_nets.layers import BatchNorm1D\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " 
np.random.seed(12345)\n", "\n", "    i = 1\n", "    while i < N + 1:\n", "        n_ex = np.random.randint(2, 1000)\n", "        n_in = np.random.randint(1, 1000)\n", "        X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", "        # initialize BatchNorm1D layer\n", "        L1 = BatchNorm1D()\n", "\n", "        # forward prop\n", "        y_pred = L1.forward(X)\n", "\n", "        # backprop\n", "        dLdy = np.ones_like(y_pred)\n", "        dLdX = L1.backward(dLdy)\n", "\n", "        # get gold standard gradients\n", "        gold_mod = TorchBatchNormLayer(\n", "            n_in, L1.parameters, \"1D\", epsilon=L1.epsilon, momentum=L1.momentum\n", "        )\n", "        golds = gold_mod.extract_grads(X)\n", "\n", "        params = [\n", "            (L1.X[0], \"X\"),\n", "            (y_pred, \"y\"),\n", "            (L1.parameters[\"scaler\"].T, \"scaler\"),\n", "            (L1.parameters[\"intercept\"], \"intercept\"),\n", "            (L1.parameters[\"running_mean\"], \"running_mean\"),\n", "            # (L1.parameters[\"running_var\"], \"running_var\"),\n", "            (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", "            (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", "            (dLdX, \"dLdX\"),\n", "        ]\n", "\n", "        print(\"Trial {}\".format(i))\n", "        for ix, (mine, label) in enumerate(params):\n", "            assert_almost_equal(\n", "                mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1\n", "            )\n", "            print(\"\\tPASSED {}\".format(label))\n", "        i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_RBFKernel.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", "    hamming,\n", "    euclidean,\n", "    chebyshev,\n", "    manhattan,\n", "    minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", "    Edge,\n", "    DiGraph,\n", "    UndirectedGraph,\n", "    random_DAG,\n", "    random_unweighted_graph,\n", ")\n", "\n", "#######################################################################\n", "#                               Kernels                               #\n", "#######################################################################\n", "\n", "\n", "def test_radial_basis_kernel(N=1):\n", "    np.random.seed(12345)\n", "    i = 0\n", "    while i < N:\n", "        N = np.random.randint(1, 100)\n", "        M = np.random.randint(1, 100)\n", "        C = np.random.randint(1, 1000)\n", "        gamma = np.random.rand()\n", "\n", "        X = np.random.rand(N, C)\n", "        Y = np.random.rand(M, C)\n", "\n", "        # sklearn (gamma) <-> mine (sigma) conversion:\n", "        # gamma = 1 / (2 * sigma^2)\n", "        # sigma = np.sqrt(1 / (2 * gamma))\n", "\n", "        mine = RBFKernel(sigma=np.sqrt(1 / (2 * gamma)))(X, Y)\n", "        gold = sk_rbf(X, Y, gamma=gamma)\n", "\n", "        np.testing.assert_almost_equal(mine, gold)\n", "        print(\"PASSED\")\n", "        i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_GradientBoostedDecisionTree.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n", "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn.datasets import make_regression, make_blobs\n", "from sklearn.model_selection import train_test_split\n", "\n", "from numpy_ml.trees.gbdt import GradientBoostedDecisionTree\n", "from 
numpy_ml.trees.dt import DecisionTree, Node, Leaf\n", "from numpy_ml.trees.rf import RandomForest\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def clone_tree(dtree):\n", " children_left = dtree.tree_.children_left\n", " children_right = dtree.tree_.children_right\n", " feature = dtree.tree_.feature\n", " threshold = dtree.tree_.threshold\n", " values = dtree.tree_.value\n", "\n", " def grow(node_id):\n", " l, r = children_left[node_id], children_right[node_id]\n", " if l == r:\n", " return Leaf(values[node_id].argmax())\n", " n = Node(None, None, (feature[node_id], threshold[node_id]))\n", " n.left = grow(l)\n", " n.right = grow(r)\n", " return n\n", "\n", " node_id = 0\n", " root = Node(None, None, (feature[node_id], threshold[node_id]))\n", " root.left = grow(children_left[node_id])\n", " root.right = grow(children_right[node_id])\n", " return root\n", "\n", "\n", "def compare_trees(mine, gold):\n", " clone = clone_tree(gold)\n", " mine = mine.root\n", "\n", " def test(mine, clone):\n", " if isinstance(clone, Node) and isinstance(mine, Node):\n", " assert mine.feature == clone.feature, \"Node {} not equal\".format(depth)\n", " np.testing.assert_allclose(mine.threshold, clone.threshold)\n", " test(mine.left, clone.left, depth + 1)\n", " test(mine.right, clone.right, depth + 1)\n", " elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", " np.testing.assert_allclose(mine.value, clone.value)\n", " return\n", " else:\n", " raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", " depth = 0\n", " ok = True\n", " while ok:\n", " if isinstance(clone, Node) and isinstance(mine, Node):\n", " assert mine.feature == clone.feature\n", " np.testing.assert_allclose(mine.threshold, clone.threshold)\n", " test(mine.left, clone.left, depth + 1)\n", " test(mine.right, clone.right, depth + 1)\n", " elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", " np.testing.assert_allclose(mine.value, clone.value)\n", " return\n", " else:\n", " raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", "\n", "def test_gbdt(N=1):\n", " np.random.seed(12345)\n", " i = 1\n", " while i <= N:\n", " n_ex = np.random.randint(2, 100)\n", " n_feats = np.random.randint(2, 100)\n", " n_trees = np.random.randint(2, 100)\n", " max_depth = np.random.randint(1, 5)\n", "\n", " classifier = np.random.choice([True, False])\n", " if classifier:\n", " # create classification problem\n", " n_classes = np.random.randint(2, 10)\n", " X, Y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i\n", " )\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " def loss(yp, y):\n", " return 1 - accuracy_score(yp, y)\n", "\n", " # initialize model\n", " criterion = np.random.choice([\"entropy\", \"gini\"])\n", " mine = GradientBoostedDecisionTree(\n", " n_iter=n_trees,\n", " classifier=classifier,\n", " max_depth=max_depth,\n", " learning_rate=0.1,\n", " loss=\"crossentropy\",\n", " step_size=\"constant\",\n", " )\n", " gold = RandomForestClassifier(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", " else:\n", " # create regeression problem\n", " X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " criterion = \"mse\"\n", " loss = 
mean_squared_error\n", " mine = GradientBoostedDecisionTree(\n", " n_iter=n_trees,\n", " max_depth=max_depth,\n", " classifier=classifier,\n", " learning_rate=0.1,\n", " loss=\"mse\",\n", " step_size=\"constant\",\n", " )\n", " gold = RandomForestRegressor(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", "\n", " print(\"Trial {}\".format(i))\n", " print(\"\\tClassifier={}, criterion={}\".format(classifier, criterion))\n", " print(\"\\tmax_depth={}, n_feats={}, n_ex={}\".format(max_depth, n_feats, n_ex))\n", " if classifier:\n", " print(\"\\tn_classes: {}\".format(n_classes))\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " gold.fit(X, Y)\n", "\n", " # get preds\n", " y_pred_mine = mine.predict(X)\n", " y_pred_gold = gold.predict(X)\n", "\n", " loss_mine = loss(y_pred_mine, Y)\n", " loss_gold = loss(y_pred_gold, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_gold_test = gold.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_gold_test = loss(y_pred_gold_test, Y_test)\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine, loss_gold)\n", " print(\"\\tLoss on training: {}\".format(loss_mine))\n", " except AssertionError as e:\n", " print(\"\\tTraining losses not equal:\\n{}\".format(e))\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)\n", " print(\"\\tLoss on test: {}\".format(loss_mine_test))\n", " except AssertionError as e:\n", " print(\"\\tTest losses not equal:\\n{}\".format(e))\n", "\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_LayerNorm2D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs 
(prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "def test_LayerNorm2D(N=15):\n", " from numpy_ml.neural_nets.layers import LayerNorm2D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 3)\n", "\n", " # initialize LayerNorm2D layer\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", " L1 = LayerNorm2D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # standard sum loss\n", " dLdy = np.ones_like(X)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchLayerNormLayer(\n", " [n_in, in_rows, in_cols], L1.parameters, mode=\"2D\", epsilon=L1.epsilon\n", " )\n", " golds = gold_mod.extract_grads(X, Y_true=None)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (L1.hyperparameters[\"epsilon\"], \"epsilon\"),\n", " (L1.parameters[\"scaler\"], \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", " (y_pred, \"y\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", "\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_WavenetResidualModule.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = 
params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):\n", " \"\"\"\n", " Manual gradient calc for vanilla RNN parameters\n", " \"\"\"\n", " if param_name in [\"Ba\", \"Bx\"]:\n", " param_name = param_name.lower()\n", " elif param_name in [\"X\", \"y\"]:\n", " return None\n", "\n", " param_orig = model.parameters[param_name]\n", " model.flush_gradients()\n", " grads = np.zeros_like(param_orig)\n", "\n", " for flat_ix, val in enumerate(param_orig.flat):\n", " param = deepcopy(param_orig)\n", " md_ix = np.unravel_index(flat_ix, param.shape)\n", "\n", " # plus\n", " y_preds_plus = []\n", " param[md_ix] = val + epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_plus = model.forward(X[:, :, t])\n", " y_preds_plus += [y_pred_plus]\n", " loss_plus = loss_func(y_preds_plus)\n", " model.flush_gradients()\n", "\n", " # minus\n", " y_preds_minus = []\n", " param[md_ix] = val - epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_minus = model.forward(X[:, :, t])\n", " y_preds_minus += [y_pred_minus]\n", " loss_minus = loss_func(y_preds_minus)\n", " model.flush_gradients()\n", "\n", " grad = (loss_plus - loss_minus) / (2 * epsilon)\n", " grads[md_ix] = grad\n", " return grads.T\n", "\n", "\n", "#######################################################################\n", "# Modules #\n", "#######################################################################\n", "\n", "\n", "\n", "\n", "def test_WaveNetModule(N=10):\n", " from numpy_ml.neural_nets.modules import WavenetResidualModule\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " l_in = np.random.randint(1, 10)\n", " ch_residual, ch_dilation = np.random.randint(1, 5), np.random.randint(1, 5)\n", " f_width = min(l_in, np.random.randint(1, 5))\n", " d = np.random.randint(0, 5)\n", "\n", " X_main = np.zeros_like(\n", " random_tensor((n_ex, l_in, ch_residual), standardize=True)\n", " )\n", " X_main[0][0][0] = 1.0\n", " X_skip = np.zeros_like(\n", " random_tensor((n_ex, l_in, ch_residual), standardize=True)\n", " )\n", "\n", " # initialize Conv2D layer\n", " L1 = WavenetResidualModule(\n", " ch_residual=ch_residual,\n", " ch_dilation=ch_dilation,\n", " kernel_width=f_width,\n", " dilation=d,\n", " )\n", "\n", " # forward prop\n", " Y_main, Y_skip = L1.forward(X_main, X_skip)\n", "\n", " # backprop\n", " dLdY_skip = np.ones_like(Y_skip)\n", " dLdY_main = np.ones_like(Y_main)\n", " dLdX_main, dLdX_skip = L1.backward(dLdY_skip, dLdY_main)\n", "\n", " _, conv_1x1_pad = pad1D(\n", " L1._dv[\"multiply_gate_out\"], \"same\", kernel_width=1, stride=1, dilation=0\n", " )\n", " if conv_1x1_pad[0] != conv_1x1_pad[1]:\n", " print(\"Skipping\")\n", " continue\n", "\n", " conv_1x1_pad = conv_1x1_pad[0]\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchWavenetModule(L1.parameters, L1.hyperparameters, conv_1x1_pad)\n", " 
golds = gold_mod.extract_grads(X_main, X_skip)\n", "\n", " dv = L1.derived_variables\n", " pc = L1.parameters[\"components\"]\n", " gr = L1.gradients[\"components\"]\n", "\n", " params = [\n", " (L1.X_main, \"X_main\"),\n", " (L1.X_skip, \"X_skip\"),\n", " (pc[\"conv_dilation\"][\"W\"], \"conv_dilation_W\"),\n", " (pc[\"conv_dilation\"][\"b\"], \"conv_dilation_b\"),\n", " (pc[\"conv_1x1\"][\"W\"], \"conv_1x1_W\"),\n", " (pc[\"conv_1x1\"][\"b\"], \"conv_1x1_b\"),\n", " (dv[\"conv_dilation_out\"], \"conv_dilation_out\"),\n", " (dv[\"tanh_out\"], \"tanh_out\"),\n", " (dv[\"sigm_out\"], \"sigm_out\"),\n", " (dv[\"multiply_gate_out\"], \"multiply_gate_out\"),\n", " (dv[\"conv_1x1_out\"], \"conv_1x1_out\"),\n", " (Y_main, \"Y_main\"),\n", " (Y_skip, \"Y_skip\"),\n", " (dLdY_skip, \"dLdY_skip\"),\n", " (dLdY_main, \"dLdY_main\"),\n", " (dv[\"dLdConv_1x1\"], \"dLdConv_1x1_out\"),\n", " (gr[\"conv_1x1\"][\"W\"], \"dLdConv_1x1_W\"),\n", " (gr[\"conv_1x1\"][\"b\"], \"dLdConv_1x1_b\"),\n", " (dv[\"dLdMultiply\"], \"dLdMultiply_out\"),\n", " (dv[\"dLdTanh\"], \"dLdTanh_out\"),\n", " (dv[\"dLdSigmoid\"], \"dLdSigm_out\"),\n", " (dv[\"dLdConv_dilation\"], \"dLdConv_dilation_out\"),\n", " (gr[\"conv_dilation\"][\"W\"], \"dLdConv_dilation_W\"),\n", " (gr[\"conv_dilation\"][\"b\"], \"dLdConv_dilation_b\"),\n", " (dLdX_main, \"dLdX_main\"),\n", " (dLdX_skip, \"dLdX_skip\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"f_width={}, n_ex={}\".format(f_width, n_ex))\n", " print(\"l_in={}, ch_residual={}\".format(l_in, ch_residual))\n", " print(\"ch_dilation={} dilation={}\".format(ch_dilation, d))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Utils #\n", "#######################################################################\n", "\n"]} {"path": "numpy_ml/tests/test_Standardizer.py", "content": ["# flake8: noqa\n", "from collections import Counter\n", "\n", "# gold-standard imports\n", "import huffman\n", "import numpy as np\n", "\n", "from scipy.fftpack import dct\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# try:\n", "# from librosa.core.time_frequency import fft_frequencies\n", "# except ImportError:\n", "# # for librosa >= 0.8.0\n", "# from librosa import fft_frequencies\n", "# from librosa.feature import mfcc as lr_mfcc\n", "# from librosa.util import frame\n", "# from librosa.filters import mel\n", "\n", "# numpy-ml implementations\n", "from numpy_ml.preprocessing.general import Standardizer\n", "from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder\n", "from numpy_ml.preprocessing.dsp import (\n", " DCT,\n", " DFT,\n", " mfcc,\n", " to_frames,\n", " mel_filterbank,\n", " dft_bins,\n", ")\n", "from numpy_ml.utils.testing import random_paragraph\n", "\n", "\n", "\n", "def test_standardizer(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " mean = bool(np.random.randint(2))\n", " std = bool(np.random.randint(2))\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " X = np.random.rand(N, M)\n", "\n", " S = Standardizer(with_mean=mean, with_std=std)\n", " S.fit(X)\n", " mine = S.transform(X)\n", "\n", " theirs = StandardScaler(with_mean=mean, with_std=std)\n", " gold = 
theirs.fit_transform(X)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_DFT.py", "content": ["# flake8: noqa\n", "from collections import Counter\n", "\n", "# gold-standard imports\n", "import huffman\n", "import numpy as np\n", "\n", "from scipy.fftpack import dct\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# try:\n", "# from librosa.core.time_frequency import fft_frequencies\n", "# except ImportError:\n", "# # for librosa >= 0.8.0\n", "# from librosa import fft_frequencies\n", "# from librosa.feature import mfcc as lr_mfcc\n", "# from librosa.util import frame\n", "# from librosa.filters import mel\n", "\n", "# numpy-ml implementations\n", "from numpy_ml.preprocessing.general import Standardizer\n", "from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder\n", "from numpy_ml.preprocessing.dsp import (\n", " DCT,\n", " DFT,\n", " mfcc,\n", " to_frames,\n", " mel_filterbank,\n", " dft_bins,\n", ")\n", "from numpy_ml.utils.testing import random_paragraph\n", "\n", "\n", "def test_dft(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " signal = np.random.rand(N)\n", " mine = DFT(signal)\n", " theirs = np.fft.rfft(signal)\n", "\n", " np.testing.assert_almost_equal(mine.real, theirs.real)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_euclidean.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def test_euclidean(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = euclidean(x, y)\n", " theirs = scipy.spatial.distance.euclidean(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_LayerNorm1D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " 
torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "\n", "def test_LayerNorm1D(N=15):\n", " from numpy_ml.neural_nets.layers import LayerNorm1D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 1000)\n", " n_in = np.random.randint(1, 1000)\n", " X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", " # initialize BatchNorm1D layer\n", " L1 = LayerNorm1D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchLayerNormLayer(n_in, L1.parameters, \"1D\", epsilon=L1.epsilon)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"scaler\"].T, \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_FullyConnected.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " 
TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def test_FullyConnected(N=15):\n", " from numpy_ml.neural_nets.layers import FullyConnected\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " n_out = np.random.randint(1, 100)\n", " X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize FC layer\n", " L1 = FullyConnected(n_out=n_out, act_fn=act_fn)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchFCLayer(n_in, n_out, torch_fn, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"].T, \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (dLdy, \"dLdy\"),\n", " (L1.gradients[\"W\"].T, \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\\nact_fn={}\".format(i, act_fn_name))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_LinearRegression.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.linear_model import LinearRegression as LinearRegressionGold\n", "\n", "from numpy_ml.linear_models import LinearRegression\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def test_linear_regression(N=10):\n", " np.random.seed(12345)\n", " N = np.inf if 
N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " train_samples = np.random.randint(2, 30)\n", " update_samples = np.random.randint(1, 30)\n", " n_samples = train_samples + update_samples\n", "\n", " # ensure n_feats < train_samples, otherwise multiple solutions are\n", " # possible\n", " n_feats = np.random.randint(1, train_samples)\n", " target_dim = np.random.randint(1, 10)\n", "\n", " fit_intercept = np.random.choice([True, False])\n", "\n", " X = random_tensor((n_samples, n_feats), standardize=True)\n", " y = random_tensor((n_samples, target_dim), standardize=True)\n", "\n", " weighted = np.random.choice([True, False])\n", " weights = np.random.rand(n_samples) if weighted else np.ones(n_samples)\n", "\n", " X_train, X_update = X[:train_samples], X[train_samples:]\n", " y_train, y_update = y[:train_samples], y[train_samples:]\n", " w_train, w_update = weights[:train_samples], weights[train_samples:]\n", "\n", " print(f\"Weights: {weighted}\")\n", " print(f\"Fit intercept: {fit_intercept}\")\n", "\n", " # Fit gold standard model on the entire dataset\n", " lr_gold = LinearRegressionGold(fit_intercept=fit_intercept)\n", " lr_gold.fit(X, y, sample_weight=weights)\n", "\n", " lr_mine = LinearRegression(fit_intercept=fit_intercept)\n", " lr_mine.fit(X, y, weights=weights)\n", "\n", " # check that model predictions match\n", " np.testing.assert_almost_equal(\n", " lr_mine.predict(X), lr_gold.predict(X), decimal=5\n", " )\n", " print(\"\\t1. Overall model predictions match\")\n", "\n", " # check that model coefficients match\n", " beta = lr_mine.beta.T[:, 1:] if fit_intercept else lr_mine.beta.T\n", " np.testing.assert_almost_equal(beta, lr_gold.coef_, decimal=6)\n", " print(\"\\t2. Overall model coefficients match\")\n", "\n", " # Fit our model on just (X_train, y_train)...\n", " lr = LinearRegression(fit_intercept=fit_intercept)\n", " lr.fit(X_train, y_train, weights=w_train)\n", "\n", " do_single_sample_update = np.random.choice([True, False])\n", "\n", " # ...then update our model on the examples (X_update, y_update)\n", " if do_single_sample_update:\n", " for x_new, y_new, w_new in zip(X_update, y_update, w_update):\n", " lr.update(x_new, y_new, w_new)\n", " else:\n", " lr.update(X_update, y_update, w_update)\n", "\n", " # check that model predictions match\n", " np.testing.assert_almost_equal(lr.predict(X), lr_gold.predict(X), decimal=5)\n", " print(\"\\t3. Iterative model predictions match\")\n", "\n", " # check that model coefficients match\n", " beta = lr.beta.T[:, 1:] if fit_intercept else lr.beta.T\n", " np.testing.assert_almost_equal(beta, lr_gold.coef_, decimal=6)\n", " print(\"\\t4. 
Iterative model coefficients match\")\n", "\n", " print(\"\\tPASSED\\n\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/__init__.py", "content": ["\"\"\"Unit tests for various numpy-ml modules\"\"\"\n"]} {"path": "numpy_ml/tests/test_LinearKernel.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "#######################################################################\n", "# Kernels #\n", "#######################################################################\n", "\n", "\n", "def test_linear_kernel(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " M = np.random.randint(1, 100)\n", " C = np.random.randint(1, 1000)\n", "\n", " X = np.random.rand(N, C)\n", " Y = np.random.rand(M, C)\n", "\n", " mine = LinearKernel()(X, Y)\n", " gold = sk_linear(X, Y)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_Add.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs 
(prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "def test_AddLayer(N=15):\n", " from numpy_ml.neural_nets.layers import Add\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " Xs = []\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " n_entries = np.random.randint(2, 5)\n", " for _ in range(n_entries):\n", " Xs.append(random_tensor((n_ex, n_in), standardize=True))\n", "\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Add layer\n", " L1 = Add(act_fn)\n", "\n", " # forward prop\n", " y_pred = L1.forward(Xs)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdXs = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchAddLayer(torch_fn)\n", " golds = gold_mod.extract_grads(Xs)\n", "\n", " params = [(Xs, \"Xs\"), (y_pred, \"Y\")]\n", " params.extend(\n", " [(dldxi, \"dLdX{}\".format(i + 1)) for i, dldxi in enumerate(dLdXs)]\n", " )\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"n_ex={}, n_in={}\".format(n_ex, n_in))\n", " print(\"n_entries={}, act_fn={}\".format(n_entries, str(act_fn)))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_GELU.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite 
#\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "def test_gelu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import GELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " approx = np.random.choice([True, False])\n", "\n", " mine = GELU(approximate=False)\n", " mine_approx = GELU(approximate=True)\n", " gold = lambda z: F.gelu(torch.FloatTensor(z)).numpy()\n", " np.testing.assert_allclose(mine.fn(z), gold(z), rtol=1e-3)\n", " assert_almost_equal(mine.fn(z), mine_approx.fn(z))\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "\n", "def test_gelu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import GELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = GELU(approximate=False)\n", " mine_approx = GELU(approximate=True)\n", " gold = torch_gradient_generator(F.gelu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=3)\n", " assert_almost_equal(mine.grad(z), mine_approx.grad(z))\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_DCT.py", "content": ["# flake8: noqa\n", "from collections import Counter\n", "\n", "# gold-standard imports\n", "import huffman\n", "import numpy as np\n", "\n", "from scipy.fftpack import dct\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# try:\n", "# from librosa.core.time_frequency import fft_frequencies\n", "# except ImportError:\n", "# # for librosa >= 0.8.0\n", "# from librosa import fft_frequencies\n", "# from librosa.feature import mfcc as lr_mfcc\n", "# from librosa.util import frame\n", "# from librosa.filters import mel\n", "\n", "# numpy-ml implementations\n", "from numpy_ml.preprocessing.general import Standardizer\n", "from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder\n", "from numpy_ml.preprocessing.dsp import (\n", " DCT,\n", " DFT,\n", " mfcc,\n", " to_frames,\n", " mel_filterbank,\n", " dft_bins,\n", ")\n", "from 
numpy_ml.utils.testing import random_paragraph\n", "\n", "\n", "def test_dct(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " signal = np.random.rand(N)\n", " ortho = bool(np.random.randint(2))\n", " mine = DCT(signal, orthonormal=ortho)\n", " theirs = dct(signal, norm=\"ortho\" if ortho else None)\n", "\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_minkowski.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def test_minkowski(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " p = 1 + np.random.rand() * 10\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = minkowski(x, y, p)\n", " theirs = scipy.spatial.distance.minkowski(x, y, p)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_BidirectionalLSTM.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 
0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):\n", " \"\"\"\n", " Manual gradient calc for vanilla RNN parameters\n", " \"\"\"\n", " if param_name in [\"Ba\", \"Bx\"]:\n", " param_name = param_name.lower()\n", " elif param_name in [\"X\", \"y\"]:\n", " return None\n", "\n", " param_orig = model.parameters[param_name]\n", " model.flush_gradients()\n", " grads = np.zeros_like(param_orig)\n", "\n", " for flat_ix, val in enumerate(param_orig.flat):\n", " param = deepcopy(param_orig)\n", " md_ix = np.unravel_index(flat_ix, param.shape)\n", "\n", " # plus\n", " y_preds_plus = []\n", " param[md_ix] = val + epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_plus = model.forward(X[:, :, t])\n", " y_preds_plus += [y_pred_plus]\n", " loss_plus = loss_func(y_preds_plus)\n", " model.flush_gradients()\n", "\n", " # minus\n", " y_preds_minus = []\n", " param[md_ix] = val - epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_minus = model.forward(X[:, :, t])\n", " y_preds_minus += [y_pred_minus]\n", " loss_minus = loss_func(y_preds_minus)\n", " model.flush_gradients()\n", "\n", " grad = (loss_plus - loss_minus) / (2 * epsilon)\n", " grads[md_ix] = grad\n", " return grads.T\n", "\n", "\n", "#######################################################################\n", "# Modules #\n", "#######################################################################\n", "\n", "\n", "\n", "\n", "def test_BidirectionalLSTM(N=15):\n", " from numpy_ml.neural_nets.modules import BidirectionalLSTM\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 10)\n", " n_out = np.random.randint(1, 10)\n", " n_t = np.random.randint(1, 10)\n", " X = random_tensor((n_ex, n_in, n_t), standardize=True)\n", "\n", " # initialize LSTM layer\n", " L1 = BidirectionalLSTM(n_out=n_out)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdA = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdA)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchBidirectionalLSTM(n_in, n_out, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " pms, grads = L1.parameters[\"components\"], L1.gradients[\"components\"]\n", " params = [\n", " (X, \"X\"),\n", " (y_pred, \"y\"),\n", " (pms[\"cell_fwd\"][\"bo\"].T, \"bo_f\"),\n", " (pms[\"cell_fwd\"][\"bu\"].T, \"bu_f\"),\n", " (pms[\"cell_fwd\"][\"bf\"].T, \"bf_f\"),\n", " (pms[\"cell_fwd\"][\"bc\"].T, \"bc_f\"),\n", " (pms[\"cell_fwd\"][\"Wo\"], \"Wo_f\"),\n", " (pms[\"cell_fwd\"][\"Wu\"], \"Wu_f\"),\n", " (pms[\"cell_fwd\"][\"Wf\"], \"Wf_f\"),\n", " (pms[\"cell_fwd\"][\"Wc\"], \"Wc_f\"),\n", " (pms[\"cell_bwd\"][\"bo\"].T, \"bo_b\"),\n", " (pms[\"cell_bwd\"][\"bu\"].T, \"bu_b\"),\n", " (pms[\"cell_bwd\"][\"bf\"].T, \"bf_b\"),\n", " (pms[\"cell_bwd\"][\"bc\"].T, \"bc_b\"),\n", " (pms[\"cell_bwd\"][\"Wo\"], \"Wo_b\"),\n", " (pms[\"cell_bwd\"][\"Wu\"], \"Wu_b\"),\n", " (pms[\"cell_bwd\"][\"Wf\"], \"Wf_b\"),\n", " 
(pms[\"cell_bwd\"][\"Wc\"], \"Wc_b\"),\n", " (grads[\"cell_fwd\"][\"bo\"].T, \"dLdBo_f\"),\n", " (grads[\"cell_fwd\"][\"bu\"].T, \"dLdBu_f\"),\n", " (grads[\"cell_fwd\"][\"bf\"].T, \"dLdBf_f\"),\n", " (grads[\"cell_fwd\"][\"bc\"].T, \"dLdBc_f\"),\n", " (grads[\"cell_fwd\"][\"Wo\"], \"dLdWo_f\"),\n", " (grads[\"cell_fwd\"][\"Wu\"], \"dLdWu_f\"),\n", " (grads[\"cell_fwd\"][\"Wf\"], \"dLdWf_f\"),\n", " (grads[\"cell_fwd\"][\"Wc\"], \"dLdWc_f\"),\n", " (grads[\"cell_bwd\"][\"bo\"].T, \"dLdBo_b\"),\n", " (grads[\"cell_bwd\"][\"bu\"].T, \"dLdBu_b\"),\n", " (grads[\"cell_bwd\"][\"bf\"].T, \"dLdBf_b\"),\n", " (grads[\"cell_bwd\"][\"bc\"].T, \"dLdBc_b\"),\n", " (grads[\"cell_bwd\"][\"Wo\"], \"dLdWo_b\"),\n", " (grads[\"cell_bwd\"][\"Wu\"], \"dLdWu_b\"),\n", " (grads[\"cell_bwd\"][\"Wf\"], \"dLdWf_b\"),\n", " (grads[\"cell_bwd\"][\"Wc\"], \"dLdWc_b\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Case {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " atol=1e-4,\n", " rtol=1e-4,\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_Embedding.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "\n", "def test_Embedding(N=15):\n", " from numpy_ml.neural_nets.layers import Embedding\n", "\n", " np.random.seed(12345)\n", 
"\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " vocab_size = np.random.randint(1, 2000)\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " emb_dim = np.random.randint(1, 100)\n", "\n", " X = np.random.randint(0, vocab_size, (n_ex, n_in))\n", "\n", " # initialize Embedding layer\n", " L1 = Embedding(n_out=emb_dim, vocab_size=vocab_size)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " # dLdX = L1.backward(dLdy)\n", " L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchEmbeddingLayer(vocab_size, emb_dim, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (dLdy, \"dLdy\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " # (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_BallTree.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def test_ball_tree(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " k = np.random.randint(1, N)\n", " ls = np.min([np.random.randint(1, 10), N - 1])\n", "\n", " X = np.random.rand(N, M)\n", " BT = BallTree(leaf_size=ls, metric=euclidean)\n", " BT.fit(X)\n", "\n", " x = np.random.rand(M)\n", " mine = BT.nearest_neighbors(k, x)\n", " assert len(mine) == k\n", "\n", " mine_neighb = np.array([n.key for n in mine])\n", " mine_dist = np.array([n.distance for n in mine])\n", "\n", " sort_ix = np.argsort(mine_dist)\n", " mine_dist = mine_dist[sort_ix]\n", " mine_neighb = mine_neighb[sort_ix]\n", "\n", " sk = sk_BallTree(X, leaf_size=ls)\n", " theirs_dist, ind = sk.query(x.reshape(1, -1), k=k)\n", " sort_ix = np.argsort(theirs_dist.flatten())\n", "\n", " theirs_dist = theirs_dist.flatten()[sort_ix]\n", " theirs_neighb = X[ind.flatten()[sort_ix]]\n", "\n", " for j in range(len(theirs_dist)):\n", " np.testing.assert_almost_equal(mine_neighb[j], theirs_neighb[j])\n", " np.testing.assert_almost_equal(mine_dist[j], theirs_dist[j])\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_RNNCell.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing 
sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "\n", "def test_RNNCell(N=15):\n", " from numpy_ml.neural_nets.layers import RNNCell\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 10)\n", " n_out = np.random.randint(1, 10)\n", " n_t = np.random.randint(1, 10)\n", " X = random_tensor((n_ex, n_in, n_t), standardize=True)\n", "\n", " # initialize RNN layer\n", " L1 = RNNCell(n_out=n_out)\n", "\n", " # forward prop\n", " y_preds = []\n", " for t in range(n_t):\n", " y_pred = L1.forward(X[:, :, t])\n", " y_preds += [y_pred]\n", "\n", " # backprop\n", " dLdX = []\n", " dLdAt = np.ones_like(y_preds[t])\n", " for t in reversed(range(n_t)):\n", " dLdXt = L1.backward(dLdAt)\n", " dLdX.insert(0, dLdXt)\n", " dLdX = np.dstack(dLdX)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchRNNCell(n_in, n_out, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (X, \"X\"),\n", " (np.array(y_preds), \"y\"),\n", " (L1.parameters[\"ba\"].T, \"ba\"),\n", " (L1.parameters[\"bx\"].T, \"bx\"),\n", " (L1.parameters[\"Wax\"].T, \"Wax\"),\n", " (L1.parameters[\"Waa\"].T, \"Waa\"),\n", " (L1.gradients[\"ba\"].T, \"dLdBa\"),\n", " (L1.gradients[\"bx\"].T, \"dLdBx\"),\n", " (L1.gradients[\"Wax\"].T, \"dLdWax\"),\n", " (L1.gradients[\"Waa\"].T, \"dLdWaa\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " 
atol=1e-3,\n", " rtol=1e-3,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_ELU.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "\n", "def test_elu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import ELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 10)\n", " z = random_tensor((1, n_dims))\n", "\n", " alpha = np.random.uniform(0, 10)\n", "\n", " mine = ELU(alpha)\n", " gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy()\n", "\n", " assert_almost_equal(mine.fn(z), gold(z, alpha))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_elu_grad(N=50):\n", " from 
numpy_ml.neural_nets.activations import ELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " n_dims = np.random.randint(1, 10)\n", " alpha = np.random.uniform(0, 10)\n", " z = random_tensor((n_ex, n_dims))\n", "\n", " mine = ELU(alpha)\n", " gold = torch_gradient_generator(F.elu, alpha=alpha)\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=6)\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_SoftPlus.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "def test_softplus_activation(N=50):\n", " from numpy_ml.neural_nets.activations import SoftPlus\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SoftPlus()\n", " gold = lambda z: 
F.softplus(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "def test_softplus_grad(N=50):\n", " from numpy_ml.neural_nets.activations import SoftPlus\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SoftPlus()\n", " gold = torch_gradient_generator(F.softplus)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims), standardize=True)\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_ReLU.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", 
"#######################################################################\n", "\n", "\n", "def test_relu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import ReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = ReLU()\n", " gold = lambda z: F.relu(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_relu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import ReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = ReLU()\n", " gold = torch_gradient_generator(F.relu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_AdditiveNGram.py", "content": ["# flake8: noqa\n", "import tempfile\n", "\n", "import nltk\n", "import numpy as np\n", "\n", "from ..preprocessing.nlp import tokenize_words\n", "from ..ngram import AdditiveNGram, MLENGram\n", "from ..utils.testing import random_paragraph\n", "\n", "\n", "class MLEGold:\n", " def __init__(\n", " self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True\n", " ):\n", " self.N = N\n", " self.K = K\n", " self.unk = unk\n", " self.filter_stopwords = filter_stopwords\n", " self.filter_punctuation = filter_punctuation\n", "\n", " self.hyperparameters = {\n", " \"N\": N,\n", " \"K\": K,\n", " \"unk\": unk,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " }\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " N = self.N\n", " H = self.hyperparameters\n", " models, counts = {}, {}\n", " grams = {n: [] for n in range(1, N + 1)}\n", " gg = {n: [] for n in range(1, N + 1)}\n", " filter_punc, filter_stop = H[\"filter_punctuation\"], H[\"filter_stopwords\"]\n", "\n", " n_words = 0\n", " tokens = set([])\n", "\n", " with open(corpus_fp, \"r\", encoding=encoding) as text:\n", " for line in text:\n", " words = tokenize_words(line, filter_punc, filter_stop)\n", "\n", " if vocab is not None:\n", " words = vocab.filter(words, H[\"unk\"])\n", "\n", " if len(words) == 0:\n", " continue\n", "\n", " n_words += len(words)\n", " tokens.update(words)\n", "\n", " # calculate n, n-1, ... 
1-grams\n", "                for n in range(1, N + 1):\n", "                    grams[n].append(\n", "                        nltk.ngrams(\n", "                            words,\n", "                            n,\n", "                            pad_left=True,\n", "                            pad_right=True,\n", "                            left_pad_symbol=\"<bol>\",\n", "                            right_pad_symbol=\"<eol>\",\n", "                        )\n", "                    )\n", "\n", "                    gg[n].extend(\n", "                        list(\n", "                            nltk.ngrams(\n", "                                words,\n", "                                n,\n", "                                pad_left=True,\n", "                                pad_right=True,\n", "                                left_pad_symbol=\"<bol>\",\n", "                                right_pad_symbol=\"<eol>\",\n", "                            )\n", "                        )\n", "                    )\n", "\n", "        for n in range(1, N + 1):\n", "            counts[n] = nltk.FreqDist(gg[n])\n", "            models[n] = nltk.lm.MLE(order=n)\n", "            models[n].fit(grams[n], tokens)\n", "\n", "        self.counts = counts\n", "        self.n_words = n_words\n", "        self._models = models\n", "        self.n_tokens = len(vocab) if vocab is not None else len(tokens)\n", "\n", "    def log_prob(self, words, N):\n", "        assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", "        if N > len(words):\n", "            err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", "            raise ValueError(err)\n", "\n", "        total_prob = 0\n", "        for ngram in nltk.ngrams(words, N):\n", "            total_prob += self._log_ngram_prob(ngram)\n", "        return total_prob\n", "\n", "    def _log_ngram_prob(self, ngram):\n", "        N = len(ngram)\n", "        return self._models[N].logscore(ngram[-1], ngram[:-1])\n", "\n", "\n", "class AdditiveGold:\n", "    def __init__(\n", "        self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True\n", "    ):\n", "        self.N = N\n", "        self.K = K\n", "        self.unk = unk\n", "        self.filter_stopwords = filter_stopwords\n", "        self.filter_punctuation = filter_punctuation\n", "\n", "        self.hyperparameters = {\n", "            \"N\": N,\n", "            \"K\": K,\n", "            \"unk\": unk,\n", "            \"filter_stopwords\": filter_stopwords,\n", "            \"filter_punctuation\": filter_punctuation,\n", "        }\n", "\n", "    def train(self, corpus_fp, vocab=None, encoding=None):\n", "        N = self.N\n", "        H = self.hyperparameters\n", "        models, counts = {}, {}\n", "        grams = {n: [] for n in range(1, N + 1)}\n", "        gg = {n: [] for n in range(1, N + 1)}\n", "        filter_punc, filter_stop = H[\"filter_punctuation\"], H[\"filter_stopwords\"]\n", "\n", "        n_words = 0\n", "        tokens = set()\n", "\n", "        with open(corpus_fp, \"r\", encoding=encoding) as text:\n", "            for line in text:\n", "                words = tokenize_words(line, filter_punc, filter_stop)\n", "\n", "                if vocab is not None:\n", "                    words = vocab.filter(words, H[\"unk\"])\n", "\n", "                if len(words) == 0:\n", "                    continue\n", "\n", "                n_words += len(words)\n", "                tokens.update(words)\n", "\n",
"                # calculate n, n-1, ... 1-grams\n", "                for n in range(1, N + 1):\n", "                    grams[n].append(\n", "                        nltk.ngrams(\n", "                            words,\n", "                            n,\n", "                            pad_left=True,\n", "                            pad_right=True,\n", "                            left_pad_symbol=\"<bol>\",\n", "                            right_pad_symbol=\"<eol>\",\n", "                        )\n", "                    )\n", "\n", "                    gg[n].extend(\n", "                        list(\n", "                            nltk.ngrams(\n", "                                words,\n", "                                n,\n", "                                pad_left=True,\n", "                                pad_right=True,\n", "                                left_pad_symbol=\"<bol>\",\n", "                                right_pad_symbol=\"<eol>\",\n", "                            )\n", "                        )\n", "                    )\n", "\n", "        for n in range(1, N + 1):\n", "            counts[n] = nltk.FreqDist(gg[n])\n", "            models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)\n", "            models[n].fit(grams[n], tokens)\n", "\n", "        self.counts = counts\n", "        self._models = models\n", "        self.n_words = n_words\n", "        self.n_tokens = len(vocab) if vocab is not None else len(tokens)\n", "\n", "    def log_prob(self, words, N):\n", "        assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", "        if N > len(words):\n", "            err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", "            raise ValueError(err)\n", "\n", "        total_prob = 0\n", "        for ngram in nltk.ngrams(words, N):\n", "            total_prob += self._log_ngram_prob(ngram)\n", "        return total_prob\n", "\n", "    def _log_ngram_prob(self, ngram):\n", "        N = len(ngram)\n", "        return self._models[N].logscore(ngram[-1], ngram[:-1])\n", "\n", "def test_additive():\n", "    K = np.random.rand()\n", "    N = np.random.randint(2, 5)\n", "    gold = AdditiveGold(\n", "        N, K, unk=True, filter_stopwords=False, filter_punctuation=False\n", "    )\n", "    mine = AdditiveNGram(\n", "        N, K, unk=True, filter_stopwords=False, filter_punctuation=False\n", "    )\n", "\n", "    with tempfile.NamedTemporaryFile() as temp:\n", "        temp.write(bytes(\" \".join(random_paragraph(1000)), encoding=\"utf-8-sig\"))\n", "        gold.train(temp.name, encoding=\"utf-8-sig\")\n", "        mine.train(temp.name, encoding=\"utf-8-sig\")\n", "\n", "    for k in mine.counts[N].keys():\n", "        if k[0] == k[1] and k[0] in (\"<bol>\", \"<eol>\"):\n", "            continue\n", "\n", "        err_str = \"{}, mine: {}, gold: {}\"\n", "        assert mine.counts[N][k] == gold.counts[N][k], err_str.format(\n", "            k, mine.counts[N][k], gold.counts[N][k]\n", "        )\n", "\n", "        M = mine.log_prob(k, N)\n", "        G = gold.log_prob(k, N) / np.log2(np.e)  # convert to log base e\n", "        np.testing.assert_allclose(M, G)\n", "    print(\"PASSED\")\n"]} {"path": "numpy_ml/tests/test_DotProductAttention.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", "    calc_pad_dims_2D,\n", "    conv2D_naive,\n", "    conv2D,\n", "    pad2D,\n", "    pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", "    random_one_hot_matrix,\n", "    random_stochastic_matrix,\n", "    random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", "    # TFNCELoss,\n", "    # WGAN_GP_tf,\n", "    torch_xe_grad,\n", "    torch_mse_grad,\n", "    TorchVAELoss,\n", "    TorchFCLayer,\n", "    TorchRNNCell,\n", "    TorchLSTMCell,\n", "    TorchAddLayer,\n", "    TorchWGANGPLoss,\n", "    TorchConv1DLayer,\n", "    TorchConv2DLayer,\n", "    TorchPool2DLayer,\n", "    TorchWavenetModule,\n", "    TorchMultiplyLayer,\n", "    TorchDeconv2DLayer,\n", "    TorchLayerNormLayer,\n", "    TorchBatchNormLayer,\n", "    TorchEmbeddingLayer,\n", "    TorchLinearActivation,\n", "    TorchSDPAttentionLayer,\n", "    
TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "def test_DPAttention(N=15):\n", " from numpy_ml.neural_nets.layers import DotProductAttention\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " d_k = np.random.randint(1, 100)\n", " d_v = np.random.randint(1, 100)\n", "\n", " Q = random_tensor((n_ex, d_k), standardize=True)\n", " K = random_tensor((n_ex, d_k), standardize=True)\n", " V = random_tensor((n_ex, d_v), standardize=True)\n", "\n", " # initialize DotProductAttention layer\n", " mine = DotProductAttention(scale=True, dropout_p=0)\n", "\n", " # forward prop\n", " y_pred = mine.forward(Q, K, V)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdQ, dLdK, dLdV = mine.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchSDPAttentionLayer()\n", " golds = gold_mod.extract_grads(Q, K, V)\n", "\n", " params = [\n", " (mine.X[0][0], \"Q\"),\n", " (mine.X[0][1], \"K\"),\n", " (mine.X[0][2], \"V\"),\n", " (y_pred, \"Y\"),\n", " (dLdV, \"dLdV\"),\n", " (dLdK, \"dLdK\"),\n", " (dLdQ, \"dLdQ\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"n_ex={} d_k={} d_v={}\".format(n_ex, d_k, d_v))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_SquaredError.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " 
TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Loss Functions #\n", "#######################################################################\n", "\n", "\n", "def test_squared_error(N=15):\n", " from numpy_ml.neural_nets.losses import SquaredError\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SquaredError()\n", " gold = (\n", " lambda y, y_pred: mean_squared_error(y, y_pred)\n", " * y_pred.shape[0]\n", " * y_pred.shape[1]\n", " * 0.5\n", " )\n", "\n", " # ensure we get 0 when the two arrays are equal\n", " n_dims = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = y_pred = random_tensor((n_examples, n_dims))\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred))\n", " print(\"PASSED\")\n", "\n", " i = 1\n", " while i < N:\n", " n_dims = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = random_tensor((n_examples, n_dims))\n", " y_pred = random_tensor((n_examples, n_dims))\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred), decimal=5)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "def test_squared_error_grad(N=15):\n", " from numpy_ml.neural_nets.losses import SquaredError\n", " from numpy_ml.neural_nets.activations import Tanh\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SquaredError()\n", " gold = torch_mse_grad\n", " act = Tanh()\n", "\n", " i = 1\n", " while i < N:\n", " n_dims = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = random_tensor((n_examples, n_dims))\n", "\n", " # raw inputs\n", " z = random_tensor((n_examples, n_dims))\n", " y_pred = act.fn(z)\n", "\n", " assert_almost_equal(\n", " mine.grad(y, y_pred, z, act), 0.5 * gold(y, z, torch.tanh), decimal=4\n", " )\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_SkipConnectionConvModule.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " 
pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):\n", " \"\"\"\n", " Manual gradient calc for vanilla RNN parameters\n", " \"\"\"\n", " if param_name in [\"Ba\", \"Bx\"]:\n", " param_name = param_name.lower()\n", " elif param_name in [\"X\", \"y\"]:\n", " return None\n", "\n", " param_orig = model.parameters[param_name]\n", " model.flush_gradients()\n", " grads = np.zeros_like(param_orig)\n", "\n", " for flat_ix, val in enumerate(param_orig.flat):\n", " param = deepcopy(param_orig)\n", " md_ix = np.unravel_index(flat_ix, param.shape)\n", "\n", " # plus\n", " y_preds_plus = []\n", " param[md_ix] = val + epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_plus = model.forward(X[:, :, t])\n", " y_preds_plus += [y_pred_plus]\n", " loss_plus = loss_func(y_preds_plus)\n", " model.flush_gradients()\n", "\n", " # minus\n", " y_preds_minus = []\n", " param[md_ix] = val - epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_minus = model.forward(X[:, :, t])\n", " y_preds_minus += [y_pred_minus]\n", " loss_minus = loss_func(y_preds_minus)\n", " model.flush_gradients()\n", "\n", " grad = (loss_plus - loss_minus) / (2 * epsilon)\n", " grads[md_ix] = grad\n", " return grads.T\n", "\n", "\n", "#######################################################################\n", "# Modules #\n", "#######################################################################\n", "\n", "\n", "\n", "\n", "def test_SkipConnectionConvModule(N=15):\n", " from numpy_ml.neural_nets.modules import SkipConnectionConvModule\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), 
nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(2, 10)\n", " in_cols = np.random.randint(2, 10)\n", " n_in = np.random.randint(2, 5)\n", " n_out1 = np.random.randint(2, 5)\n", " n_out2 = np.random.randint(2, 5)\n", " f_shape1 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " f_shape2 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " f_shape_skip = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", "\n", " s1 = np.random.randint(1, 5)\n", " s2 = np.random.randint(1, 5)\n", " s_skip = np.random.randint(1, 5)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " p1 = (np.random.randint(1, 5), np.random.randint(1, 5))\n", " p2 = (np.random.randint(1, 5), np.random.randint(1, 5))\n", "\n", " # initialize SkipConnectionConv module\n", " L1 = SkipConnectionConvModule(\n", " out_ch1=n_out1,\n", " out_ch2=n_out2,\n", " kernel_shape1=f_shape1,\n", " kernel_shape2=f_shape2,\n", " kernel_shape_skip=f_shape_skip,\n", " stride1=s1,\n", " stride2=s2,\n", " stride_skip=s_skip,\n", " pad1=p1,\n", " pad2=p2,\n", " act_fn=act_fn,\n", " epsilon=1e-5,\n", " momentum=0.9,\n", " )\n", "\n", " # forward prop\n", " try:\n", " y_pred = L1.forward(X)\n", " except (ValueError, AssertionError):\n", " print(\"Invalid padding; Retrying\")\n", " continue\n", "\n", " ps = L1.hyperparameters[\"pad_skip\"]\n", " if ps[0] != ps[1] or ps[2] != ps[3]:\n", " continue\n", " pad_skip = (ps[0], ps[2])\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchSkipConnectionConv(\n", " torch_fn,\n", " p1,\n", " p2,\n", " pad_skip,\n", " L1.parameters,\n", " L1.hyperparameters,\n", " momentum=L1.momentum,\n", " epsilon=L1.epsilon,\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = L1.parameters[\"components\"]\n", " grads = L1.gradients[\"components\"]\n", " params = [\n", " (X, \"X\"),\n", " (params[\"conv1\"][\"W\"], \"conv1_W\"),\n", " (params[\"conv1\"][\"b\"], \"conv1_b\"),\n", " (params[\"batchnorm1\"][\"scaler\"].T, \"bn1_scaler\"),\n", " (params[\"batchnorm1\"][\"intercept\"], \"bn1_intercept\"),\n", " (params[\"batchnorm1\"][\"running_mean\"], \"bn1_running_mean\"),\n", " # (params[\"batchnorm1\"][\"running_var\"], \"bn1_running_var\"),\n", " (params[\"conv2\"][\"W\"], \"conv2_W\"),\n", " (params[\"conv2\"][\"b\"], \"conv2_b\"),\n", " (params[\"batchnorm2\"][\"scaler\"].T, \"bn2_scaler\"),\n", " (params[\"batchnorm2\"][\"intercept\"], \"bn2_intercept\"),\n", " (params[\"batchnorm2\"][\"running_mean\"], \"bn2_running_mean\"),\n", " # (params[\"batchnorm2\"][\"running_var\"], \"bn2_running_var\"),\n", " (params[\"conv_skip\"][\"W\"], \"conv_skip_W\"),\n", " (params[\"conv_skip\"][\"b\"], \"conv_skip_b\"),\n", " (params[\"batchnorm_skip\"][\"scaler\"].T, \"bn_skip_scaler\"),\n", " (params[\"batchnorm_skip\"][\"intercept\"], \"bn_skip_intercept\"),\n", " (params[\"batchnorm_skip\"][\"running_mean\"], \"bn_skip_running_mean\"),\n", " # (params[\"batchnorm_skip\"][\"running_var\"], \"bn_skip_running_var\"),\n", " (L1._dv[\"conv1_out\"], 
\"act1_out\"),\n", " (L1._dv[\"batchnorm1_out\"], \"bn1_out\"),\n", " (L1._dv[\"conv2_out\"], \"conv2_out\"),\n", " (L1._dv[\"batchnorm2_out\"], \"bn2_out\"),\n", " (L1._dv[\"conv_skip_out\"], \"conv_skip_out\"),\n", " (L1._dv[\"batchnorm_skip_out\"], \"bn_skip_out\"),\n", " (y_pred, \"Y\"),\n", " (dLdy, \"dLdY\"),\n", " (L1.derived_variables[\"dLdBn2\"], \"dLdBn2_out\"),\n", " (L1.derived_variables[\"dLdConv2\"], \"dLdConv2_out\"),\n", " (L1.derived_variables[\"dLdBnSkip\"], \"dLdBnSkip_out\"),\n", " (L1.derived_variables[\"dLdConvSkip\"], \"dLdConvSkip_out\"),\n", " (L1.derived_variables[\"dLdBn1\"], \"dLdBn1_out\"),\n", " (L1.derived_variables[\"dLdConv1\"], \"dLdActFn1_out\"),\n", " (dLdX, \"dLdX\"),\n", " (grads[\"batchnorm_skip\"][\"scaler\"].T, \"dLdBnSkip_scaler\"),\n", " (grads[\"batchnorm_skip\"][\"intercept\"], \"dLdBnSkip_intercept\"),\n", " (grads[\"conv_skip\"][\"W\"], \"dLdConvSkip_W\"),\n", " (grads[\"conv_skip\"][\"b\"], \"dLdConvSkip_b\"),\n", " (grads[\"batchnorm2\"][\"scaler\"].T, \"dLdBn2_scaler\"),\n", " (grads[\"batchnorm2\"][\"intercept\"], \"dLdBn2_intercept\"),\n", " (grads[\"conv2\"][\"W\"], \"dLdConv2_W\"),\n", " (grads[\"conv2\"][\"b\"], \"dLdConv2_b\"),\n", " (grads[\"batchnorm1\"][\"scaler\"].T, \"dLdBn1_scaler\"),\n", " (grads[\"batchnorm1\"][\"intercept\"], \"dLdBn1_intercept\"),\n", " (grads[\"conv1\"][\"W\"], \"dLdConv1_W\"),\n", " (grads[\"conv1\"][\"b\"], \"dLdConv1_b\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"act_fn={}, n_ex={}\".format(act_fn, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"pad1={}, stride1={}, f_shape1={}\".format(p1, s1, f_shape1))\n", " print(\"pad2={}, stride2={}, f_shape2={}\".format(p2, s2, f_shape2))\n", " print(\"stride_skip={}, f_shape_skip={}\".format(s_skip, f_shape_skip))\n", " warn_str = (\n", " \"\\n[NOTE] The tests in this module can fail sometimes during \"\n", " \"backprop due to the ReLU issue: while the difference in the forward pass \"\n", " \"between z=-1e-9 and z=1e-9 is miniscule, the difference during the backward \"\n", " \"pass is significant due to ReLU's kink about 0.\"\n", " )\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix, warn_str),\n", " decimal=2,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_HuffmanEncoder.py", "content": ["# flake8: noqa\n", "from collections import Counter\n", "\n", "# gold-standard imports\n", "import huffman\n", "import numpy as np\n", "\n", "from scipy.fftpack import dct\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# try:\n", "# from librosa.core.time_frequency import fft_frequencies\n", "# except ImportError:\n", "# # for librosa >= 0.8.0\n", "# from librosa import fft_frequencies\n", "# from librosa.feature import mfcc as lr_mfcc\n", "# from librosa.util import frame\n", "# from librosa.filters import mel\n", "\n", "# numpy-ml implementations\n", "from numpy_ml.preprocessing.general import Standardizer\n", "from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder\n", "from numpy_ml.preprocessing.dsp import (\n", " DCT,\n", " DFT,\n", " mfcc,\n", " to_frames,\n", " mel_filterbank,\n", " dft_bins,\n", ")\n", "from numpy_ml.utils.testing import random_paragraph\n", "\n", "\n", "def test_huffman(N=15):\n", " np.random.seed(12345)\n", "\n", " i 
= 0\n", " while i < N:\n", " n_words = np.random.randint(1, 100)\n", " para = random_paragraph(n_words)\n", " HT = HuffmanEncoder()\n", " HT.fit(para)\n", " my_dict = HT._item2code\n", " their_dict = huffman.codebook(Counter(para).items())\n", "\n", " for k, v in their_dict.items():\n", " fstr = \"their_dict['{}'] = {}, but my_dict['{}'] = {}\"\n", " assert k in my_dict, \"key `{}` not in my_dict\".format(k)\n", " assert my_dict[k] == v, fstr.format(k, v, k, my_dict[k])\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_KNN.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier\n", "from sklearn.gaussian_process import GaussianProcessRegressor\n", "\n", "from numpy_ml.nonparametric.knn import KNN\n", "from numpy_ml.nonparametric.gp import GPRegression\n", "from numpy_ml.utils.distance_metrics import euclidean\n", "\n", "\n", "def test_knn_regression(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " k = np.random.randint(1, N)\n", " ls = np.min([np.random.randint(1, 10), N - 1])\n", " weights = np.random.choice([\"uniform\", \"distance\"])\n", "\n", " X = np.random.rand(N, M)\n", " X_test = np.random.rand(N, M)\n", " y = np.random.rand(N)\n", "\n", " knn = KNN(\n", " k=k, leaf_size=ls, metric=euclidean, classifier=False, weights=weights\n", " )\n", " knn.fit(X, y)\n", " preds = knn.predict(X_test)\n", "\n", " gold = KNeighborsRegressor(\n", " p=2,\n", " leaf_size=ls,\n", " n_neighbors=k,\n", " weights=weights,\n", " metric=\"minkowski\",\n", " algorithm=\"ball_tree\",\n", " )\n", " gold.fit(X, y)\n", " gold_preds = gold.predict(X_test)\n", "\n", " for mine, theirs in zip(preds, gold_preds):\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_knn_clf(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " k = np.random.randint(1, N)\n", " n_classes = np.random.randint(2, 10)\n", " ls = np.min([np.random.randint(1, 10), N - 1])\n", " weights = \"uniform\"\n", "\n", " X = np.random.rand(N, M)\n", " X_test = np.random.rand(N, M)\n", " y = np.random.randint(0, n_classes, size=N)\n", "\n", " knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True, weights=weights)\n", " knn.fit(X, y)\n", " preds = knn.predict(X_test)\n", "\n", " gold = KNeighborsClassifier(\n", " p=2,\n", " metric=\"minkowski\",\n", " leaf_size=ls,\n", " n_neighbors=k,\n", " weights=weights,\n", " algorithm=\"ball_tree\",\n", " )\n", " gold.fit(X, y)\n", " gold_preds = gold.predict(X_test)\n", "\n", " for mine, theirs in zip(preds, gold_preds):\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_RandomForest.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n", "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn.datasets import make_regression, make_blobs\n", "from sklearn.model_selection import train_test_split\n", "\n", "from numpy_ml.trees.gbdt import GradientBoostedDecisionTree\n", "from numpy_ml.trees.dt import DecisionTree, Node, Leaf\n", "from numpy_ml.trees.rf import RandomForest\n", "from 
numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def clone_tree(dtree):\n", "    children_left = dtree.tree_.children_left\n", "    children_right = dtree.tree_.children_right\n", "    feature = dtree.tree_.feature\n", "    threshold = dtree.tree_.threshold\n", "    values = dtree.tree_.value\n", "\n", "    def grow(node_id):\n", "        l, r = children_left[node_id], children_right[node_id]\n", "        if l == r:\n", "            return Leaf(values[node_id].argmax())\n", "        n = Node(None, None, (feature[node_id], threshold[node_id]))\n", "        n.left = grow(l)\n", "        n.right = grow(r)\n", "        return n\n", "\n", "    node_id = 0\n", "    root = Node(None, None, (feature[node_id], threshold[node_id]))\n", "    root.left = grow(children_left[node_id])\n", "    root.right = grow(children_right[node_id])\n", "    return root\n", "\n", "\n", "def compare_trees(mine, gold):\n", "    clone = clone_tree(gold)\n", "    mine = mine.root\n", "\n", "    def test(mine, clone, depth=0):\n", "        if isinstance(clone, Node) and isinstance(mine, Node):\n", "            assert mine.feature == clone.feature, \"Node {} not equal\".format(depth)\n", "            np.testing.assert_allclose(mine.threshold, clone.threshold)\n", "            test(mine.left, clone.left, depth + 1)\n", "            test(mine.right, clone.right, depth + 1)\n", "        elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", "            np.testing.assert_allclose(mine.value, clone.value)\n", "            return\n", "        else:\n", "            raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", "    test(mine, clone)\n", "\n", "def test_RandomForest(N=1):\n", "    np.random.seed(12345)\n", "    i = 1\n", "    while i <= N:\n", "        n_ex = np.random.randint(2, 100)\n", "        n_feats = np.random.randint(2, 100)\n", "        n_trees = np.random.randint(2, 100)\n", "        max_depth = np.random.randint(1, 5)\n", "\n", "        classifier = np.random.choice([True, False])\n", "        if classifier:\n", "            # create classification problem\n", "            n_classes = np.random.randint(2, 10)\n", "            X, Y = make_blobs(\n", "                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i\n", "            )\n", "            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", "            # define loss\n", "            def loss(yp, y):\n", "                return 1 - accuracy_score(yp, y)\n", "\n", "            # initialize model\n", "            criterion = np.random.choice([\"entropy\", \"gini\"])\n", "            mine = RandomForest(\n", "                classifier=classifier,\n", "                n_feats=n_feats,\n", "                n_trees=n_trees,\n", "                criterion=criterion,\n", "                max_depth=max_depth,\n", "            )\n", "            gold = RandomForestClassifier(\n", "                n_estimators=n_trees,\n", "                max_features=n_feats,\n", "                criterion=criterion,\n", "                max_depth=max_depth,\n", "                bootstrap=True,\n", "            )\n", "        else:\n", "            # create regression problem\n", "            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)\n", "            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", "            # initialize model\n", "            criterion = \"mse\"\n", "            loss = mean_squared_error\n", "            mine = RandomForest(\n", "                criterion=criterion,\n", "                n_feats=n_feats,\n", "                n_trees=n_trees,\n", "                max_depth=max_depth,\n", "                
classifier=classifier,\n", " )\n", " gold = RandomForestRegressor(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", "\n", " print(\"Trial {}\".format(i))\n", " print(\"\\tClassifier={}, criterion={}\".format(classifier, criterion))\n", " print(\"\\tmax_depth={}, n_feats={}, n_ex={}\".format(max_depth, n_feats, n_ex))\n", " if classifier:\n", " print(\"\\tn_classes: {}\".format(n_classes))\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " gold.fit(X, Y)\n", "\n", " # get preds\n", " y_pred_mine = mine.predict(X)\n", " y_pred_gold = gold.predict(X)\n", "\n", " loss_mine = loss(y_pred_mine, Y)\n", " loss_gold = loss(y_pred_gold, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_gold_test = gold.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_gold_test = loss(y_pred_gold_test, Y_test)\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine, loss_gold)\n", " print(\"\\tLoss on training: {}\".format(loss_mine))\n", " except AssertionError as e:\n", " print(\"\\tTraining losses not equal:\\n{}\".format(e))\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)\n", " print(\"\\tLoss on test: {}\".format(loss_mine_test))\n", " except AssertionError as e:\n", " print(\"\\tTest losses not equal:\\n{}\".format(e))\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_PolynomialKernel.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "#######################################################################\n", "# Kernels #\n", "#######################################################################\n", "\n", "\n", "\n", "def test_polynomial_kernel(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " M = np.random.randint(1, 100)\n", " C = np.random.randint(1, 1000)\n", " gamma = np.random.rand()\n", " d = np.random.randint(1, 5)\n", " c0 = np.random.rand()\n", "\n", " X = np.random.rand(N, C)\n", " Y = np.random.rand(M, C)\n", "\n", " mine = PolynomialKernel(gamma=gamma, d=d, c0=c0)(X, Y)\n", " gold = sk_poly(X, Y, gamma=gamma, degree=d, coef0=c0)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_SELU.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, 
**kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "def test_selu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import SELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SELU()\n", " gold = lambda z: F.selu(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "\n", "def test_selu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import SELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SELU()\n", " gold = torch_gradient_generator(F.selu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=6)\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_Softmax.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from 
numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "def test_softmax_activation(N=15):\n", " from numpy_ml.neural_nets.layers import Softmax\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Softmax()\n", " gold = lambda z: F.softmax(torch.FloatTensor(z), dim=1).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.forward(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "def test_softmax_grad(N=15):\n", " from numpy_ml.neural_nets.layers import Softmax\n", " from functools import partial\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", " p_soft = partial(F.softmax, dim=1)\n", " gold = torch_gradient_generator(p_soft)\n", "\n", " i = 0\n", " while i < N:\n", " mine = Softmax()\n", " n_ex = np.random.randint(1, 3)\n", " n_dims = np.random.randint(1, 50)\n", " z = random_tensor((n_ex, n_dims), standardize=True)\n", " out = mine.forward(z)\n", "\n", " assert_almost_equal(\n", " gold(z),\n", " mine.backward(np.ones_like(out)),\n", " err_msg=\"Theirs:\\n{}\\n\\nMine:\\n{}\\n\".format(\n", " gold(z), mine.backward(np.ones_like(out))\n", " ),\n", " decimal=3,\n", " )\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_LeakyReLU.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import 
torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "def test_leakyrelu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import LeakyReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " alpha = np.random.uniform(0, 10)\n", "\n", " mine = LeakyReLU(alpha=alpha)\n", " gold = lambda z: F.leaky_relu(torch.FloatTensor(z), alpha).numpy()\n", " assert_almost_equal(mine.fn(z), gold(z))\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "def test_leakyrelu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import LeakyReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " n_dims = np.random.randint(1, 10)\n", " alpha = np.random.uniform(0, 10)\n", " z = random_tensor((n_ex, n_dims))\n", "\n", " mine = LeakyReLU(alpha)\n", " 
gold = torch_gradient_generator(F.leaky_relu, negative_slope=alpha)\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=6)\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/nn_torch_models.py", "content": ["# flake8: noqa\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "# import tensorflow as tf\n", "\n", "import numpy as np\n", "\n", "#######################################################################\n", "# Gold-standard implementations for testing custom layers #\n", "# (Requires Pytorch) #\n", "#######################################################################\n", "\n", "\n", "def torchify(var, requires_grad=True):\n", " return torch.autograd.Variable(torch.FloatTensor(var), requires_grad=requires_grad)\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "def torch_xe_grad(y, z):\n", " z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)\n", " y = torch.LongTensor(y.argmax(axis=1))\n", " loss = F.cross_entropy(z, y, reduction=\"sum\")\n", " loss.backward()\n", " grad = z.grad.numpy()\n", " return grad\n", "\n", "\n", "def torch_mse_grad(y, z, act_fn):\n", " y = torch.FloatTensor(y)\n", " z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)\n", " y_pred = act_fn(z)\n", " loss = F.mse_loss(y_pred, y, reduction=\"sum\") # size_average=False).sum()\n", " loss.backward()\n", " grad = z.grad.numpy()\n", " return grad\n", "\n", "\n", "class TorchVAELoss(nn.Module):\n", " def __init__(self):\n", " super(TorchVAELoss, self).__init__()\n", "\n", " def extract_grads(self, X, X_recon, t_mean, t_log_var):\n", " eps = np.finfo(float).eps\n", " X = torchify(X, requires_grad=False)\n", " X_recon = torchify(np.clip(X_recon, eps, 1 - eps))\n", " t_mean = torchify(t_mean)\n", " t_log_var = torchify(t_log_var)\n", "\n", " BCE = torch.sum(F.binary_cross_entropy(X_recon, X, reduction=\"none\"), dim=1)\n", "\n", " # see Appendix B from VAE paper:\n", " # Kingma and Welling. Auto-Encoding Variational Bayes. 
ICLR, 2014\n", " # https://arxiv.org/abs/1312.6114\n", " # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)\n", " KLD = -0.5 * torch.sum(1 + t_log_var - t_mean.pow(2) - t_log_var.exp(), dim=1)\n", "\n", " loss = torch.mean(BCE + KLD)\n", " loss.backward()\n", "\n", " grads = {\n", " \"loss\": loss.detach().numpy(),\n", " \"dX_recon\": X_recon.grad.numpy(),\n", " \"dt_mean\": t_mean.grad.numpy(),\n", " \"dt_log_var\": t_log_var.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchWGANGPLoss(nn.Module):\n", " def __init__(self, lambda_=10):\n", " self.lambda_ = torchify([lambda_])\n", " super(TorchWGANGPLoss, self).__init__()\n", "\n", " def forward(self, Y_real, Y_fake, gradInterp):\n", " GY_fake = Y_fake.copy()\n", " self.Y_real = torchify(Y_real)\n", " self.Y_fake = torchify(Y_fake)\n", " self.GY_fake = torchify(GY_fake)\n", " self.gradInterp = torchify(gradInterp)\n", "\n", " # calc grad penalty\n", " norm = self.gradInterp.norm(2, dim=1)\n", " self.norm1 = torch.sqrt(torch.sum(self.gradInterp.pow(2), dim=1))\n", " assert torch.allclose(norm, self.norm1)\n", "\n", " self.gpenalty = self.lambda_ * ((self.norm1 - 1).pow(2)).mean()\n", " self.C_loss = self.Y_fake.mean() - self.Y_real.mean() + self.gpenalty\n", " self.G_loss = -self.GY_fake.mean()\n", "\n", " def extract_grads(self, Y_real, Y_fake, gradInterp):\n", " self.forward(Y_real, Y_fake, gradInterp)\n", "\n", " self.C_loss.backward()\n", " self.G_loss.backward()\n", "\n", " grads = {\n", " \"Y_real\": self.Y_real.detach().numpy(),\n", " \"Y_fake\": self.Y_fake.detach().numpy(),\n", " \"gradInterp\": self.gradInterp.detach().numpy(),\n", " \"GP\": self.gpenalty.detach().numpy(),\n", " \"C_loss\": self.C_loss.detach().numpy(),\n", " \"G_loss\": self.G_loss.detach().numpy(),\n", " \"C_dY_real\": self.Y_real.grad.numpy(),\n", " \"C_dGradInterp\": self.gradInterp.grad.numpy(),\n", " \"C_dY_fake\": self.Y_fake.grad.numpy(),\n", " \"G_dY_fake\": self.GY_fake.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchLinearActivation(nn.Module):\n", " def __init__(self):\n", " super(TorchLinearActivation, self).__init__()\n", " pass\n", "\n", " @staticmethod\n", " def forward(input):\n", " return input\n", "\n", " @staticmethod\n", " def backward(grad_output):\n", " return torch.ones_like(grad_output)\n", "\n", "\n", "class TorchBatchNormLayer(nn.Module):\n", " def __init__(self, n_in, params, mode, momentum=0.9, epsilon=1e-5):\n", " super(TorchBatchNormLayer, self).__init__()\n", "\n", " scaler = params[\"scaler\"]\n", " intercept = params[\"intercept\"]\n", "\n", " if mode == \"1D\":\n", " self.layer1 = nn.BatchNorm1d(\n", " num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True\n", " )\n", " elif mode == \"2D\":\n", " self.layer1 = nn.BatchNorm2d(\n", " num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True\n", " )\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " if X.ndim == 4:\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", "\n", " if not isinstance(X, torch.Tensor):\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.Y = self.layer1(self.X)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X, Y_true=None):\n", " self.forward(X)\n", "\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3])\n", " self.loss1 = (\n", " 0.5 * F.mse_loss(self.Y, 
torchify(Y_true), size_average=False).sum()\n", " )\n", " else:\n", " self.loss1 = self.Y.sum()\n", "\n", " self.loss1.backward()\n", "\n", " X_np = self.X.detach().numpy()\n", " Y_np = self.Y.detach().numpy()\n", " dX_np = self.X.grad.numpy()\n", " dY_np = self.Y.grad.numpy()\n", "\n", " if self.X.dim() == 4:\n", " orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, orig, X_swap)\n", " X_np = np.moveaxis(X_np, orig, X_swap)\n", " Y_np = np.moveaxis(Y_np, orig, X_swap)\n", " dX_np = np.moveaxis(dX_np, orig, X_swap)\n", " dY_np = np.moveaxis(dY_np, orig, X_swap)\n", "\n", " grads = {\n", " \"loss\": self.loss1.detach().numpy(),\n", " \"X\": X_np,\n", " \"momentum\": 1 - self.layer1.momentum,\n", " \"epsilon\": self.layer1.eps,\n", " \"intercept\": self.layer1.bias.detach().numpy(),\n", " \"scaler\": self.layer1.weight.detach().numpy(),\n", " \"running_mean\": self.layer1.running_mean.detach().numpy(),\n", " \"running_var\": self.layer1.running_var.detach().numpy(),\n", " \"y\": Y_np,\n", " \"dLdy\": dY_np,\n", " \"dLdIntercept\": self.layer1.bias.grad.numpy(),\n", " \"dLdScaler\": self.layer1.weight.grad.numpy(),\n", " \"dLdX\": dX_np,\n", " }\n", " if isinstance(Y_true, np.ndarray):\n", " grads[\"Y_true\"] = Y_true\n", " return grads\n", "\n", "\n", "class TorchLayerNormLayer(nn.Module):\n", " def __init__(self, feat_dims, params, mode, epsilon=1e-5):\n", " super(TorchLayerNormLayer, self).__init__()\n", "\n", " self.layer1 = nn.LayerNorm(\n", " normalized_shape=feat_dims, eps=epsilon, elementwise_affine=True\n", " )\n", "\n", " scaler = params[\"scaler\"]\n", " intercept = params[\"intercept\"]\n", "\n", " if mode == \"2D\":\n", " scaler = np.moveaxis(scaler, [0, 1, 2], [-2, -1, -3])\n", " intercept = np.moveaxis(intercept, [0, 1, 2], [-2, -1, -3])\n", "\n", " assert scaler.shape == self.layer1.weight.shape\n", " assert intercept.shape == self.layer1.bias.shape\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " if X.ndim == 4:\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", "\n", " if not isinstance(X, torch.Tensor):\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.Y = self.layer1(self.X)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X, Y_true=None):\n", " self.forward(X)\n", "\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3])\n", " self.loss1 = (\n", " 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum()\n", " )\n", " else:\n", " self.loss1 = self.Y.sum()\n", "\n", " self.loss1.backward()\n", "\n", " X_np = self.X.detach().numpy()\n", " Y_np = self.Y.detach().numpy()\n", " dX_np = self.X.grad.numpy()\n", " dY_np = self.Y.grad.numpy()\n", " intercept_np = self.layer1.bias.detach().numpy()\n", " scaler_np = self.layer1.weight.detach().numpy()\n", " dIntercept_np = self.layer1.bias.grad.numpy()\n", " dScaler_np = self.layer1.weight.grad.numpy()\n", "\n", " if self.X.dim() == 4:\n", " orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]\n", " orig_p, p_swap = [0, 1, 2], [-1, -3, -2]\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, orig, X_swap)\n", " X_np = np.moveaxis(X_np, orig, X_swap)\n", " Y_np = np.moveaxis(Y_np, orig, X_swap)\n", " dX_np = np.moveaxis(dX_np, orig, X_swap)\n", " dY_np = np.moveaxis(dY_np, orig, X_swap)\n", " scaler_np 
= np.moveaxis(scaler_np, orig_p, p_swap)\n", " intercept_np = np.moveaxis(intercept_np, orig_p, p_swap)\n", " dScaler_np = np.moveaxis(dScaler_np, orig_p, p_swap)\n", " dIntercept_np = np.moveaxis(dIntercept_np, orig_p, p_swap)\n", "\n", " grads = {\n", " \"loss\": self.loss1.detach().numpy(),\n", " \"X\": X_np,\n", " \"epsilon\": self.layer1.eps,\n", " \"intercept\": intercept_np,\n", " \"scaler\": scaler_np,\n", " \"y\": Y_np,\n", " \"dLdy\": dY_np,\n", " \"dLdIntercept\": dIntercept_np,\n", " \"dLdScaler\": dScaler_np,\n", " \"dLdX\": dX_np,\n", " }\n", " if isinstance(Y_true, np.ndarray):\n", " grads[\"Y_true\"] = Y_true\n", " return grads\n", "\n", "\n", "class TorchAddLayer(nn.Module):\n", " def __init__(self, act_fn, **kwargs):\n", " super(TorchAddLayer, self).__init__()\n", " self.act_fn = act_fn\n", "\n", " def forward(self, Xs):\n", " self.Xs = []\n", " x = Xs[0].copy()\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " self.sum = x.clone()\n", " x.retain_grad()\n", " self.Xs.append(x)\n", "\n", " for i in range(1, len(Xs)):\n", " x = Xs[i]\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " x.retain_grad()\n", " self.Xs.append(x)\n", " self.sum += x\n", "\n", " self.sum.retain_grad()\n", " self.Y = self.act_fn(self.sum)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", " grads = {\n", " \"Xs\": X,\n", " \"Sum\": self.sum.detach().numpy(),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dLdY\": self.Y.grad.numpy(),\n", " \"dLdSum\": self.sum.grad.numpy(),\n", " }\n", " grads.update(\n", " {\"dLdX{}\".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)}\n", " )\n", " return grads\n", "\n", "\n", "class TorchMultiplyLayer(nn.Module):\n", " def __init__(self, act_fn, **kwargs):\n", " super(TorchMultiplyLayer, self).__init__()\n", " self.act_fn = act_fn\n", "\n", " def forward(self, Xs):\n", " self.Xs = []\n", " x = Xs[0].copy()\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " self.prod = x.clone()\n", " x.retain_grad()\n", " self.Xs.append(x)\n", "\n", " for i in range(1, len(Xs)):\n", " x = Xs[i]\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " x.retain_grad()\n", " self.Xs.append(x)\n", " self.prod *= x\n", "\n", " self.prod.retain_grad()\n", " self.Y = self.act_fn(self.prod)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", " grads = {\n", " \"Xs\": X,\n", " \"Prod\": self.prod.detach().numpy(),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dLdY\": self.Y.grad.numpy(),\n", " \"dLdProd\": self.prod.grad.numpy(),\n", " }\n", " grads.update(\n", " {\"dLdX{}\".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)}\n", " )\n", " return grads\n", "\n", "\n", "class TorchSkipConnectionIdentity(nn.Module):\n", " def __init__(self, act_fn, pad1, pad2, params, hparams, momentum=0.9, epsilon=1e-5):\n", " super(TorchSkipConnectionIdentity, self).__init__()\n", "\n", " self.conv1 = nn.Conv2d(\n", " hparams[\"in_ch\"],\n", " hparams[\"out_ch\"],\n", " hparams[\"kernel_shape1\"],\n", " padding=pad1,\n", " stride=hparams[\"stride1\"],\n", " bias=True,\n", " )\n", "\n", " self.act_fn = act_fn\n", "\n", " self.batchnorm1 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " 
affine=True,\n", " )\n", "\n", " self.conv2 = nn.Conv2d(\n", " hparams[\"out_ch\"],\n", " hparams[\"out_ch\"],\n", " hparams[\"kernel_shape2\"],\n", " padding=pad2,\n", " stride=hparams[\"stride2\"],\n", " bias=True,\n", " )\n", "\n", " self.batchnorm2 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv1\"][\"W\"]\n", " b = params[\"components\"][\"conv1\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv1.weight.shape == W.shape\n", " assert self.conv1.bias.shape == b.flatten().shape\n", " self.conv1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm1\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm1\"][\"intercept\"]\n", " self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv2\"][\"W\"]\n", " b = params[\"components\"][\"conv2\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv2.weight.shape == W.shape\n", " assert self.conv2.bias.shape == b.flatten().shape\n", " self.conv2.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm2\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm2\"][\"intercept\"]\n", " self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " if not isinstance(X, torch.Tensor):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.X.retain_grad()\n", "\n", " self.conv1_out = self.conv1(self.X)\n", " self.conv1_out.retain_grad()\n", "\n", " self.act_fn1_out = self.act_fn(self.conv1_out)\n", " self.act_fn1_out.retain_grad()\n", "\n", " self.batchnorm1_out = self.batchnorm1(self.act_fn1_out)\n", " self.batchnorm1_out.retain_grad()\n", "\n", " self.conv2_out = self.conv2(self.batchnorm1_out)\n", " self.conv2_out.retain_grad()\n", "\n", " self.batchnorm2_out = self.batchnorm2(self.conv2_out)\n", " self.batchnorm2_out.retain_grad()\n", "\n", " self.layer3_in = self.batchnorm2_out + self.X\n", " self.layer3_in.retain_grad()\n", "\n", " self.Y = self.act_fn(self.layer3_in)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]\n", " grads = {\n", " # layer parameters\n", " \"conv1_W\": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap),\n", " \"conv1_b\": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn1_intercept\": self.batchnorm1.bias.detach().numpy(),\n", " \"bn1_scaler\": self.batchnorm1.weight.detach().numpy(),\n", " \"bn1_running_mean\": self.batchnorm1.running_mean.detach().numpy(),\n", " \"bn1_running_var\": self.batchnorm1.running_var.detach().numpy(),\n", " \"conv2_W\": np.moveaxis(self.conv2.weight.detach().numpy(), orig, 
W_swap),\n", " \"conv2_b\": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn2_intercept\": self.batchnorm2.bias.detach().numpy(),\n", " \"bn2_scaler\": self.batchnorm2.weight.detach().numpy(),\n", " \"bn2_running_mean\": self.batchnorm2.running_mean.detach().numpy(),\n", " \"bn2_running_var\": self.batchnorm2.running_var.detach().numpy(),\n", " # layer inputs/outputs (forward step)\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"conv1_out\": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap),\n", " \"act1_out\": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap),\n", " \"bn1_out\": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap),\n", " \"conv2_out\": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap),\n", " \"bn2_out\": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap),\n", " \"add_out\": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap),\n", " \"Y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " # layer gradients (backward step)\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdAdd\": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap),\n", " \"dLdBn2_out\": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv2_out\": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap),\n", " \"dLdBn1_out\": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap),\n", " \"dLdActFn1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " # layer parameter gradients (backward step)\n", " \"dLdBn2_intercept\": self.batchnorm2.bias.grad.numpy(),\n", " \"dLdBn2_scaler\": self.batchnorm2.weight.grad.numpy(),\n", " \"dLdConv2_W\": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv2_b\": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdBn1_intercept\": self.batchnorm1.bias.grad.numpy(),\n", " \"dLdBn1_scaler\": self.batchnorm1.weight.grad.numpy(),\n", " \"dLdConv1_W\": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv1_b\": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " }\n", " return grads\n", "\n", "\n", "class TorchCausalConv1d(torch.nn.Conv1d):\n", " \"\"\"https://github.com/pytorch/pytorch/issues/1333\n", "\n", " NB: this is only ensures that the convolution out length is the same as\n", " the input length IFF stride = 1. 
Otherwise, in/out lengths will differ.\n", " \"\"\"\n", "\n", " def __init__(\n", " self,\n", " in_channels,\n", " out_channels,\n", " kernel_size,\n", " stride=1,\n", " dilation=1,\n", " groups=1,\n", " bias=True,\n", " ):\n", " self.__padding = (kernel_size - 1) * dilation\n", "\n", " super(TorchCausalConv1d, self).__init__(\n", " in_channels,\n", " out_channels,\n", " kernel_size=kernel_size,\n", " stride=stride,\n", " padding=self.__padding,\n", " dilation=dilation,\n", " groups=groups,\n", " bias=bias,\n", " )\n", "\n", " def forward(self, input):\n", " result = super(TorchCausalConv1d, self).forward(input)\n", " if self.__padding != 0:\n", " return result[:, :, : -self.__padding]\n", " return result\n", "\n", "\n", "class TorchWavenetModule(nn.Module):\n", " def __init__(self, params, hparams, conv_1x1_pad):\n", " super(TorchWavenetModule, self).__init__()\n", " self.conv_dilation = TorchCausalConv1d(\n", " in_channels=hparams[\"components\"][\"conv_dilation\"][\"in_ch\"],\n", " out_channels=hparams[\"components\"][\"conv_dilation\"][\"out_ch\"],\n", " kernel_size=hparams[\"components\"][\"conv_dilation\"][\"kernel_width\"],\n", " stride=hparams[\"components\"][\"conv_dilation\"][\"stride\"],\n", " dilation=hparams[\"components\"][\"conv_dilation\"][\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " self.conv_1x1 = nn.Conv1d(\n", " in_channels=hparams[\"components\"][\"conv_1x1\"][\"in_ch\"],\n", " out_channels=hparams[\"components\"][\"conv_1x1\"][\"out_ch\"],\n", " kernel_size=hparams[\"components\"][\"conv_1x1\"][\"kernel_width\"],\n", " stride=hparams[\"components\"][\"conv_1x1\"][\"stride\"],\n", " padding=conv_1x1_pad,\n", " dilation=hparams[\"components\"][\"conv_1x1\"][\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " W = params[\"components\"][\"conv_dilation\"][\"W\"]\n", " b = params[\"components\"][\"conv_dilation\"][\"b\"]\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])\n", " self.conv_dilation.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv_dilation.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", " assert self.conv_dilation.weight.shape == W.shape\n", " assert self.conv_dilation.bias.shape == b.flatten().shape\n", "\n", " W = params[\"components\"][\"conv_1x1\"][\"W\"]\n", " b = params[\"components\"][\"conv_1x1\"][\"b\"]\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])\n", " self.conv_1x1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv_1x1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", " assert self.conv_1x1.weight.shape == W.shape\n", " assert self.conv_1x1.bias.shape == b.flatten().shape\n", "\n", " def forward(self, X_main, X_skip):\n", " # (N, W, C) -> (N, C, W)\n", " self.X_main = np.moveaxis(X_main, [0, 1, 2], [0, -1, -2])\n", " self.X_main = torchify(self.X_main)\n", " self.X_main.retain_grad()\n", "\n", " self.conv_dilation_out = self.conv_dilation(self.X_main)\n", " self.conv_dilation_out.retain_grad()\n", "\n", " self.tanh_out = torch.tanh(self.conv_dilation_out)\n", " self.sigm_out = torch.sigmoid(self.conv_dilation_out)\n", "\n", " self.tanh_out.retain_grad()\n", " self.sigm_out.retain_grad()\n", "\n", " self.multiply_gate_out = self.tanh_out * self.sigm_out\n", " self.multiply_gate_out.retain_grad()\n", "\n", " self.conv_1x1_out = self.conv_1x1(self.multiply_gate_out)\n", " self.conv_1x1_out.retain_grad()\n", "\n", " self.X_skip = torch.zeros_like(self.conv_1x1_out)\n", " if X_skip is not None:\n", " 
self.X_skip = torchify(np.moveaxis(X_skip, [0, 1, 2], [0, -1, -2]))\n", " self.X_skip.retain_grad()\n", "\n", " self.Y_skip = self.X_skip + self.conv_1x1_out\n", " self.Y_main = self.X_main + self.conv_1x1_out\n", "\n", " self.Y_skip.retain_grad()\n", " self.Y_main.retain_grad()\n", "\n", " def extract_grads(self, X_main, X_skip):\n", " self.forward(X_main, X_skip)\n", " self.loss = (self.Y_skip + self.Y_main).sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out)\n", " # X (theirs): (N, C, W) -> X (mine): (N, W, C)\n", " # Y (theirs): (N, C, W) -> Y (mine): (N, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3]\n", " grads = {\n", " \"X_main\": np.moveaxis(self.X_main.detach().numpy(), orig, X_swap),\n", " \"X_skip\": np.moveaxis(self.X_skip.detach().numpy(), orig, X_swap),\n", " \"conv_dilation_W\": np.moveaxis(\n", " self.conv_dilation.weight.detach().numpy(), orig, W_swap\n", " ),\n", " \"conv_dilation_b\": self.conv_dilation.bias.detach()\n", " .numpy()\n", " .reshape(1, 1, -1),\n", " \"conv_1x1_W\": np.moveaxis(\n", " self.conv_1x1.weight.detach().numpy(), orig, W_swap\n", " ),\n", " \"conv_1x1_b\": self.conv_1x1.bias.detach().numpy().reshape(1, 1, -1),\n", " \"conv_dilation_out\": np.moveaxis(\n", " self.conv_dilation_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"tanh_out\": np.moveaxis(self.tanh_out.detach().numpy(), orig, X_swap),\n", " \"sigm_out\": np.moveaxis(self.sigm_out.detach().numpy(), orig, X_swap),\n", " \"multiply_gate_out\": np.moveaxis(\n", " self.multiply_gate_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"conv_1x1_out\": np.moveaxis(\n", " self.conv_1x1_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"Y_main\": np.moveaxis(self.Y_main.detach().numpy(), orig, X_swap),\n", " \"Y_skip\": np.moveaxis(self.Y_skip.detach().numpy(), orig, X_swap),\n", " \"dLdY_skip\": np.moveaxis(self.Y_skip.grad.numpy(), orig, X_swap),\n", " \"dLdY_main\": np.moveaxis(self.Y_main.grad.numpy(), orig, X_swap),\n", " \"dLdConv_1x1_out\": np.moveaxis(\n", " self.conv_1x1_out.grad.numpy(), orig, X_swap\n", " ),\n", " \"dLdConv_1x1_W\": np.moveaxis(\n", " self.conv_1x1.weight.grad.numpy(), orig, W_swap\n", " ),\n", " \"dLdConv_1x1_b\": self.conv_1x1.bias.grad.numpy().reshape(1, 1, -1),\n", " \"dLdMultiply_out\": np.moveaxis(\n", " self.multiply_gate_out.grad.numpy(), orig, X_swap\n", " ),\n", " \"dLdTanh_out\": np.moveaxis(self.tanh_out.grad.numpy(), orig, X_swap),\n", " \"dLdSigm_out\": np.moveaxis(self.sigm_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv_dilation_out\": np.moveaxis(\n", " self.conv_dilation_out.grad.numpy(), orig, X_swap\n", " ),\n", " \"dLdConv_dilation_W\": np.moveaxis(\n", " self.conv_dilation.weight.grad.numpy(), orig, W_swap\n", " ),\n", " \"dLdConv_dilation_b\": self.conv_dilation.bias.grad.numpy().reshape(\n", " 1, 1, -1\n", " ),\n", " \"dLdX_main\": np.moveaxis(self.X_main.grad.numpy(), orig, X_swap),\n", " \"dLdX_skip\": np.moveaxis(self.X_skip.grad.numpy(), orig, X_swap),\n", " }\n", "\n", " return grads\n", "\n", "\n", "class TorchSkipConnectionConv(nn.Module):\n", " def __init__(\n", " self, act_fn, pad1, pad2, pad_skip, params, hparams, momentum=0.9, epsilon=1e-5\n", " ):\n", " super(TorchSkipConnectionConv, self).__init__()\n", "\n", " self.conv1 = nn.Conv2d(\n", " hparams[\"in_ch\"],\n", " hparams[\"out_ch1\"],\n", " hparams[\"kernel_shape1\"],\n", " padding=pad1,\n", " stride=hparams[\"stride1\"],\n", " bias=True,\n", " )\n", "\n", " self.act_fn = act_fn\n", "\n", " 
self.batchnorm1 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch1\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " self.conv2 = nn.Conv2d(\n", " hparams[\"out_ch1\"],\n", " hparams[\"out_ch2\"],\n", " hparams[\"kernel_shape2\"],\n", " padding=pad2,\n", " stride=hparams[\"stride2\"],\n", " bias=True,\n", " )\n", "\n", " self.batchnorm2 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch2\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " self.conv_skip = nn.Conv2d(\n", " hparams[\"in_ch\"],\n", " hparams[\"out_ch2\"],\n", " hparams[\"kernel_shape_skip\"],\n", " padding=pad_skip,\n", " stride=hparams[\"stride_skip\"],\n", " bias=True,\n", " )\n", "\n", " self.batchnorm_skip = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch2\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv1\"][\"W\"]\n", " b = params[\"components\"][\"conv1\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv1.weight.shape == W.shape\n", " assert self.conv1.bias.shape == b.flatten().shape\n", " self.conv1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm1\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm1\"][\"intercept\"]\n", " self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv2\"][\"W\"]\n", " b = params[\"components\"][\"conv2\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv2.weight.shape == W.shape\n", " assert self.conv2.bias.shape == b.flatten().shape\n", " self.conv2.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm2\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm2\"][\"intercept\"]\n", " self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " W = params[\"components\"][\"conv_skip\"][\"W\"]\n", " b = params[\"components\"][\"conv_skip\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv_skip.weight.shape == W.shape\n", " assert self.conv_skip.bias.shape == b.flatten().shape\n", " self.conv_skip.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv_skip.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm_skip\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm_skip\"][\"intercept\"]\n", " self.batchnorm_skip.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm_skip.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " if not isinstance(X, torch.Tensor):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.X.retain_grad()\n", "\n", " self.conv1_out = self.conv1(self.X)\n", " self.conv1_out.retain_grad()\n", "\n", " self.act_fn1_out = self.act_fn(self.conv1_out)\n", " self.act_fn1_out.retain_grad()\n", 
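"        # NB: the main path is conv1 -> act_fn -> batchnorm1 -> conv2 -> batchnorm2; the skip\n", "        # path below sends X through conv_skip -> batchnorm_skip before the two branches are\n", "        # summed and passed through act_fn. Intermediate tensors call retain_grad() so that\n", "        # extract_grads() can read their gradients after the backward pass.\n",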
"\n", " self.batchnorm1_out = self.batchnorm1(self.act_fn1_out)\n", " self.batchnorm1_out.retain_grad()\n", "\n", " self.conv2_out = self.conv2(self.batchnorm1_out)\n", " self.conv2_out.retain_grad()\n", "\n", " self.batchnorm2_out = self.batchnorm2(self.conv2_out)\n", " self.batchnorm2_out.retain_grad()\n", "\n", " self.c_skip_out = self.conv_skip(self.X)\n", " self.c_skip_out.retain_grad()\n", "\n", " self.bn_skip_out = self.batchnorm_skip(self.c_skip_out)\n", " self.bn_skip_out.retain_grad()\n", "\n", " self.layer3_in = self.batchnorm2_out + self.bn_skip_out\n", " self.layer3_in.retain_grad()\n", "\n", " self.Y = self.act_fn(self.layer3_in)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]\n", " grads = {\n", " # layer parameters\n", " \"conv1_W\": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap),\n", " \"conv1_b\": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn1_intercept\": self.batchnorm1.bias.detach().numpy(),\n", " \"bn1_scaler\": self.batchnorm1.weight.detach().numpy(),\n", " \"bn1_running_mean\": self.batchnorm1.running_mean.detach().numpy(),\n", " \"bn1_running_var\": self.batchnorm1.running_var.detach().numpy(),\n", " \"conv2_W\": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap),\n", " \"conv2_b\": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn2_intercept\": self.batchnorm2.bias.detach().numpy(),\n", " \"bn2_scaler\": self.batchnorm2.weight.detach().numpy(),\n", " \"bn2_running_mean\": self.batchnorm2.running_mean.detach().numpy(),\n", " \"bn2_running_var\": self.batchnorm2.running_var.detach().numpy(),\n", " \"conv_skip_W\": np.moveaxis(\n", " self.conv_skip.weight.detach().numpy(), orig, W_swap\n", " ),\n", " \"conv_skip_b\": self.conv_skip.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn_skip_intercept\": self.batchnorm_skip.bias.detach().numpy(),\n", " \"bn_skip_scaler\": self.batchnorm_skip.weight.detach().numpy(),\n", " \"bn_skip_running_mean\": self.batchnorm_skip.running_mean.detach().numpy(),\n", " \"bn_skip_running_var\": self.batchnorm_skip.running_var.detach().numpy(),\n", " # layer inputs/outputs (forward step)\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"conv1_out\": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap),\n", " \"act1_out\": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap),\n", " \"bn1_out\": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap),\n", " \"conv2_out\": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap),\n", " \"bn2_out\": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap),\n", " \"conv_skip_out\": np.moveaxis(\n", " self.c_skip_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"bn_skip_out\": np.moveaxis(self.bn_skip_out.detach().numpy(), orig, X_swap),\n", " \"add_out\": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap),\n", " \"Y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " # layer gradients (backward step)\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdAdd\": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap),\n", " \"dLdBnSkip_out\": np.moveaxis(self.bn_skip_out.grad.numpy(), orig, X_swap),\n", " \"dLdConvSkip_out\": np.moveaxis(self.c_skip_out.grad.numpy(), orig, X_swap),\n", " \"dLdBn2_out\": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, 
X_swap),\n", " \"dLdConv2_out\": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap),\n", " \"dLdBn1_out\": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap),\n", " \"dLdActFn1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " # layer parameter gradients (backward step)\n", " \"dLdBnSkip_intercept\": self.batchnorm_skip.bias.grad.numpy(),\n", " \"dLdBnSkip_scaler\": self.batchnorm_skip.weight.grad.numpy(),\n", " \"dLdConvSkip_W\": np.moveaxis(\n", " self.conv_skip.weight.grad.numpy(), orig, W_swap\n", " ),\n", " \"dLdConvSkip_b\": self.conv_skip.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdBn2_intercept\": self.batchnorm2.bias.grad.numpy(),\n", " \"dLdBn2_scaler\": self.batchnorm2.weight.grad.numpy(),\n", " \"dLdConv2_W\": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv2_b\": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdBn1_intercept\": self.batchnorm1.bias.grad.numpy(),\n", " \"dLdBn1_scaler\": self.batchnorm1.weight.grad.numpy(),\n", " \"dLdConv1_W\": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv1_b\": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " }\n", " return grads\n", "\n", "\n", "class TorchBidirectionalLSTM(nn.Module):\n", " def __init__(self, n_in, n_out, params, **kwargs):\n", " super(TorchBidirectionalLSTM, self).__init__()\n", "\n", " self.layer1 = nn.LSTM(\n", " input_size=n_in,\n", " hidden_size=n_out,\n", " num_layers=1,\n", " bidirectional=True,\n", " bias=True,\n", " )\n", "\n", " Wiu = params[\"components\"][\"cell_fwd\"][\"Wu\"][n_out:, :].T\n", " Wif = params[\"components\"][\"cell_fwd\"][\"Wf\"][n_out:, :].T\n", " Wic = params[\"components\"][\"cell_fwd\"][\"Wc\"][n_out:, :].T\n", " Wio = params[\"components\"][\"cell_fwd\"][\"Wo\"][n_out:, :].T\n", " W_ih_f = np.vstack([Wiu, Wif, Wic, Wio])\n", "\n", " Whu = params[\"components\"][\"cell_fwd\"][\"Wu\"][:n_out, :].T\n", " Whf = params[\"components\"][\"cell_fwd\"][\"Wf\"][:n_out, :].T\n", " Whc = params[\"components\"][\"cell_fwd\"][\"Wc\"][:n_out, :].T\n", " Who = params[\"components\"][\"cell_fwd\"][\"Wo\"][:n_out, :].T\n", " W_hh_f = np.vstack([Whu, Whf, Whc, Who])\n", "\n", " assert self.layer1.weight_ih_l0.shape == W_ih_f.shape\n", " assert self.layer1.weight_hh_l0.shape == W_hh_f.shape\n", "\n", " self.layer1.weight_ih_l0 = nn.Parameter(torch.FloatTensor(W_ih_f))\n", " self.layer1.weight_hh_l0 = nn.Parameter(torch.FloatTensor(W_hh_f))\n", "\n", " Wiu = params[\"components\"][\"cell_bwd\"][\"Wu\"][n_out:, :].T\n", " Wif = params[\"components\"][\"cell_bwd\"][\"Wf\"][n_out:, :].T\n", " Wic = params[\"components\"][\"cell_bwd\"][\"Wc\"][n_out:, :].T\n", " Wio = params[\"components\"][\"cell_bwd\"][\"Wo\"][n_out:, :].T\n", " W_ih_b = np.vstack([Wiu, Wif, Wic, Wio])\n", "\n", " Whu = params[\"components\"][\"cell_bwd\"][\"Wu\"][:n_out, :].T\n", " Whf = params[\"components\"][\"cell_bwd\"][\"Wf\"][:n_out, :].T\n", " Whc = params[\"components\"][\"cell_bwd\"][\"Wc\"][:n_out, :].T\n", " Who = params[\"components\"][\"cell_bwd\"][\"Wo\"][:n_out, :].T\n", " W_hh_b = np.vstack([Whu, Whf, Whc, Who])\n", "\n", " assert self.layer1.weight_ih_l0_reverse.shape == W_ih_b.shape\n", " assert self.layer1.weight_hh_l0_reverse.shape == W_hh_b.shape\n", "\n", " self.layer1.weight_ih_l0_reverse = nn.Parameter(torch.FloatTensor(W_ih_b))\n", " 
self.layer1.weight_hh_l0_reverse = nn.Parameter(torch.FloatTensor(W_hh_b))\n", "\n", " b_f = np.concatenate(\n", " [\n", " params[\"components\"][\"cell_fwd\"][\"bu\"],\n", " params[\"components\"][\"cell_fwd\"][\"bf\"],\n", " params[\"components\"][\"cell_fwd\"][\"bc\"],\n", " params[\"components\"][\"cell_fwd\"][\"bo\"],\n", " ],\n", " axis=-1,\n", " ).flatten()\n", "\n", " assert self.layer1.bias_ih_l0.shape == b_f.shape\n", " assert self.layer1.bias_hh_l0.shape == b_f.shape\n", "\n", " self.layer1.bias_ih_l0 = nn.Parameter(torch.FloatTensor(b_f))\n", " self.layer1.bias_hh_l0 = nn.Parameter(torch.FloatTensor(b_f))\n", "\n", " b_b = np.concatenate(\n", " [\n", " params[\"components\"][\"cell_bwd\"][\"bu\"],\n", " params[\"components\"][\"cell_bwd\"][\"bf\"],\n", " params[\"components\"][\"cell_bwd\"][\"bc\"],\n", " params[\"components\"][\"cell_bwd\"][\"bo\"],\n", " ],\n", " axis=-1,\n", " ).flatten()\n", "\n", " assert self.layer1.bias_ih_l0_reverse.shape == b_b.shape\n", " assert self.layer1.bias_hh_l0_reverse.shape == b_b.shape\n", "\n", " self.layer1.bias_ih_l0_reverse = nn.Parameter(torch.FloatTensor(b_b))\n", " self.layer1.bias_hh_l0_reverse = nn.Parameter(torch.FloatTensor(b_b))\n", "\n", " def forward(self, X):\n", " # (batch, input_size, seq_len) -> (seq_len, batch, input_size)\n", " self.X = np.moveaxis(X, [0, 1, 2], [-2, -1, -3])\n", "\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " # initial hidden state is 0\n", " n_ex, n_in, n_timesteps = self.X.shape\n", " n_out, n_out = self.layer1.weight_hh_l0.shape\n", "\n", " # forward pass\n", " self.A, (At, Ct) = self.layer1(self.X)\n", " self.A.retain_grad()\n", " return self.A\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.A.sum()\n", " self.loss.backward()\n", "\n", " # forward\n", " w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0.chunk(4, 0)\n", " w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0.chunk(4, 0)\n", " bu_f, bf_f, bc_f, bo_f = self.layer1.bias_ih_l0.chunk(4, 0)\n", "\n", " Wu_f = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0)\n", " Wf_f = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0)\n", " Wc_f = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0)\n", " Wo_f = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0)\n", "\n", " dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0.grad.chunk(4, 0)\n", " dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0.grad.chunk(4, 0)\n", " dbu_f, dbf_f, dbc_f, dbo_f = self.layer1.bias_ih_l0.grad.chunk(4, 0)\n", "\n", " dWu_f = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0)\n", " dWf_f = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0)\n", " dWc_f = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0)\n", " dWo_f = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0)\n", "\n", " # backward\n", " w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0_reverse.chunk(4, 0)\n", " w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0_reverse.chunk(4, 0)\n", " bu_b, bf_b, bc_b, bo_b = self.layer1.bias_ih_l0_reverse.chunk(4, 0)\n", "\n", " Wu_b = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0)\n", " Wf_b = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0)\n", " Wc_b = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0)\n", " Wo_b = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0)\n", "\n", " dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0_reverse.grad.chunk(4, 0)\n", " dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0_reverse.grad.chunk(4, 0)\n", " dbu_b, dbf_b, dbc_b, dbo_b = 
self.layer1.bias_ih_l0_reverse.grad.chunk(4, 0)\n", "\n", " dWu_b = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0)\n", " dWf_b = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0)\n", " dWc_b = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0)\n", " dWo_b = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0)\n", "\n", " orig, X_swap = [0, 1, 2], [-1, -3, -2]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"Wu_f\": Wu_f.detach().numpy(),\n", " \"Wf_f\": Wf_f.detach().numpy(),\n", " \"Wc_f\": Wc_f.detach().numpy(),\n", " \"Wo_f\": Wo_f.detach().numpy(),\n", " \"bu_f\": bu_f.detach().numpy().reshape(-1, 1),\n", " \"bf_f\": bf_f.detach().numpy().reshape(-1, 1),\n", " \"bc_f\": bc_f.detach().numpy().reshape(-1, 1),\n", " \"bo_f\": bo_f.detach().numpy().reshape(-1, 1),\n", " \"Wu_b\": Wu_b.detach().numpy(),\n", " \"Wf_b\": Wf_b.detach().numpy(),\n", " \"Wc_b\": Wc_b.detach().numpy(),\n", " \"Wo_b\": Wo_b.detach().numpy(),\n", " \"bu_b\": bu_b.detach().numpy().reshape(-1, 1),\n", " \"bf_b\": bf_b.detach().numpy().reshape(-1, 1),\n", " \"bc_b\": bc_b.detach().numpy().reshape(-1, 1),\n", " \"bo_b\": bo_b.detach().numpy().reshape(-1, 1),\n", " \"y\": np.moveaxis(self.A.detach().numpy(), orig, X_swap),\n", " \"dLdA\": self.A.grad.numpy(),\n", " \"dLdWu_f\": dWu_f.numpy(),\n", " \"dLdWf_f\": dWf_f.numpy(),\n", " \"dLdWc_f\": dWc_f.numpy(),\n", " \"dLdWo_f\": dWo_f.numpy(),\n", " \"dLdBu_f\": dbu_f.numpy().reshape(-1, 1),\n", " \"dLdBf_f\": dbf_f.numpy().reshape(-1, 1),\n", " \"dLdBc_f\": dbc_f.numpy().reshape(-1, 1),\n", " \"dLdBo_f\": dbo_f.numpy().reshape(-1, 1),\n", " \"dLdWu_b\": dWu_b.numpy(),\n", " \"dLdWf_b\": dWf_b.numpy(),\n", " \"dLdWc_b\": dWc_b.numpy(),\n", " \"dLdWo_b\": dWo_b.numpy(),\n", " \"dLdBu_b\": dbu_b.numpy().reshape(-1, 1),\n", " \"dLdBf_b\": dbf_b.numpy().reshape(-1, 1),\n", " \"dLdBc_b\": dbc_b.numpy().reshape(-1, 1),\n", " \"dLdBo_b\": dbo_b.numpy().reshape(-1, 1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchPool2DLayer(nn.Module):\n", " def __init__(self, in_channels, hparams, **kwargs):\n", " super(TorchPool2DLayer, self).__init__()\n", "\n", " if hparams[\"mode\"] == \"max\":\n", " self.layer1 = nn.MaxPool2d(\n", " kernel_size=hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " )\n", " elif hparams[\"mode\"] == \"average\":\n", " self.layer1 = nn.AvgPool2d(\n", " kernel_size=hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " )\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", " self.Y = self.layer1(self.X)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)\n", " # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C)\n", " # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C)\n", " orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdX\": 
np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchConv2DLayer(nn.Module):\n", " def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):\n", " super(TorchConv2DLayer, self).__init__()\n", "\n", " W = params[\"W\"]\n", " b = params[\"b\"]\n", " self.act_fn = act_fn\n", "\n", " self.layer1 = nn.Conv2d(\n", " in_channels,\n", " out_channels,\n", " hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " dilation=hparams[\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -3, -4])\n", " assert self.layer1.weight.shape == W.shape\n", " assert self.layer1.bias.shape == b.flatten().shape\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " self.Z = self.layer1(self.X)\n", " self.Z.retain_grad()\n", "\n", " self.Y = self.act_fn(self.Z)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)\n", " # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C)\n", " # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"W\": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap),\n", " \"b\": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdZ\": np.moveaxis(self.Z.grad.numpy(), orig, X_swap),\n", " \"dLdW\": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdB\": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchConv1DLayer(nn.Module):\n", " def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):\n", " super(TorchConv1DLayer, self).__init__()\n", "\n", " W = params[\"W\"]\n", " b = params[\"b\"]\n", " self.act_fn = act_fn\n", "\n", " self.layer1 = nn.Conv1d(\n", " in_channels,\n", " out_channels,\n", " hparams[\"kernel_width\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " dilation=hparams[\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])\n", " assert self.layer1.weight.shape == W.shape\n", " assert self.layer1.bias.shape == b.flatten().shape\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " def forward(self, X):\n", " # (N, W, C) -> (N, C, W)\n", " self.X = np.moveaxis(X, [0, 1, 2], [0, -1, -2])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " self.Z = self.layer1(self.X)\n", " 
self.Z.retain_grad()\n", "\n", " self.Y = self.act_fn(self.Z)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out)\n", " # X (theirs): (N, C, W) -> X (mine): (N, W, C)\n", " # Y (theirs): (N, C, W) -> Y (mine): (N, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"W\": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap),\n", " \"b\": self.layer1.bias.detach().numpy().reshape(1, 1, -1),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdZ\": np.moveaxis(self.Z.grad.numpy(), orig, X_swap),\n", " \"dLdW\": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdB\": self.layer1.bias.grad.numpy().reshape(1, 1, -1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchDeconv2DLayer(nn.Module):\n", " def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):\n", " super(TorchDeconv2DLayer, self).__init__()\n", "\n", " W = params[\"W\"]\n", " b = params[\"b\"]\n", " self.act_fn = act_fn\n", "\n", " self.layer1 = nn.ConvTranspose2d(\n", " in_channels,\n", " out_channels,\n", " hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " dilation=1,\n", " bias=True,\n", " )\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_in, n_out, f[0], f[1])\n", " W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -4, -3])\n", " assert self.layer1.weight.shape == W.shape\n", " assert self.layer1.bias.shape == b.flatten().shape\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " self.Z = self.layer1(self.X)\n", " self.Z.retain_grad()\n", "\n", " self.Y = self.act_fn(self.Z)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_in, n_out, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)\n", " # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C)\n", " # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-2, -1, -4, -3]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"W\": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap),\n", " \"b\": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdZ\": np.moveaxis(self.Z.grad.numpy(), orig, X_swap),\n", " \"dLdW\": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdB\": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchLSTMCell(nn.Module):\n", " def __init__(self, n_in, n_out, params, 
**kwargs):\n", " super(TorchLSTMCell, self).__init__()\n", "\n", " Wiu = params[\"Wu\"][n_out:, :].T\n", " Wif = params[\"Wf\"][n_out:, :].T\n", " Wic = params[\"Wc\"][n_out:, :].T\n", " Wio = params[\"Wo\"][n_out:, :].T\n", " W_ih = np.vstack([Wiu, Wif, Wic, Wio])\n", "\n", " Whu = params[\"Wu\"][:n_out, :].T\n", " Whf = params[\"Wf\"][:n_out, :].T\n", " Whc = params[\"Wc\"][:n_out, :].T\n", " Who = params[\"Wo\"][:n_out, :].T\n", " W_hh = np.vstack([Whu, Whf, Whc, Who])\n", "\n", " self.layer1 = nn.LSTMCell(input_size=n_in, hidden_size=n_out, bias=True)\n", " assert self.layer1.weight_ih.shape == W_ih.shape\n", " assert self.layer1.weight_hh.shape == W_hh.shape\n", " self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(W_ih))\n", " self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(W_hh))\n", "\n", " b = np.concatenate(\n", " [params[\"bu\"], params[\"bf\"], params[\"bc\"], params[\"bo\"]], axis=-1\n", " ).flatten()\n", " assert self.layer1.bias_ih.shape == b.shape\n", " assert self.layer1.bias_hh.shape == b.shape\n", " self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(b))\n", " self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(b))\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " # initial hidden state is 0\n", " n_ex, n_in, n_timesteps = self.X.shape\n", " n_out, n_out = self.layer1.weight_hh.shape\n", "\n", " # initialize hidden states\n", " a0 = torchify(np.zeros((n_ex, n_out)))\n", " c0 = torchify(np.zeros((n_ex, n_out)))\n", " a0.retain_grad()\n", " c0.retain_grad()\n", "\n", " # forward pass\n", " A, C = [], []\n", " at = a0\n", " ct = c0\n", " for t in range(n_timesteps):\n", " A.append(at)\n", " C.append(ct)\n", " at1, ct1 = self.layer1(self.X[:, :, t], (at, ct))\n", " at.retain_grad()\n", " ct.retain_grad()\n", " at = at1\n", " ct = ct1\n", "\n", " at.retain_grad()\n", " ct.retain_grad()\n", " A.append(at)\n", " C.append(ct)\n", "\n", " # don't inclue a0 in our outputs\n", " self.A = A[1:]\n", " self.C = C[1:]\n", " return self.A, self.C\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = torch.stack(self.A).sum()\n", " self.loss.backward()\n", "\n", " w_ii, w_if, w_ic, w_io = self.layer1.weight_ih.chunk(4, 0)\n", " w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh.chunk(4, 0)\n", " bu, bf, bc, bo = self.layer1.bias_ih.chunk(4, 0)\n", "\n", " Wu = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0)\n", " Wf = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0)\n", " Wc = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0)\n", " Wo = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0)\n", "\n", " dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih.grad.chunk(4, 0)\n", " dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh.grad.chunk(4, 0)\n", " dbu, dbf, dbc, dbo = self.layer1.bias_ih.grad.chunk(4, 0)\n", "\n", " dWu = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0)\n", " dWf = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0)\n", " dWc = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0)\n", " dWo = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0)\n", "\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"Wu\": Wu.detach().numpy(),\n", " \"Wf\": Wf.detach().numpy(),\n", " \"Wc\": Wc.detach().numpy(),\n", " \"Wo\": Wo.detach().numpy(),\n", " \"bu\": bu.detach().numpy().reshape(-1, 1),\n", " \"bf\": bf.detach().numpy().reshape(-1, 1),\n", " \"bc\": bc.detach().numpy().reshape(-1, 1),\n", " \"bo\": 
bo.detach().numpy().reshape(-1, 1),\n", " \"C\": torch.stack(self.C).detach().numpy(),\n", " \"y\": np.swapaxes(\n", " np.swapaxes(torch.stack(self.A).detach().numpy(), 1, 0), 1, 2\n", " ),\n", " \"dLdA\": np.array([a.grad.numpy() for a in self.A]),\n", " \"dLdWu\": dWu.numpy(),\n", " \"dLdWf\": dWf.numpy(),\n", " \"dLdWc\": dWc.numpy(),\n", " \"dLdWo\": dWo.numpy(),\n", " \"dLdBu\": dbu.numpy().reshape(-1, 1),\n", " \"dLdBf\": dbf.numpy().reshape(-1, 1),\n", " \"dLdBc\": dbc.numpy().reshape(-1, 1),\n", " \"dLdBo\": dbo.numpy().reshape(-1, 1),\n", " \"dLdX\": self.X.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchRNNCell(nn.Module):\n", " def __init__(self, n_in, n_hid, params, **kwargs):\n", " super(TorchRNNCell, self).__init__()\n", "\n", " self.layer1 = nn.RNNCell(n_in, n_hid, bias=True, nonlinearity=\"tanh\")\n", "\n", " # set weights and bias to match those of RNNCell\n", " # NB: we pass the *transpose* of the RNNCell weights and biases to\n", " # pytorch, meaning we need to check against the *transpose* of our\n", " # outputs for any function of the weights\n", " self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(params[\"Wax\"].T))\n", " self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(params[\"Waa\"].T))\n", " self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(params[\"bx\"].T))\n", " self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(params[\"ba\"].T))\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " # initial hidden state is 0\n", " n_ex, n_in, n_timesteps = self.X.shape\n", " n_out, n_out = self.layer1.weight_hh.shape\n", "\n", " # initialize hidden states\n", " a0 = torchify(np.zeros((n_ex, n_out)))\n", " a0.retain_grad()\n", "\n", " # forward pass\n", " A = []\n", " at = a0\n", " for t in range(n_timesteps):\n", " A += [at]\n", " at1 = self.layer1(self.X[:, :, t], at)\n", " at.retain_grad()\n", " at = at1\n", "\n", " at.retain_grad()\n", " A += [at]\n", "\n", " # don't inclue a0 in our outputs\n", " self.A = A[1:]\n", " return self.A\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = torch.stack(self.A).sum()\n", " self.loss.backward()\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"ba\": self.layer1.bias_hh.detach().numpy(),\n", " \"bx\": self.layer1.bias_ih.detach().numpy(),\n", " \"Wax\": self.layer1.weight_ih.detach().numpy(),\n", " \"Waa\": self.layer1.weight_hh.detach().numpy(),\n", " \"y\": torch.stack(self.A).detach().numpy(),\n", " \"dLdA\": np.array([a.grad.numpy() for a in self.A]),\n", " \"dLdWaa\": self.layer1.weight_hh.grad.numpy(),\n", " \"dLdWax\": self.layer1.weight_ih.grad.numpy(),\n", " \"dLdBa\": self.layer1.bias_hh.grad.numpy(),\n", " \"dLdBx\": self.layer1.bias_ih.grad.numpy(),\n", " \"dLdX\": self.X.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchFCLayer(nn.Module):\n", " def __init__(self, n_in, n_hid, act_fn, params, **kwargs):\n", " super(TorchFCLayer, self).__init__()\n", " self.layer1 = nn.Linear(n_in, n_hid)\n", "\n", " # explicitly set weights and bias\n", " # NB: we pass the *transpose* of the weights to pytorch, meaning\n", " # we'll need to check against the *transpose* of our outputs for\n", " # any function of the weights\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(params[\"W\"].T))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(params[\"b\"]))\n", "\n", " self.act_fn = act_fn\n", " self.model = 
nn.Sequential(self.layer1, self.act_fn)\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(X, torch.Tensor):\n", " self.X = torchify(X)\n", "\n", " self.z1 = self.layer1(self.X)\n", " self.z1.retain_grad()\n", "\n", " self.out1 = self.act_fn(self.z1)\n", " self.out1.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss1 = self.out1.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"b\": self.layer1.bias.detach().numpy(),\n", " \"W\": self.layer1.weight.detach().numpy(),\n", " \"y\": self.out1.detach().numpy(),\n", " \"dLdy\": self.out1.grad.numpy(),\n", " \"dLdZ\": self.z1.grad.numpy(),\n", " \"dLdB\": self.layer1.bias.grad.numpy(),\n", " \"dLdW\": self.layer1.weight.grad.numpy(),\n", " \"dLdX\": self.X.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchEmbeddingLayer(nn.Module):\n", " def __init__(self, vocab_size, n_out, params, **kwargs):\n", " super(TorchEmbeddingLayer, self).__init__()\n", " self.layer1 = nn.Embedding(vocab_size, n_out)\n", "\n", " # explicitly set embedding weights\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(params[\"W\"]))\n", " self.model = nn.Sequential(self.layer1)\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(X, torch.Tensor):\n", " self.X = torch.from_numpy(X)\n", "\n", " self.out1 = self.layer1(self.X)\n", " self.out1.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss1 = self.out1.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"W\": self.layer1.weight.detach().numpy(),\n", " \"y\": self.out1.detach().numpy(),\n", " \"dLdy\": self.out1.grad.numpy(),\n", " \"dLdW\": self.layer1.weight.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchSDPAttentionLayer(nn.Module):\n", " def __init__(self):\n", " super(TorchSDPAttentionLayer, self).__init__()\n", "\n", " def forward(self, Q, K, V, mask=None):\n", " self.Q = Q\n", " self.K = K\n", " self.V = V\n", "\n", " if not isinstance(self.Q, torch.Tensor):\n", " self.Q = torchify(self.Q)\n", " if not isinstance(self.K, torch.Tensor):\n", " self.K = torchify(self.K)\n", " if not isinstance(self.V, torch.Tensor):\n", " self.V = torchify(self.V)\n", "\n", " self.Q.retain_grad()\n", " self.K.retain_grad()\n", " self.V.retain_grad()\n", "\n", " self.d_k = self.Q.size(-1)\n", " self.scores = torch.matmul(self.Q, self.K.transpose(-2, -1)) / np.sqrt(self.d_k)\n", " if mask is not None:\n", " self.scores = self.scores.masked_fill(mask == 0, -1e9)\n", " self.scores.retain_grad()\n", "\n", " self.weights = F.softmax(self.scores, dim=-1)\n", " self.weights.retain_grad()\n", " self.Y = torch.matmul(self.weights, self.V)\n", " self.Y.retain_grad()\n", " return self.Y, self.weights\n", "\n", " def extract_grads(self, Q, K, V, mask=None):\n", " self.forward(Q, K, V, mask=mask)\n", " self.loss1 = self.Y.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"Q\": self.Q.detach().numpy(),\n", " \"K\": self.K.detach().numpy(),\n", " \"V\": self.V.detach().numpy(),\n", " \"d_k\": self.d_k,\n", " \"scores\": self.scores.detach().numpy(),\n", " \"weights\": self.weights.detach().numpy(),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dLdV\": self.V.grad.numpy(),\n", " \"dWeights\": self.weights.grad.numpy(),\n", " \"dScores\": self.scores.grad.numpy(),\n", " \"dLdQ\": self.Q.grad.numpy(),\n", " \"dLdK\": self.K.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class 
TorchMultiHeadedAttentionModule(nn.Module):\n", " def __init__(self, params, hparams):\n", " \"Take in model size and number of heads.\"\n", " super(TorchMultiHeadedAttentionModule, self).__init__()\n", " assert hparams[\"kqv_dim\"] % hparams[\"n_heads\"] == 0\n", " self.n_heads = hparams[\"n_heads\"]\n", " self.latent_dim = hparams[\"kqv_dim\"] // hparams[\"n_heads\"]\n", " self.p_dropout = hparams[\"dropout_p\"]\n", " self.projections = {\n", " \"Q\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " \"K\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " \"V\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " \"O\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " }\n", " self.projections[\"Q\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"Q\"][\"W\"].T)\n", " )\n", " self.projections[\"Q\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"Q\"][\"b\"])\n", " )\n", " self.projections[\"K\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"K\"][\"W\"].T)\n", " )\n", " self.projections[\"K\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"K\"][\"b\"])\n", " )\n", " self.projections[\"V\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"V\"][\"W\"].T)\n", " )\n", " self.projections[\"V\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"V\"][\"b\"])\n", " )\n", " self.projections[\"O\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"O\"][\"W\"].T)\n", " )\n", " self.projections[\"O\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"O\"][\"b\"])\n", " )\n", "\n", " self.attn = None\n", " self.dropout = nn.Dropout(p=hparams[\"dropout_p\"])\n", "\n", " def forward(self, Q, K, V, mask=None):\n", " self.Q = Q\n", " self.K = K\n", " self.V = V\n", "\n", " if not isinstance(self.Q, torch.Tensor):\n", " self.Q = torchify(self.Q)\n", " if not isinstance(self.K, torch.Tensor):\n", " self.K = torchify(self.K)\n", " if not isinstance(self.V, torch.Tensor):\n", " self.V = torchify(self.V)\n", "\n", " self.Q.retain_grad()\n", " self.K.retain_grad()\n", " self.V.retain_grad()\n", "\n", " if mask is not None:\n", " # Same mask applied to all h heads.\n", " mask = mask.unsqueeze(1)\n", " n_ex = self.Q.size(0)\n", "\n", " self.Q_proj = (\n", " self.projections[\"Q\"](self.Q)\n", " .view(n_ex, -1, self.n_heads, self.latent_dim)\n", " .transpose(1, 2)\n", " )\n", "\n", " self.K_proj = (\n", " self.projections[\"K\"](self.K)\n", " .view(n_ex, -1, self.n_heads, self.latent_dim)\n", " .transpose(1, 2)\n", " )\n", "\n", " self.V_proj = (\n", " self.projections[\"V\"](self.V)\n", " .view(n_ex, -1, self.n_heads, self.latent_dim)\n", " .transpose(1, 2)\n", " )\n", "\n", " self.Q_proj.retain_grad()\n", " self.K_proj.retain_grad()\n", " self.V_proj.retain_grad()\n", "\n", " # 2) Apply attention on all the projected vectors in batch.\n", " self.attn_out, self.attn = TorchSDPAttentionLayer().forward(\n", " self.Q_proj, self.K_proj, self.V_proj, mask=mask\n", " )\n", " self.attn.retain_grad()\n", " self.attn_out.retain_grad()\n", "\n", " # 3) \"Concat\" using a view and apply a final linear transformation\n", " self.attn_out_reshaped = (\n", " self.attn_out.transpose(1, 2)\n", " .contiguous()\n", " .view(n_ex, -1, self.n_heads * self.latent_dim)\n", " )\n", " self.attn_out_reshaped.retain_grad()\n", " print(self.attn_out_reshaped.shape)\n", " self.Y = 
self.projections[\"O\"](self.attn_out_reshaped)\n", " print(self.Y.shape)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, Q, K, V, mask=None):\n", " self.forward(Q, K, V, mask=mask)\n", " self.loss1 = self.Y.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"Q\": self.Q.detach().numpy(),\n", " \"K\": self.K.detach().numpy(),\n", " \"V\": self.V.detach().numpy(),\n", " \"O_W\": self.projections[\"O\"].weight.detach().numpy().T,\n", " \"V_W\": self.projections[\"V\"].weight.detach().numpy().T,\n", " \"K_W\": self.projections[\"K\"].weight.detach().numpy().T,\n", " \"Q_W\": self.projections[\"Q\"].weight.detach().numpy().T,\n", " \"O_b\": self.projections[\"O\"].bias.detach().numpy(),\n", " \"V_b\": self.projections[\"V\"].bias.detach().numpy(),\n", " \"K_b\": self.projections[\"K\"].bias.detach().numpy(),\n", " \"Q_b\": self.projections[\"Q\"].bias.detach().numpy(),\n", " \"latent_dim\": self.latent_dim,\n", " \"n_heads\": self.n_heads,\n", " \"Q_proj\": self.Q_proj.detach().numpy(), # .reshape(self.Q_proj.shape[0], -1),\n", " \"K_proj\": self.K_proj.detach().numpy(), # .reshape(self.K_proj.shape[0], -1),\n", " \"V_proj\": self.V_proj.detach().numpy(), # .reshape(self.V_proj.shape[0], -1),\n", " \"weights\": self.attn.detach().numpy(),\n", " \"attn_out\": self.attn_out_reshaped.detach().numpy(), # .squeeze(),\n", " # .reshape(self.attn_out_reshaped.shape[0], -1),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dO_W\": self.projections[\"O\"].weight.grad.numpy().T,\n", " \"dV_W\": self.projections[\"V\"].weight.grad.numpy().T,\n", " \"dK_W\": self.projections[\"K\"].weight.grad.numpy().T,\n", " \"dQ_W\": self.projections[\"Q\"].weight.grad.numpy().T,\n", " \"dO_b\": self.projections[\"O\"].bias.grad.numpy(),\n", " \"dV_b\": self.projections[\"V\"].bias.grad.numpy(),\n", " \"dK_b\": self.projections[\"K\"].bias.grad.numpy(),\n", " \"dQ_b\": self.projections[\"Q\"].bias.grad.numpy(),\n", " \"dLdy\": self.Y.grad.numpy(),\n", " \"dAttn_out\": self.attn_out_reshaped.grad.numpy(),\n", " \"dWeights\": self.attn.grad.numpy(),\n", " \"dQ_proj\": self.Q_proj.grad.numpy(),\n", " \"dK_proj\": self.K_proj.grad.numpy(),\n", " \"dV_proj\": self.V_proj.grad.numpy(),\n", " \"dQ\": self.Q.grad.numpy(),\n", " \"dK\": self.K.grad.numpy(),\n", " \"dV\": self.V.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "# #######################################################################\n", "# # TF WGAN GP Gold Standard Implementation #\n", "# # adapted from: https://github.com/igul222/improved_wgan_training/ #\n", "# #######################################################################\n", "\n", "# _params = {}\n", "# _param_aliases = {}\n", "\n", "\n", "# def param(name, *args, **kwargs):\n", "# \"\"\"\n", "# A wrapper for `tf.Variable` which enables parameter sharing in models.\n", "\n", "# Creates and returns theano shared variables similarly to `tf.Variable`,\n", "# except if you try to create a param with the same name as a\n", "# previously-created one, `param(...)` will just return the old one instead of\n", "# making a new one.\n", "\n", "# This constructor also adds a `param` attribute to the shared variables it\n", "# creates, so that you can easily search a graph for all params.\n", "# \"\"\"\n", "\n", "# if name not in _params:\n", "# kwargs[\"name\"] = name\n", "# param = tf.Variable(*args, **kwargs)\n", "# param.param = True\n", "# _params[name] = param\n", "# result = _params[name]\n", "# i = 0\n", "# while result in _param_aliases:\n", "# i += 1\n", "# result = 
_param_aliases[result]\n", "# return result\n", "\n", "\n", "# def params_with_name(name):\n", "# return [p for n, p in _params.items() if name in n]\n", "\n", "\n", "# def ReLULayer(name, n_in, n_out, inputs, w_initialization):\n", "# if isinstance(w_initialization, np.ndarray):\n", "# weight_values = w_initialization.astype(\"float32\")\n", "\n", "# W = param(name + \".W\", weight_values)\n", "# result = tf.matmul(inputs, W)\n", "# output = tf.nn.bias_add(\n", "# result, param(name + \".b\", np.zeros((n_out,), dtype=\"float32\"))\n", "# )\n", "# output = tf.nn.relu(output)\n", "# return output, W\n", "\n", "\n", "# def LinearLayer(name, n_in, n_out, inputs, w_initialization):\n", "# if isinstance(w_initialization, np.ndarray):\n", "# weight_values = w_initialization.astype(\"float32\")\n", "\n", "# W = param(name + \".W\", weight_values)\n", "# result = tf.matmul(inputs, W)\n", "# output = tf.nn.bias_add(\n", "# result, param(name + \".b\", np.zeros((n_out,), dtype=\"float32\"))\n", "# )\n", "# return output, W\n", "\n", "\n", "# def Generator(n_samples, X_real, params=None):\n", "# n_feats = 2\n", "# W1 = W2 = W3 = W4 = \"he\"\n", "# noise = tf.random.normal([n_samples, 2])\n", "# if params is not None:\n", "# noise = tf.convert_to_tensor(params[\"noise\"], dtype=\"float32\")\n", "# W1 = params[\"generator\"][\"FC1\"][\"W\"]\n", "# W2 = params[\"generator\"][\"FC2\"][\"W\"]\n", "# W3 = params[\"generator\"][\"FC3\"][\"W\"]\n", "# W4 = params[\"generator\"][\"FC4\"][\"W\"]\n", "# DIM = params[\"g_hidden\"]\n", "# n_feats = params[\"n_in\"]\n", "\n", "# outs = {}\n", "# weights = {}\n", "# output, W = ReLULayer(\"Generator.1\", n_feats, DIM, noise, w_initialization=W1)\n", "# outs[\"FC1\"] = output\n", "# weights[\"FC1\"] = W\n", "# output, W = ReLULayer(\"Generator.2\", DIM, DIM, output, w_initialization=W2)\n", "# outs[\"FC2\"] = output\n", "# weights[\"FC2\"] = W\n", "# output, W = ReLULayer(\"Generator.3\", DIM, DIM, output, w_initialization=W3)\n", "# outs[\"FC3\"] = output\n", "# weights[\"FC3\"] = W\n", "# output, W = LinearLayer(\"Generator.4\", DIM, n_feats, output, w_initialization=W4)\n", "# outs[\"FC4\"] = output\n", "# weights[\"FC4\"] = W\n", "# return output, outs, weights\n", "\n", "\n", "# def Discriminator(inputs, params=None):\n", "# n_feats = 2\n", "# W1 = W2 = W3 = W4 = \"he\"\n", "# if params is not None:\n", "# W1 = params[\"critic\"][\"FC1\"][\"W\"]\n", "# W2 = params[\"critic\"][\"FC2\"][\"W\"]\n", "# W3 = params[\"critic\"][\"FC3\"][\"W\"]\n", "# W4 = params[\"critic\"][\"FC4\"][\"W\"]\n", "# DIM = params[\"g_hidden\"]\n", "# n_feats = params[\"n_in\"]\n", "\n", "# outs = {}\n", "# weights = {}\n", "# output, W = ReLULayer(\"Discriminator.1\", n_feats, DIM, inputs, w_initialization=W1)\n", "# outs[\"FC1\"] = output\n", "# weights[\"FC1\"] = W\n", "\n", "# output, W = ReLULayer(\"Discriminator.2\", DIM, DIM, output, w_initialization=W2)\n", "# outs[\"FC2\"] = output\n", "# weights[\"FC2\"] = W\n", "\n", "# output, W = ReLULayer(\"Discriminator.3\", DIM, DIM, output, w_initialization=W3)\n", "# outs[\"FC3\"] = output\n", "# weights[\"FC3\"] = W\n", "\n", "# output, W = LinearLayer(\"Discriminator.4\", DIM, 1, output, w_initialization=W4)\n", "# outs[\"FC4\"] = output\n", "# weights[\"FC4\"] = W\n", "\n", "# # get bias\n", "# for var in params_with_name(\"Discriminator\"):\n", "# if \"1.b:\" in var.name:\n", "# weights[\"FC1_b\"] = var\n", "# elif \"2.b:\" in var.name:\n", "# weights[\"FC2_b\"] = var\n", "# elif \"3.b:\" in var.name:\n", "# weights[\"FC3_b\"] = 
var\n", "# elif \"4.b:\" in var.name:\n", "# weights[\"FC4_b\"] = var\n", "\n", "# return tf.reshape(output, [-1]), outs, weights\n", "\n", "\n", "# def WGAN_GP_tf(X, lambda_, params, batch_size):\n", "# tf.compat.v1.disable_eager_execution()\n", "\n", "# batch_size = X.shape[0]\n", "\n", "# # get alpha value\n", "# n_steps = params[\"n_steps\"]\n", "# c_updates_per_epoch = params[\"c_updates_per_epoch\"]\n", "# alpha = tf.convert_to_tensor(params[\"alpha\"], dtype=\"float32\")\n", "\n", "# X_real = tf.compat.v1.placeholder(tf.float32, shape=[None, params[\"n_in\"]])\n", "# X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params)\n", "\n", "# Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params)\n", "# Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params)\n", "\n", "# # WGAN loss\n", "# mean_fake = tf.reduce_mean(Y_fake)\n", "# mean_real = tf.reduce_mean(Y_real)\n", "\n", "# C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real)\n", "# G_loss = -tf.reduce_mean(Y_fake)\n", "\n", "# # WGAN gradient penalty\n", "# X_interp = alpha * X_real + ((1 - alpha) * X_fake)\n", "# Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params)\n", "# gradInterp = tf.gradients(Y_interp, [X_interp])[0]\n", "\n", "# norm_gradInterp = tf.sqrt(\n", "# tf.compat.v1.reduce_sum(tf.square(gradInterp), reduction_indices=[1])\n", "# )\n", "# gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2)\n", "# C_loss += lambda_ * gradient_penalty\n", "\n", "# # extract gradient of Y_interp wrt. each layer output in critic\n", "# C_bwd_Y_interp = {}\n", "# for k, v in C_out_Y_interp.items():\n", "# C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0]\n", "\n", "# C_bwd_W = {}\n", "# for k, v in C_Y_interp_weights.items():\n", "# C_bwd_W[k] = tf.gradients(C_loss, [v])[0]\n", "\n", "# # get gradients\n", "# dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0]\n", "# dC_Y_real = tf.gradients(C_loss, [Y_real])[0]\n", "# dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0]\n", "# dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0]\n", "\n", "# with tf.compat.v1.Session() as session:\n", "# session.run(tf.compat.v1.global_variables_initializer())\n", "\n", "# for iteration in range(n_steps):\n", "# # Train critic\n", "# for i in range(c_updates_per_epoch):\n", "# _data = X\n", "# (\n", "# _alpha,\n", "# _X_interp,\n", "# _Y_interp,\n", "# _gradInterp,\n", "# _norm_gradInterp,\n", "# _gradient_penalty,\n", "# _C_loss,\n", "# _X_fake,\n", "# _Y_fake,\n", "# _Y_real,\n", "# _dC_Y_fake,\n", "# _dC_Y_real,\n", "# _dC_gradInterp,\n", "# _dG_Y_fake,\n", "# _mean_fake,\n", "# _mean_real,\n", "# _G_weights_FC1,\n", "# _G_weights_FC2,\n", "# _G_weights_FC3,\n", "# _G_weights_FC4,\n", "# _G_fwd_X_fake_FC1,\n", "# _G_fwd_X_fake_FC2,\n", "# _G_fwd_X_fake_FC3,\n", "# _G_fwd_X_fake_FC4,\n", "# _C_weights_Y_fake_FC1,\n", "# _C_weights_Y_fake_FC2,\n", "# _C_weights_Y_fake_FC3,\n", "# _C_weights_Y_fake_FC4,\n", "# _C_fwd_Y_fake_FC1,\n", "# _C_fwd_Y_fake_FC2,\n", "# _C_fwd_Y_fake_FC3,\n", "# _C_fwd_Y_fake_FC4,\n", "# _C_weights_Y_real_FC1,\n", "# _C_weights_Y_real_FC2,\n", "# _C_weights_Y_real_FC3,\n", "# _C_weights_Y_real_FC4,\n", "# _C_fwd_Y_real_FC1,\n", "# _C_fwd_Y_real_FC2,\n", "# _C_fwd_Y_real_FC3,\n", "# _C_fwd_Y_real_FC4,\n", "# _C_weights_Y_interp_FC1,\n", "# _C_weights_Y_interp_FC2,\n", "# _C_weights_Y_interp_FC3,\n", "# _C_weights_Y_interp_FC4,\n", "# _C_dY_interp_wrt_FC1,\n", "# _C_dY_interp_wrt_FC2,\n", "# _C_dY_interp_wrt_FC3,\n", "# _C_dY_interp_wrt_FC4,\n", "# 
_C_fwd_Y_interp_FC1,\n", "# _C_fwd_Y_interp_FC2,\n", "# _C_fwd_Y_interp_FC3,\n", "# _C_fwd_Y_interp_FC4,\n", "# _C_dW_FC1,\n", "# _C_db_FC1,\n", "# _C_dW_FC2,\n", "# _C_db_FC2,\n", "# _C_dW_FC3,\n", "# _C_db_FC3,\n", "# _C_dW_FC4,\n", "# _C_db_FC4,\n", "# ) = session.run(\n", "# [\n", "# alpha,\n", "# X_interp,\n", "# Y_interp,\n", "# gradInterp,\n", "# norm_gradInterp,\n", "# gradient_penalty,\n", "# C_loss,\n", "# X_fake,\n", "# Y_fake,\n", "# Y_real,\n", "# dC_Y_fake,\n", "# dC_Y_real,\n", "# dC_gradInterp,\n", "# dG_Y_fake,\n", "# mean_fake,\n", "# mean_real,\n", "# G_weights[\"FC1\"],\n", "# G_weights[\"FC2\"],\n", "# G_weights[\"FC3\"],\n", "# G_weights[\"FC4\"],\n", "# G_out_X_fake[\"FC1\"],\n", "# G_out_X_fake[\"FC2\"],\n", "# G_out_X_fake[\"FC3\"],\n", "# G_out_X_fake[\"FC4\"],\n", "# C_Y_fake_weights[\"FC1\"],\n", "# C_Y_fake_weights[\"FC2\"],\n", "# C_Y_fake_weights[\"FC3\"],\n", "# C_Y_fake_weights[\"FC4\"],\n", "# C_out_Y_fake[\"FC1\"],\n", "# C_out_Y_fake[\"FC2\"],\n", "# C_out_Y_fake[\"FC3\"],\n", "# C_out_Y_fake[\"FC4\"],\n", "# C_Y_real_weights[\"FC1\"],\n", "# C_Y_real_weights[\"FC2\"],\n", "# C_Y_real_weights[\"FC3\"],\n", "# C_Y_real_weights[\"FC4\"],\n", "# C_out_Y_real[\"FC1\"],\n", "# C_out_Y_real[\"FC2\"],\n", "# C_out_Y_real[\"FC3\"],\n", "# C_out_Y_real[\"FC4\"],\n", "# C_Y_interp_weights[\"FC1\"],\n", "# C_Y_interp_weights[\"FC2\"],\n", "# C_Y_interp_weights[\"FC3\"],\n", "# C_Y_interp_weights[\"FC4\"],\n", "# C_bwd_Y_interp[\"FC1\"],\n", "# C_bwd_Y_interp[\"FC2\"],\n", "# C_bwd_Y_interp[\"FC3\"],\n", "# C_bwd_Y_interp[\"FC4\"],\n", "# C_out_Y_interp[\"FC1\"],\n", "# C_out_Y_interp[\"FC2\"],\n", "# C_out_Y_interp[\"FC3\"],\n", "# C_out_Y_interp[\"FC4\"],\n", "# C_bwd_W[\"FC1\"],\n", "# C_bwd_W[\"FC1_b\"],\n", "# C_bwd_W[\"FC2\"],\n", "# C_bwd_W[\"FC2_b\"],\n", "# C_bwd_W[\"FC3\"],\n", "# C_bwd_W[\"FC3_b\"],\n", "# C_bwd_W[\"FC4\"],\n", "# C_bwd_W[\"FC4_b\"],\n", "# ],\n", "# feed_dict={X_real: _data},\n", "# )\n", "\n", "# _G_loss = session.run(G_loss, feed_dict={X_real: _data})\n", "\n", "# grads = {\n", "# \"X_real\": _data,\n", "# \"X_interp\": _X_interp,\n", "# \"G_weights_FC1\": _G_weights_FC1,\n", "# \"G_weights_FC2\": _G_weights_FC2,\n", "# \"G_weights_FC3\": _G_weights_FC3,\n", "# \"G_weights_FC4\": _G_weights_FC4,\n", "# \"G_fwd_X_fake_FC1\": _G_fwd_X_fake_FC1,\n", "# \"G_fwd_X_fake_FC2\": _G_fwd_X_fake_FC2,\n", "# \"G_fwd_X_fake_FC3\": _G_fwd_X_fake_FC3,\n", "# \"G_fwd_X_fake_FC4\": _G_fwd_X_fake_FC4,\n", "# \"X_fake\": _X_fake,\n", "# \"C_weights_Y_fake_FC1\": _C_weights_Y_fake_FC1,\n", "# \"C_weights_Y_fake_FC2\": _C_weights_Y_fake_FC2,\n", "# \"C_weights_Y_fake_FC3\": _C_weights_Y_fake_FC3,\n", "# \"C_weights_Y_fake_FC4\": _C_weights_Y_fake_FC4,\n", "# \"C_fwd_Y_fake_FC1\": _C_fwd_Y_fake_FC1,\n", "# \"C_fwd_Y_fake_FC2\": _C_fwd_Y_fake_FC2,\n", "# \"C_fwd_Y_fake_FC3\": _C_fwd_Y_fake_FC3,\n", "# \"C_fwd_Y_fake_FC4\": _C_fwd_Y_fake_FC4,\n", "# \"Y_fake\": _Y_fake,\n", "# \"C_weights_Y_real_FC1\": _C_weights_Y_real_FC1,\n", "# \"C_weights_Y_real_FC2\": _C_weights_Y_real_FC2,\n", "# \"C_weights_Y_real_FC3\": _C_weights_Y_real_FC3,\n", "# \"C_weights_Y_real_FC4\": _C_weights_Y_real_FC4,\n", "# \"C_fwd_Y_real_FC1\": _C_fwd_Y_real_FC1,\n", "# \"C_fwd_Y_real_FC2\": _C_fwd_Y_real_FC2,\n", "# \"C_fwd_Y_real_FC3\": _C_fwd_Y_real_FC3,\n", "# \"C_fwd_Y_real_FC4\": _C_fwd_Y_real_FC4,\n", "# \"Y_real\": _Y_real,\n", "# \"C_weights_Y_interp_FC1\": _C_weights_Y_interp_FC1,\n", "# \"C_weights_Y_interp_FC2\": _C_weights_Y_interp_FC2,\n", "# \"C_weights_Y_interp_FC3\": 
_C_weights_Y_interp_FC3,\n", "# \"C_weights_Y_interp_FC4\": _C_weights_Y_interp_FC4,\n", "# \"C_fwd_Y_interp_FC1\": _C_fwd_Y_interp_FC1,\n", "# \"C_fwd_Y_interp_FC2\": _C_fwd_Y_interp_FC2,\n", "# \"C_fwd_Y_interp_FC3\": _C_fwd_Y_interp_FC3,\n", "# \"C_fwd_Y_interp_FC4\": _C_fwd_Y_interp_FC4,\n", "# \"Y_interp\": _Y_interp,\n", "# \"dY_interp_wrt_FC1\": _C_dY_interp_wrt_FC1,\n", "# \"dY_interp_wrt_FC2\": _C_dY_interp_wrt_FC2,\n", "# \"dY_interp_wrt_FC3\": _C_dY_interp_wrt_FC3,\n", "# \"dY_interp_wrt_FC4\": _C_dY_interp_wrt_FC4,\n", "# \"gradInterp\": _gradInterp,\n", "# \"gradInterp_norm\": _norm_gradInterp,\n", "# \"G_loss\": _G_loss,\n", "# \"C_loss\": _C_loss,\n", "# \"dC_loss_dW_FC1\": _C_dW_FC1,\n", "# \"dC_loss_db_FC1\": _C_db_FC1,\n", "# \"dC_loss_dW_FC2\": _C_dW_FC2,\n", "# \"dC_loss_db_FC2\": _C_db_FC2,\n", "# \"dC_loss_dW_FC3\": _C_dW_FC3,\n", "# \"dC_loss_db_FC3\": _C_db_FC3,\n", "# \"dC_loss_dW_FC4\": _C_dW_FC4,\n", "# \"dC_loss_db_FC4\": _C_db_FC4,\n", "# \"dC_Y_fake\": _dC_Y_fake,\n", "# \"dC_Y_real\": _dC_Y_real,\n", "# \"dC_gradInterp\": _dC_gradInterp,\n", "# \"dG_Y_fake\": _dG_Y_fake,\n", "# }\n", "# return grads\n", "\n", "\n", "# def TFNCELoss(X, target_word, L):\n", "# from tensorflow.python.ops.nn_impl import _compute_sampled_logits\n", "# from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits\n", "\n", "# tf.compat.v1.disable_eager_execution()\n", "\n", "# in_embed = tf.compat.v1.placeholder(tf.float32, shape=X.shape)\n", "# in_bias = tf.compat.v1.placeholder(\n", "# tf.float32, shape=L.parameters[\"b\"].flatten().shape\n", "# )\n", "# in_weights = tf.compat.v1.placeholder(tf.float32, shape=L.parameters[\"W\"].shape)\n", "# in_target_word = tf.compat.v1.placeholder(tf.int64)\n", "# in_neg_samples = tf.compat.v1.placeholder(tf.int32)\n", "# in_target_prob = tf.compat.v1.placeholder(tf.float32)\n", "# in_neg_samp_prob = tf.compat.v1.placeholder(tf.float32)\n", "\n", "# # in_embed = tf.keras.Input(dtype=tf.float32, shape=X.shape)\n", "# # in_bias = tf.keras.Input(dtype=tf.float32, shape=L.parameters[\"b\"].flatten().shape)\n", "# # in_weights = tf.keras.Input(dtype=tf.float32, shape=L.parameters[\"W\"].shape)\n", "# # in_target_word = tf.keras.Input(dtype=tf.int64, shape=())\n", "# # in_neg_samples = tf.keras.Input(dtype=tf.int32, shape=())\n", "# # in_target_prob = tf.keras.Input(dtype=tf.float32, shape=())\n", "# # in_neg_samp_prob = tf.keras.Input(dtype=tf.float32, shape=())\n", "\n", "# feed = {\n", "# in_embed: X,\n", "# in_weights: L.parameters[\"W\"],\n", "# in_target_word: target_word,\n", "# in_bias: L.parameters[\"b\"].flatten(),\n", "# in_neg_samples: L.derived_variables[\"noise_samples\"][0],\n", "# in_target_prob: L.derived_variables[\"noise_samples\"][1],\n", "# in_neg_samp_prob: L.derived_variables[\"noise_samples\"][2],\n", "# }\n", "\n", "# # Compute the NCE loss, using a sample of the negative labels each time.\n", "# nce_unreduced = tf.nn.nce_loss(\n", "# weights=in_weights,\n", "# biases=in_bias,\n", "# labels=in_target_word,\n", "# inputs=in_embed,\n", "# sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),\n", "# num_sampled=L.num_negative_samples,\n", "# num_classes=L.n_classes,\n", "# )\n", "\n", "# loss = tf.reduce_sum(nce_unreduced)\n", "# dLdW = tf.gradients(loss, [in_weights])[0]\n", "# dLdb = tf.gradients(loss, [in_bias])[0]\n", "# dLdX = tf.gradients(loss, [in_embed])[0]\n", "\n", "# sampled_logits, sampled_labels = _compute_sampled_logits(\n", "# weights=in_weights,\n", "# biases=in_bias,\n", "# 
labels=in_target_word,\n", "# inputs=in_embed,\n", "# sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),\n", "# num_sampled=L.num_negative_samples,\n", "# num_classes=L.n_classes,\n", "# num_true=1,\n", "# subtract_log_q=True,\n", "# )\n", "\n", "# sampled_losses = sigmoid_cross_entropy_with_logits(\n", "# labels=sampled_labels, logits=sampled_logits\n", "# )\n", "\n", "# with tf.compat.v1.Session() as session:\n", "# session.run(tf.compat.v1.global_variables_initializer())\n", "# (\n", "# _final_loss,\n", "# _nce_unreduced,\n", "# _dLdW,\n", "# _dLdb,\n", "# _dLdX,\n", "# _sampled_logits,\n", "# _sampled_labels,\n", "# _sampled_losses,\n", "# ) = session.run(\n", "# [\n", "# loss,\n", "# nce_unreduced,\n", "# dLdW,\n", "# dLdb,\n", "# dLdX,\n", "# sampled_logits,\n", "# sampled_labels,\n", "# sampled_losses,\n", "# ],\n", "# feed_dict=feed,\n", "# )\n", "# tf.compat.v1.reset_default_graph()\n", "# return {\n", "# \"final_loss\": _final_loss,\n", "# \"nce_unreduced\": _nce_unreduced,\n", "# \"dLdW\": _dLdW,\n", "# \"dLdb\": _dLdb,\n", "# \"dLdX\": _dLdX,\n", "# \"out_logits\": _sampled_logits,\n", "# \"out_labels\": _sampled_labels,\n", "# \"sampled_loss\": _sampled_losses,\n", "# }\n"]} {"path": "numpy_ml/tests/test_CrossEntropy.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Loss Functions #\n", 
"#######################################################################\n", "\n", "\n", "def test_cross_entropy(N=15):\n", " from numpy_ml.neural_nets.losses import CrossEntropy\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = CrossEntropy()\n", " gold = log_loss\n", "\n", " # ensure we get 0 when the two arrays are equal\n", " n_classes = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = y_pred = random_one_hot_matrix(n_examples, n_classes)\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred))\n", " print(\"PASSED\")\n", "\n", " # test on random inputs\n", " i = 1\n", " while i < N:\n", " n_classes = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = random_one_hot_matrix(n_examples, n_classes)\n", " y_pred = random_stochastic_matrix(n_examples, n_classes)\n", "\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred, normalize=False))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_cross_entropy_grad(N=15):\n", " from numpy_ml.neural_nets.losses import CrossEntropy\n", " from numpy_ml.neural_nets.layers import Softmax\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = CrossEntropy()\n", " gold = torch_xe_grad\n", " sm = Softmax()\n", "\n", " i = 1\n", " while i < N:\n", " n_classes = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", "\n", " y = random_one_hot_matrix(n_examples, n_classes)\n", "\n", " # the cross_entropy_gradient returns the gradient wrt. z (NOT softmax(z))\n", " z = random_tensor((n_examples, n_classes))\n", " y_pred = sm.forward(z)\n", "\n", " assert_almost_equal(mine.grad(y, y_pred), gold(y, z), decimal=5)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_GeneralizedLinearModel.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import statsmodels.api as sm\n", "from numpy_ml.linear_models import GeneralizedLinearModel\n", "from numpy_ml.linear_models.glm import _GLM_LINKS\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def test_glm(N=20):\n", " np.random.seed(12345)\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_samples = np.random.randint(10, 100)\n", "\n", " # n_feats << n_samples to avoid perfect separation / multiple solutions\n", " n_feats = np.random.randint(1, 1 + n_samples // 2)\n", " target_dim = 1\n", "\n", " fit_intercept = np.random.choice([True, False])\n", " _link = np.random.choice(list(_GLM_LINKS.keys()))\n", "\n", " families = {\n", " \"identity\": sm.families.Gaussian(),\n", " \"logit\": sm.families.Binomial(),\n", " \"log\": sm.families.Poisson(),\n", " }\n", "\n", " print(f\"Link: {_link}\")\n", " print(f\"Fit intercept: {fit_intercept}\")\n", "\n", " X = random_tensor((n_samples, n_feats), standardize=True)\n", " if _link == \"logit\":\n", " y = np.random.choice([0.0, 1.0], size=(n_samples, target_dim))\n", " elif _link == \"log\":\n", " y = np.random.choice(np.arange(0, 100), size=(n_samples, target_dim))\n", " elif _link == \"identity\":\n", " y = random_tensor((n_samples, target_dim), standardize=True)\n", " else:\n", " raise ValueError(f\"Unknown link function {_link}\")\n", "\n", " # Fit gold standard model on the entire dataset\n", " fam = families[_link]\n", " Xdesign = np.c_[np.ones(X.shape[0]), X] if fit_intercept else X\n", "\n", " glm_gold = sm.GLM(y, Xdesign, family=fam)\n", " glm_gold = glm_gold.fit()\n", "\n", " glm_mine = 
GeneralizedLinearModel(link=_link, fit_intercept=fit_intercept)\n", " glm_mine.fit(X, y)\n", "\n", " # check that model coefficients match\n", " beta = glm_mine.beta.T.ravel()\n", " np.testing.assert_almost_equal(beta, glm_gold.params, decimal=6)\n", " print(\"\\t1. Overall model coefficients match\")\n", "\n", " # check that model predictions match\n", " np.testing.assert_almost_equal(\n", " glm_mine.predict(X), glm_gold.predict(Xdesign), decimal=5\n", " )\n", " print(\"\\t2. Overall model predictions match\")\n", "\n", " print(\"\\tPASSED\\n\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_manhattan.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def test_manhattan(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = manhattan(x, y)\n", " theirs = scipy.spatial.distance.cityblock(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_DiGraph.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def from_networkx(G_nx):\n", " \"\"\"Convert a networkx graph to my graph representation\"\"\"\n", " V = list(G_nx.nodes)\n", " edges = list(G_nx.edges)\n", " is_weighted = \"weight\" in G_nx[edges[0][0]][edges[0][1]]\n", "\n", " E = []\n", " for e in edges:\n", " if is_weighted:\n", " E.append(Edge(e[0], e[1], G_nx[e[0]][e[1]][\"weight\"]))\n", " else:\n", " E.append(Edge(e[0], e[1]))\n", "\n", " return DiGraph(V, E) if nx.is_directed(G_nx) else UndirectedGraph(V, E)\n", "\n", "\n", "def to_networkx(G):\n", " \"\"\"Convert my graph representation to a networkx graph\"\"\"\n", " G_nx = nx.DiGraph() if G.is_directed else nx.Graph()\n", " V = list(G._V2I.keys())\n", " G_nx.add_nodes_from(V)\n", "\n", " for v in V:\n", " fr_i = G._V2I[v]\n", " edges = G._G[fr_i]\n", "\n", " for edge in edges:\n", " G_nx.add_edge(edge.fr, edge.to, weight=edge._w)\n", " return G_nx\n", "\n", 
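"\n", "# Illustrative round-trip sketch for the two helpers above (an added example,\n", "# not one of the tests; it assumes the sampled graph has at least one edge,\n", "# since from_networkx inspects edges[0] to detect whether edges are weighted):\n", "#\n", "#   G_nx = nx.gnp_random_graph(6, 0.5, directed=True, seed=0)\n", "#   G = from_networkx(G_nx)   # numpy_ml DiGraph with the same vertices/edges\n", "#   assert sorted(to_networkx(G).edges) == sorted(G_nx.edges)\n",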
"\n", "def test_random_DAG(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.uniform(0.25, 1)\n", " n_v = np.random.randint(5, 50)\n", "\n", " G = random_DAG(n_v, p)\n", " G_nx = to_networkx(G)\n", "\n", " assert nx.is_directed_acyclic_graph(G_nx)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_topological_ordering(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.uniform(0.25, 1)\n", " n_v = np.random.randint(5, 10)\n", "\n", " G = random_DAG(n_v, p)\n", " G_nx = to_networkx(G)\n", "\n", " if nx.is_directed_acyclic_graph(G_nx):\n", " topo_order = G.topological_ordering()\n", "\n", " # test topological order\n", " seen_it = set()\n", " for n_i in topo_order:\n", " seen_it.add(n_i)\n", " assert any([c_i in seen_it for c_i in G.get_neighbors(n_i)]) == False\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_is_acyclic(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.rand()\n", " directed = np.random.rand() < 0.5\n", " G = random_unweighted_graph(n_vertices=10, edge_prob=p, directed=True)\n", " G_nx = to_networkx(G)\n", "\n", " assert G.is_acyclic() == nx.is_directed_acyclic_graph(G_nx)\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/tests/test_hamming.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "\n", "def test_hamming(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = (np.random.rand(N) * 100).round().astype(int)\n", " y = (np.random.rand(N) * 100).round().astype(int)\n", " mine = hamming(x, y)\n", " theirs = scipy.spatial.distance.hamming(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/err_test_Deconv2D.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " 
TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "def test_Deconv2D(N=15):\n", " from numpy_ml.neural_nets.layers import Deconv2D\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_shape = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " p, s = np.random.randint(0, 5), np.random.randint(1, 3)\n", "\n", " out_rows = s * (in_rows - 1) - 2 * p + f_shape[0]\n", " out_cols = s * (in_cols - 1) - 2 * p + f_shape[1]\n", "\n", " if out_rows <= 0 or out_cols <= 0:\n", " continue\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Deconv2D layer\n", " L1 = Deconv2D(\n", " out_ch=n_out, kernel_shape=f_shape, act_fn=act_fn, pad=p, stride=s\n", " )\n", "\n", " # forward prop\n", " try:\n", " y_pred = L1.forward(X)\n", " except ValueError:\n", " print(\"Improper dimensions; retrying\")\n", " continue\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchDeconv2DLayer(\n", " n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (y_pred, \"y\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"pad={}, stride={}, f_shape={}, n_ex={}\".format(p, s, f_shape, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"out_rows={}, out_cols={}, 
n_out={}\".format(out_rows, out_cols, n_out))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_VAELoss.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Loss Functions #\n", "#######################################################################\n", "\n", "\n", "def test_VAE_loss(N=15):\n", " from numpy_ml.neural_nets.losses import VAELoss\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", " eps = np.finfo(float).eps\n", "\n", " i = 1\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " t_dim = np.random.randint(2, 10)\n", " t_mean = random_tensor([n_ex, t_dim], standardize=True)\n", " t_log_var = np.log(np.abs(random_tensor([n_ex, t_dim], standardize=True) + eps))\n", " im_cols, im_rows = np.random.randint(2, 40), np.random.randint(2, 40)\n", " X = np.random.rand(n_ex, im_rows * im_cols)\n", " X_recon = np.random.rand(n_ex, im_rows * im_cols)\n", "\n", " mine = VAELoss()\n", " mine_loss = mine(X, X_recon, t_mean, t_log_var)\n", " dX_recon, dLogVar, dMean = mine.grad(X, X_recon, t_mean, t_log_var)\n", " golds = TorchVAELoss().extract_grads(X, X_recon, t_mean, t_log_var)\n", "\n", " 
params = [\n", " (mine_loss, \"loss\"),\n", " (dX_recon, \"dX_recon\"),\n", " (dLogVar, \"dt_log_var\"),\n", " (dMean, \"dt_mean\"),\n", " ]\n", " print(\"\\nTrial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " rtol=0.1,\n", " atol=1e-2,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/tests/test_Sigmoid.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "def test_sigmoid_activation(N=50):\n", " from numpy_ml.neural_nets.activations import Sigmoid\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Sigmoid()\n", " gold = expit\n", "\n", " i = 0\n", " while i 
< N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((1, n_dims))\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_sigmoid_grad(N=50):\n", " from numpy_ml.neural_nets.activations import Sigmoid\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Sigmoid()\n", " gold = torch_gradient_generator(torch.sigmoid)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/tests/test_SkipConnectionIdentityModule.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " # TFNCELoss,\n", " # WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "\n", "\n", "def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):\n", " \"\"\"\n", " Manual gradient calc for vanilla RNN parameters\n", " \"\"\"\n", " if param_name in [\"Ba\", \"Bx\"]:\n", " param_name = param_name.lower()\n", " elif param_name in [\"X\", \"y\"]:\n", " return None\n", "\n", " param_orig = model.parameters[param_name]\n", " model.flush_gradients()\n", " grads = np.zeros_like(param_orig)\n", "\n", " for flat_ix, val in enumerate(param_orig.flat):\n", " param = deepcopy(param_orig)\n", " md_ix = np.unravel_index(flat_ix, param.shape)\n", "\n", " # plus\n", " y_preds_plus = []\n", " param[md_ix] = val + epsilon\n", " 
model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_plus = model.forward(X[:, :, t])\n", " y_preds_plus += [y_pred_plus]\n", " loss_plus = loss_func(y_preds_plus)\n", " model.flush_gradients()\n", "\n", " # minus\n", " y_preds_minus = []\n", " param[md_ix] = val - epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_minus = model.forward(X[:, :, t])\n", " y_preds_minus += [y_pred_minus]\n", " loss_minus = loss_func(y_preds_minus)\n", " model.flush_gradients()\n", "\n", " grad = (loss_plus - loss_minus) / (2 * epsilon)\n", " grads[md_ix] = grad\n", " return grads.T\n", "\n", "\n", "#######################################################################\n", "# Modules #\n", "#######################################################################\n", "\n", "\n", "\n", "def test_SkipConnectionIdentityModule(N=15):\n", " from numpy_ml.neural_nets.modules import SkipConnectionIdentityModule\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(2, 25)\n", " in_cols = np.random.randint(2, 25)\n", " n_in = np.random.randint(2, 5)\n", " n_out = n_in\n", " f_shape1 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " f_shape2 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " s1 = np.random.randint(1, 5)\n", " s2 = np.random.randint(1, 5)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " p1 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape1, s1)\n", " if p1[0] != p1[1] or p1[2] != p1[3]:\n", " continue\n", "\n", " p2 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape2, s2)\n", " if p2[0] != p2[1] or p2[2] != p2[3]:\n", " continue\n", "\n", " p1 = (p1[0], p1[2])\n", " p2 = (p2[0], p2[2])\n", "\n", " # initialize SkipConnectionIdentity module\n", " L1 = SkipConnectionIdentityModule(\n", " out_ch=n_out,\n", " kernel_shape1=f_shape1,\n", " kernel_shape2=f_shape2,\n", " stride1=s1,\n", " stride2=s2,\n", " act_fn=act_fn,\n", " epsilon=1e-5,\n", " momentum=0.9,\n", " )\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchSkipConnectionIdentity(\n", " torch_fn,\n", " p1,\n", " p2,\n", " L1.parameters,\n", " L1.hyperparameters,\n", " momentum=L1.momentum,\n", " epsilon=L1.epsilon,\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = L1.parameters[\"components\"]\n", " grads = L1.gradients[\"components\"]\n", " params = [\n", " (X, \"X\"),\n", " (params[\"conv1\"][\"W\"], \"conv1_W\"),\n", " (params[\"conv1\"][\"b\"], \"conv1_b\"),\n", " (params[\"batchnorm1\"][\"scaler\"].T, \"bn1_scaler\"),\n", " (params[\"batchnorm1\"][\"intercept\"], \"bn1_intercept\"),\n", " (params[\"batchnorm1\"][\"running_mean\"], \"bn1_running_mean\"),\n", " # (params[\"batchnorm1\"][\"running_var\"], 
\"bn1_running_var\"),\n", " (params[\"conv2\"][\"W\"], \"conv2_W\"),\n", " (params[\"conv2\"][\"b\"], \"conv2_b\"),\n", " (params[\"batchnorm2\"][\"scaler\"].T, \"bn2_scaler\"),\n", " (params[\"batchnorm2\"][\"intercept\"], \"bn2_intercept\"),\n", " (params[\"batchnorm2\"][\"running_mean\"], \"bn2_running_mean\"),\n", " # (params[\"batchnorm2\"][\"running_var\"], \"bn2_running_var\"),\n", " (L1._dv[\"conv1_out\"], \"act1_out\"),\n", " (L1._dv[\"batchnorm1_out\"], \"bn1_out\"),\n", " (L1._dv[\"conv2_out\"], \"conv2_out\"),\n", " (L1._dv[\"batchnorm2_out\"], \"bn2_out\"),\n", " (y_pred, \"Y\"),\n", " (dLdy, \"dLdY\"),\n", " (L1.derived_variables[\"dLdBn2\"], \"dLdBn2_out\"),\n", " (L1.derived_variables[\"dLdConv2\"], \"dLdConv2_out\"),\n", " (L1.derived_variables[\"dLdBn1\"], \"dLdBn1_out\"),\n", " (L1.derived_variables[\"dLdConv1\"], \"dLdActFn1_out\"),\n", " (dLdX, \"dLdX\"),\n", " (grads[\"batchnorm2\"][\"scaler\"].T, \"dLdBn2_scaler\"),\n", " (grads[\"batchnorm2\"][\"intercept\"], \"dLdBn2_intercept\"),\n", " (grads[\"conv2\"][\"W\"], \"dLdConv2_W\"),\n", " (grads[\"conv2\"][\"b\"], \"dLdConv2_b\"),\n", " (grads[\"batchnorm1\"][\"scaler\"].T, \"dLdBn1_scaler\"),\n", " (grads[\"batchnorm1\"][\"intercept\"], \"dLdBn1_intercept\"),\n", " (grads[\"conv1\"][\"W\"], \"dLdConv1_W\"),\n", " (grads[\"conv1\"][\"b\"], \"dLdConv1_b\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"act_fn={}, n_ex={}\".format(act_fn, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"pad1={}, stride1={}, f_shape1={}\".format(p1, s1, f_shape1))\n", " print(\"pad2={}, stride2={}, f_shape2={}\".format(p2, s2, f_shape2))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=2\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n"]} {"path": "numpy_ml/o_tests/test_utils.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import scipy\n", "import networkx as nx\n", "\n", "from sklearn.neighbors import BallTree as sk_BallTree\n", "from sklearn.metrics.pairwise import rbf_kernel as sk_rbf\n", "from sklearn.metrics.pairwise import linear_kernel as sk_linear\n", "from sklearn.metrics.pairwise import polynomial_kernel as sk_poly\n", "\n", "\n", "from numpy_ml.utils.distance_metrics import (\n", " hamming,\n", " euclidean,\n", " chebyshev,\n", " manhattan,\n", " minkowski,\n", ")\n", "from numpy_ml.utils.kernels import LinearKernel, PolynomialKernel, RBFKernel\n", "from numpy_ml.utils.data_structures import BallTree\n", "from numpy_ml.utils.graphs import (\n", " Edge,\n", " DiGraph,\n", " UndirectedGraph,\n", " random_DAG,\n", " random_unweighted_graph,\n", ")\n", "\n", "#######################################################################\n", "# Kernels #\n", "#######################################################################\n", "\n", "\n", "def test_linear_kernel(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " M = np.random.randint(1, 100)\n", " C = np.random.randint(1, 1000)\n", "\n", " X = np.random.rand(N, C)\n", " Y = np.random.rand(M, C)\n", "\n", " mine = LinearKernel()(X, Y)\n", " gold = sk_linear(X, Y)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_polynomial_kernel(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " M = 
np.random.randint(1, 100)\n", " C = np.random.randint(1, 1000)\n", " gamma = np.random.rand()\n", " d = np.random.randint(1, 5)\n", " c0 = np.random.rand()\n", "\n", " X = np.random.rand(N, C)\n", " Y = np.random.rand(M, C)\n", "\n", " mine = PolynomialKernel(gamma=gamma, d=d, c0=c0)(X, Y)\n", " gold = sk_poly(X, Y, gamma=gamma, degree=d, coef0=c0)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_radial_basis_kernel(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " M = np.random.randint(1, 100)\n", " C = np.random.randint(1, 1000)\n", " gamma = np.random.rand()\n", "\n", " X = np.random.rand(N, C)\n", " Y = np.random.rand(M, C)\n", "\n", " # sklearn (gamma) <-> mine (sigma) conversion:\n", " # gamma = 1 / (2 * sigma^2)\n", " # sigma = np.sqrt(1 / 2 * gamma)\n", "\n", " mine = RBFKernel(sigma=np.sqrt(1 / (2 * gamma)))(X, Y)\n", " gold = sk_rbf(X, Y, gamma=gamma)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Distance Metrics #\n", "#######################################################################\n", "\n", "\n", "def test_euclidean(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = euclidean(x, y)\n", " theirs = scipy.spatial.distance.euclidean(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_hamming(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = (np.random.rand(N) * 100).round().astype(int)\n", " y = (np.random.rand(N) * 100).round().astype(int)\n", " mine = hamming(x, y)\n", " theirs = scipy.spatial.distance.hamming(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_minkowski(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " p = 1 + np.random.rand() * 10\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = minkowski(x, y, p)\n", " theirs = scipy.spatial.distance.minkowski(x, y, p)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_chebyshev(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = chebyshev(x, y)\n", " theirs = scipy.spatial.distance.chebyshev(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_manhattan(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(1, 100)\n", " x = np.random.rand(N)\n", " y = np.random.rand(N)\n", " mine = manhattan(x, y)\n", " theirs = scipy.spatial.distance.cityblock(x, y)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Data Structures #\n", "#######################################################################\n", "\n", "\n", "def test_ball_tree(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 
100)\n", " k = np.random.randint(1, N)\n", " ls = np.min([np.random.randint(1, 10), N - 1])\n", "\n", " X = np.random.rand(N, M)\n", " BT = BallTree(leaf_size=ls, metric=euclidean)\n", " BT.fit(X)\n", "\n", " x = np.random.rand(M)\n", " mine = BT.nearest_neighbors(k, x)\n", " assert len(mine) == k\n", "\n", " mine_neighb = np.array([n.key for n in mine])\n", " mine_dist = np.array([n.distance for n in mine])\n", "\n", " sort_ix = np.argsort(mine_dist)\n", " mine_dist = mine_dist[sort_ix]\n", " mine_neighb = mine_neighb[sort_ix]\n", "\n", " sk = sk_BallTree(X, leaf_size=ls)\n", " theirs_dist, ind = sk.query(x.reshape(1, -1), k=k)\n", " sort_ix = np.argsort(theirs_dist.flatten())\n", "\n", " theirs_dist = theirs_dist.flatten()[sort_ix]\n", " theirs_neighb = X[ind.flatten()[sort_ix]]\n", "\n", " for j in range(len(theirs_dist)):\n", " np.testing.assert_almost_equal(mine_neighb[j], theirs_neighb[j])\n", " np.testing.assert_almost_equal(mine_dist[j], theirs_dist[j])\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Graphs #\n", "#######################################################################\n", "\n", "\n", "def from_networkx(G_nx):\n", " \"\"\"Convert a networkx graph to my graph representation\"\"\"\n", " V = list(G_nx.nodes)\n", " edges = list(G_nx.edges)\n", " is_weighted = \"weight\" in G_nx[edges[0][0]][edges[0][1]]\n", "\n", " E = []\n", " for e in edges:\n", " if is_weighted:\n", " E.append(Edge(e[0], e[1], G_nx[e[0]][e[1]][\"weight\"]))\n", " else:\n", " E.append(Edge(e[0], e[1]))\n", "\n", " return DiGraph(V, E) if nx.is_directed(G_nx) else UndirectedGraph(V, E)\n", "\n", "\n", "def to_networkx(G):\n", " \"\"\"Convert my graph representation to a networkx graph\"\"\"\n", " G_nx = nx.DiGraph() if G.is_directed else nx.Graph()\n", " V = list(G._V2I.keys())\n", " G_nx.add_nodes_from(V)\n", "\n", " for v in V:\n", " fr_i = G._V2I[v]\n", " edges = G._G[fr_i]\n", "\n", " for edge in edges:\n", " G_nx.add_edge(edge.fr, edge.to, weight=edge._w)\n", " return G_nx\n", "\n", "\n", "def test_all_paths(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.rand()\n", " directed = np.random.rand() < 0.5\n", " G = random_unweighted_graph(n_vertices=5, edge_prob=p, directed=directed)\n", "\n", " nodes = G._I2V.keys()\n", " G_nx = to_networkx(G)\n", "\n", " # for each graph, test all_paths for all pairs of start and end\n", " # vertices. 
note that graph is not guaranteed to be connected, so many\n", " # paths will be empty\n", " for s_i in nodes:\n", " for e_i in nodes:\n", " if s_i == e_i:\n", " continue\n", "\n", " paths = G.all_paths(s_i, e_i)\n", " paths_nx = nx.all_simple_paths(G_nx, source=s_i, target=e_i, cutoff=10)\n", "\n", " paths = sorted(paths)\n", " paths_nx = sorted(list(paths_nx))\n", "\n", " for p1, p2 in zip(paths, paths_nx):\n", " np.testing.assert_array_equal(p1, p2)\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_random_DAG(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.uniform(0.25, 1)\n", " n_v = np.random.randint(5, 50)\n", "\n", " G = random_DAG(n_v, p)\n", " G_nx = to_networkx(G)\n", "\n", " assert nx.is_directed_acyclic_graph(G_nx)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_topological_ordering(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.uniform(0.25, 1)\n", " n_v = np.random.randint(5, 10)\n", "\n", " G = random_DAG(n_v, p)\n", " G_nx = to_networkx(G)\n", "\n", " if nx.is_directed_acyclic_graph(G_nx):\n", " topo_order = G.topological_ordering()\n", "\n", " # test topological order\n", " seen_it = set()\n", " for n_i in topo_order:\n", " seen_it.add(n_i)\n", " assert any([c_i in seen_it for c_i in G.get_neighbors(n_i)]) == False\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_is_acyclic(N=1):\n", " np.random.seed(12345)\n", " i = 0\n", " while i < N:\n", " p = np.random.rand()\n", " directed = np.random.rand() < 0.5\n", " G = random_unweighted_graph(n_vertices=10, edge_prob=p, directed=True)\n", " G_nx = to_networkx(G)\n", "\n", " assert G.is_acyclic() == nx.is_directed_acyclic_graph(G_nx)\n", " print(\"PASSED\")\n", " i += 1"]} {"path": "numpy_ml/o_tests/test_nonparametric.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier\n", "from sklearn.gaussian_process import GaussianProcessRegressor\n", "\n", "from numpy_ml.nonparametric.knn import KNN\n", "from numpy_ml.nonparametric.gp import GPRegression\n", "from numpy_ml.utils.distance_metrics import euclidean\n", "\n", "\n", "def test_knn_regression(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " k = np.random.randint(1, N)\n", " ls = np.min([np.random.randint(1, 10), N - 1])\n", " weights = np.random.choice([\"uniform\", \"distance\"])\n", "\n", " X = np.random.rand(N, M)\n", " X_test = np.random.rand(N, M)\n", " y = np.random.rand(N)\n", "\n", " knn = KNN(\n", " k=k, leaf_size=ls, metric=euclidean, classifier=False, weights=weights\n", " )\n", " knn.fit(X, y)\n", " preds = knn.predict(X_test)\n", "\n", " gold = KNeighborsRegressor(\n", " p=2,\n", " leaf_size=ls,\n", " n_neighbors=k,\n", " weights=weights,\n", " metric=\"minkowski\",\n", " algorithm=\"ball_tree\",\n", " )\n", " gold.fit(X, y)\n", " gold_preds = gold.predict(X_test)\n", "\n", " for mine, theirs in zip(preds, gold_preds):\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_knn_clf(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " k = np.random.randint(1, N)\n", " n_classes = np.random.randint(2, 10)\n", " ls = np.min([np.random.randint(1, 10), N - 1])\n", " weights = \"uniform\"\n", "\n", " X = 
np.random.rand(N, M)\n", " X_test = np.random.rand(N, M)\n", " y = np.random.randint(0, n_classes, size=N)\n", "\n", " knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True, weights=weights)\n", " knn.fit(X, y)\n", " preds = knn.predict(X_test)\n", "\n", " gold = KNeighborsClassifier(\n", " p=2,\n", " metric=\"minkowski\",\n", " leaf_size=ls,\n", " n_neighbors=k,\n", " weights=weights,\n", " algorithm=\"ball_tree\",\n", " )\n", " gold.fit(X, y)\n", " gold_preds = gold.predict(X_test)\n", "\n", " for mine, theirs in zip(preds, gold_preds):\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_gp_regression(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " alpha = np.random.rand()\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " K = np.random.randint(1, N)\n", " J = np.random.randint(1, 3)\n", "\n", " X = np.random.rand(N, M)\n", " y = np.random.rand(N, J)\n", " X_test = np.random.rand(K, M)\n", "\n", " gp = GPRegression(kernel=\"RBFKernel(sigma=1)\", alpha=alpha)\n", " gold = GaussianProcessRegressor(\n", " kernel=None, alpha=alpha, optimizer=None, normalize_y=False\n", " )\n", "\n", " gp.fit(X, y)\n", " gold.fit(X, y)\n", "\n", " preds, _ = gp.predict(X_test)\n", " gold_preds = gold.predict(X_test)\n", " np.testing.assert_almost_equal(preds.reshape(-1), gold_preds.reshape(-1))\n", "\n", " mll = gp.marginal_log_likelihood().reshape(-1)\n", " gold_mll = gold.log_marginal_likelihood().reshape(-1)\n", " np.testing.assert_almost_equal(mll, gold_mll)\n", "\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/test_linear_regression.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.linear_model import LinearRegression as LinearRegressionGold\n", "\n", "from numpy_ml.linear_models import LinearRegression\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def test_linear_regression(N=10):\n", " np.random.seed(12345)\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " train_samples = np.random.randint(2, 30)\n", " update_samples = np.random.randint(1, 30)\n", " n_samples = train_samples + update_samples\n", "\n", " # ensure n_feats < train_samples, otherwise multiple solutions are\n", " # possible\n", " n_feats = np.random.randint(1, train_samples)\n", " target_dim = np.random.randint(1, 10)\n", "\n", " fit_intercept = np.random.choice([True, False])\n", "\n", " X = random_tensor((n_samples, n_feats), standardize=True)\n", " y = random_tensor((n_samples, target_dim), standardize=True)\n", "\n", " weighted = np.random.choice([True, False])\n", " weights = np.random.rand(n_samples) if weighted else np.ones(n_samples)\n", "\n", " X_train, X_update = X[:train_samples], X[train_samples:]\n", " y_train, y_update = y[:train_samples], y[train_samples:]\n", " w_train, w_update = weights[:train_samples], weights[train_samples:]\n", "\n", " print(f\"Weights: {weighted}\")\n", " print(f\"Fit intercept: {fit_intercept}\")\n", "\n", " # Fit gold standard model on the entire dataset\n", " lr_gold = LinearRegressionGold(fit_intercept=fit_intercept, normalize=False)\n", " lr_gold.fit(X, y, sample_weight=weights)\n", "\n", " lr_mine = LinearRegression(fit_intercept=fit_intercept)\n", " lr_mine.fit(X, y, weights=weights)\n", "\n", " # check that model predictions match\n", " np.testing.assert_almost_equal(\n", " lr_mine.predict(X), lr_gold.predict(X), decimal=5\n", " )\n", " print(\"\\t1. 
Overall model predictions match\")\n", "\n", " # check that model coefficients match\n", " beta = lr_mine.beta.T[:, 1:] if fit_intercept else lr_mine.beta.T\n", " np.testing.assert_almost_equal(beta, lr_gold.coef_, decimal=6)\n", " print(\"\\t2. Overall model coefficients match\")\n", "\n", " # Fit our model on just (X_train, y_train)...\n", " lr = LinearRegression(fit_intercept=fit_intercept)\n", " lr.fit(X_train, y_train, weights=w_train)\n", "\n", " do_single_sample_update = np.random.choice([True, False])\n", "\n", " # ...then update our model on the examples (X_update, y_update)\n", " if do_single_sample_update:\n", " for x_new, y_new, w_new in zip(X_update, y_update, w_update):\n", " lr.update(x_new, y_new, w_new)\n", " else:\n", " lr.update(X_update, y_update, w_update)\n", "\n", " # check that model predictions match\n", " np.testing.assert_almost_equal(lr.predict(X), lr_gold.predict(X), decimal=5)\n", " print(\"\\t3. Iterative model predictions match\")\n", "\n", " # check that model coefficients match\n", " beta = lr.beta.T[:, 1:] if fit_intercept else lr.beta.T\n", " np.testing.assert_almost_equal(beta, lr_gold.coef_, decimal=6)\n", " print(\"\\t4. Iterative model coefficients match\")\n", "\n", " print(\"\\tPASSED\\n\")\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/test_glm.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import statsmodels.api as sm\n", "from numpy_ml.linear_models import GeneralizedLinearModel\n", "from numpy_ml.linear_models.glm import _GLM_LINKS\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def test_glm(N=20):\n", " np.random.seed(12345)\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_samples = np.random.randint(10, 100)\n", "\n", " # n_feats << n_samples to avoid perfect separation / multiple solutions\n", " n_feats = np.random.randint(1, 1 + n_samples // 2)\n", " target_dim = 1\n", "\n", " fit_intercept = np.random.choice([True, False])\n", " _link = np.random.choice(list(_GLM_LINKS.keys()))\n", "\n", " families = {\n", " \"identity\": sm.families.Gaussian(),\n", " \"logit\": sm.families.Binomial(),\n", " \"log\": sm.families.Poisson(),\n", " }\n", "\n", " print(f\"Link: {_link}\")\n", " print(f\"Fit intercept: {fit_intercept}\")\n", "\n", " X = random_tensor((n_samples, n_feats), standardize=True)\n", " if _link == \"logit\":\n", " y = np.random.choice([0.0, 1.0], size=(n_samples, target_dim))\n", " elif _link == \"log\":\n", " y = np.random.choice(np.arange(0, 100), size=(n_samples, target_dim))\n", " elif _link == \"identity\":\n", " y = random_tensor((n_samples, target_dim), standardize=True)\n", " else:\n", " raise ValueError(f\"Unknown link function {_link}\")\n", "\n", " # Fit gold standard model on the entire dataset\n", " fam = families[_link]\n", " Xdesign = np.c_[np.ones(X.shape[0]), X] if fit_intercept else X\n", "\n", " glm_gold = sm.GLM(y, Xdesign, family=fam)\n", " glm_gold = glm_gold.fit()\n", "\n", " glm_mine = GeneralizedLinearModel(link=_link, fit_intercept=fit_intercept)\n", " glm_mine.fit(X, y)\n", "\n", " # check that model coefficients match\n", " beta = glm_mine.beta.T.ravel()\n", " np.testing.assert_almost_equal(beta, glm_gold.params, decimal=6)\n", " print(\"\\t1. Overall model coefficients match\")\n", "\n", " # check that model predictions match\n", " np.testing.assert_almost_equal(\n", " glm_mine.predict(X), glm_gold.predict(Xdesign), decimal=5\n", " )\n", " print(\"\\t2. 
Overall model predictions match\")\n", "\n", " print(\"\\tPASSED\\n\")\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/__init__.py", "content": ["\"\"\"Unit tests for various numpy-ml modules\"\"\"\n"]} {"path": "numpy_ml/o_tests/test_nn.py", "content": ["# flake8: noqa\n", "import time\n", "from copy import deepcopy\n", "\n", "import numpy as np\n", "from numpy.testing import assert_almost_equal\n", "\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "\n", "# for testing sigmoid\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.neural_nets.utils import (\n", " calc_pad_dims_2D,\n", " conv2D_naive,\n", " conv2D,\n", " pad2D,\n", " pad1D,\n", ")\n", "from numpy_ml.utils.testing import (\n", " random_one_hot_matrix,\n", " random_stochastic_matrix,\n", " random_tensor,\n", ")\n", "\n", "from .nn_torch_models import (\n", " TFNCELoss,\n", " WGAN_GP_tf,\n", " torch_xe_grad,\n", " torch_mse_grad,\n", " TorchVAELoss,\n", " TorchFCLayer,\n", " TorchRNNCell,\n", " TorchLSTMCell,\n", " TorchAddLayer,\n", " TorchWGANGPLoss,\n", " TorchConv1DLayer,\n", " TorchConv2DLayer,\n", " TorchPool2DLayer,\n", " TorchWavenetModule,\n", " TorchMultiplyLayer,\n", " TorchDeconv2DLayer,\n", " TorchLayerNormLayer,\n", " TorchBatchNormLayer,\n", " TorchEmbeddingLayer,\n", " TorchLinearActivation,\n", " TorchSDPAttentionLayer,\n", " TorchBidirectionalLSTM,\n", " torch_gradient_generator,\n", " TorchSkipConnectionConv,\n", " TorchSkipConnectionIdentity,\n", " TorchMultiHeadedAttentionModule,\n", ")\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Loss Functions #\n", "#######################################################################\n", "\n", "\n", "def test_squared_error(N=15):\n", " from numpy_ml.neural_nets.losses import SquaredError\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SquaredError()\n", " gold = (\n", " lambda y, y_pred: mean_squared_error(y, y_pred)\n", " * y_pred.shape[0]\n", " * y_pred.shape[1]\n", " * 0.5\n", " )\n", "\n", " # ensure we get 0 when the two arrays are equal\n", " n_dims = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = y_pred = random_tensor((n_examples, n_dims))\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred))\n", " print(\"PASSED\")\n", "\n", " i = 1\n", " while i < N:\n", " n_dims = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = random_tensor((n_examples, n_dims))\n", " y_pred = random_tensor((n_examples, n_dims))\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred), decimal=5)\n", " print(\"PASSED\")\n", " i 
+= 1\n", "\n", "\n", "def test_cross_entropy(N=15):\n", " from numpy_ml.neural_nets.losses import CrossEntropy\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = CrossEntropy()\n", " gold = log_loss\n", "\n", " # ensure we get 0 when the two arrays are equal\n", " n_classes = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = y_pred = random_one_hot_matrix(n_examples, n_classes)\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred))\n", " print(\"PASSED\")\n", "\n", " # test on random inputs\n", " i = 1\n", " while i < N:\n", " n_classes = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = random_one_hot_matrix(n_examples, n_classes)\n", " y_pred = random_stochastic_matrix(n_examples, n_classes)\n", "\n", " assert_almost_equal(mine.loss(y, y_pred), gold(y, y_pred, normalize=False))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_VAE_loss(N=15):\n", " from numpy_ml.neural_nets.losses import VAELoss\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", " eps = np.finfo(float).eps\n", "\n", " i = 1\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " t_dim = np.random.randint(2, 10)\n", " t_mean = random_tensor([n_ex, t_dim], standardize=True)\n", " t_log_var = np.log(np.abs(random_tensor([n_ex, t_dim], standardize=True) + eps))\n", " im_cols, im_rows = np.random.randint(2, 40), np.random.randint(2, 40)\n", " X = np.random.rand(n_ex, im_rows * im_cols)\n", " X_recon = np.random.rand(n_ex, im_rows * im_cols)\n", "\n", " mine = VAELoss()\n", " mine_loss = mine(X, X_recon, t_mean, t_log_var)\n", " dX_recon, dLogVar, dMean = mine.grad(X, X_recon, t_mean, t_log_var)\n", " golds = TorchVAELoss().extract_grads(X, X_recon, t_mean, t_log_var)\n", "\n", " params = [\n", " (mine_loss, \"loss\"),\n", " (dX_recon, \"dX_recon\"),\n", " (dLogVar, \"dt_log_var\"),\n", " (dMean, \"dt_mean\"),\n", " ]\n", " print(\"\\nTrial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " rtol=0.1,\n", " atol=1e-2,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_WGAN_GP_loss(N=5):\n", " from numpy_ml.neural_nets.losses import WGAN_GPLoss\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N:\n", " lambda_ = np.random.randint(0, 10)\n", " n_ex = np.random.randint(1, 10)\n", " n_feats = np.random.randint(2, 10)\n", " Y_real = random_tensor([n_ex], standardize=True)\n", " Y_fake = random_tensor([n_ex], standardize=True)\n", " gradInterp = random_tensor([n_ex, n_feats], standardize=True)\n", "\n", " mine = WGAN_GPLoss(lambda_=lambda_)\n", " C_loss = mine(Y_fake, \"C\", Y_real, gradInterp)\n", " G_loss = mine(Y_fake, \"G\")\n", "\n", " C_dY_fake, dY_real, dGradInterp = mine.grad(Y_fake, \"C\", Y_real, gradInterp)\n", " G_dY_fake = mine.grad(Y_fake, \"G\")\n", "\n", " golds = TorchWGANGPLoss(lambda_).extract_grads(Y_real, Y_fake, gradInterp)\n", " if np.isnan(golds[\"C_dGradInterp\"]).any():\n", " continue\n", "\n", " params = [\n", " (Y_real, \"Y_real\"),\n", " (Y_fake, \"Y_fake\"),\n", " (gradInterp, \"gradInterp\"),\n", " (C_loss, \"C_loss\"),\n", " (G_loss, \"G_loss\"),\n", " (-dY_real, \"C_dY_real\"),\n", " (-C_dY_fake, \"C_dY_fake\"),\n", " (dGradInterp, \"C_dGradInterp\"),\n", " (G_dY_fake, \"G_dY_fake\"),\n", " ]\n", "\n", " 
print(\"\\nTrial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " rtol=0.1,\n", " atol=1e-2,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_NCELoss(N=1):\n", " from numpy_ml.neural_nets.losses import NCELoss\n", " from numpy_ml.utils.data_structures import DiscreteSampler\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_c = np.random.randint(1, 10)\n", " n_out = np.random.randint(1, 300)\n", " vocab_size = np.random.randint(200, 1000)\n", " num_negative_samples = np.random.randint(1, 10)\n", "\n", " embeddings = random_tensor((n_ex, n_c, n_out), standardize=True)\n", " target = np.random.randint(0, vocab_size, (n_ex, 1))\n", "\n", " probs = np.random.rand(vocab_size)\n", " probs /= probs.sum()\n", "\n", " D = DiscreteSampler(probs, log=False, with_replacement=False)\n", " NCE = NCELoss(vocab_size, D, num_negative_samples)\n", " my_loss, _ = NCE(embeddings, target.flatten())\n", "\n", " my_dLdX = NCE.grad(update_params=False)\n", " my_dLdW = NCE.gradients[\"W\"]\n", " my_dLdb = NCE.gradients[\"b\"]\n", "\n", " NCE.gradients[\"W\"] = np.zeros_like(NCE.parameters[\"W\"])\n", " NCE.gradients[\"b\"] = np.zeros_like(NCE.parameters[\"b\"])\n", "\n", " MY_final_loss, TF_final_loss = 0, 0\n", " MY_dLdX, TF_dLdX = np.zeros_like(embeddings), np.zeros_like(embeddings)\n", " TF_dLdW, TF_dLdb = (\n", " np.zeros_like(NCE.parameters[\"W\"]),\n", " np.zeros_like(NCE.parameters[\"b\"]),\n", " )\n", "\n", " # XXX: instead of calculating the tf NCE on the entire batch, we\n", " # calculate it per-example and then sum. this is really lame and should\n", " # be changed to operate on batches.\n", " nv = NCE.derived_variables[\"noise_samples\"][0]\n", " for ix, emb in enumerate(embeddings):\n", " sv = (nv[0], np.array([nv[1][0, ix]]), nv[2])\n", "\n", " NCE.X = []\n", " for k, v in NCE.derived_variables.items():\n", " NCE.derived_variables[k] = []\n", "\n", " for k, v in NCE.gradients.items():\n", " NCE.gradients[k] = np.zeros_like(v)\n", "\n", " my, _ = NCE(emb[None, :, :], target[ix], neg_samples=sv[0])\n", "\n", " NCE.derived_variables[\"noise_samples\"] = [sv]\n", " dldx = NCE.grad(update_params=False)\n", " NCE.derived_variables[\"noise_samples\"] = sv\n", "\n", " MY_final_loss += my\n", " MY_dLdX[ix, ...] += np.squeeze(dldx, axis=0)\n", "\n", " TF_dict = TFNCELoss(emb, np.array([target[ix]]), NCE)\n", "\n", " TF_loss = TF_dict[\"final_loss\"]\n", " TF_final_loss += TF_loss\n", " TF_dLdX[ix, ...] 
+= TF_dict[\"dLdX\"]\n", " TF_dLdW[TF_dict[\"dLdW\"].indices, :] += TF_dict[\"dLdW\"].values\n", " TF_dLdb[:, TF_dict[\"dLdb\"].indices] += TF_dict[\"dLdb\"].values\n", "\n", " tf_dw = np.zeros_like(NCE.gradients[\"W\"])\n", " tf_dw[TF_dict[\"dLdW\"].indices, :] += TF_dict[\"dLdW\"].values\n", "\n", " tf_db = np.zeros_like(NCE.gradients[\"b\"])\n", " tf_db[:, TF_dict[\"dLdb\"].indices] += TF_dict[\"dLdb\"].values\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " np.testing.assert_almost_equal(my_loss, TF_final_loss, decimal=3)\n", " print(\"PASSED: final loss\")\n", "\n", " maps = [\n", " (\"dLdW\", my_dLdW, TF_dLdW),\n", " (\"dLdb\", my_dLdb, TF_dLdb),\n", " (\"dLdX\", my_dLdX, TF_dLdX),\n", " ]\n", " for (ll, k1, k2) in maps:\n", " np.testing.assert_almost_equal(k1, k2, decimal=2, err_msg=ll)\n", " print(\"PASSED: {}\".format(ll))\n", "\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Loss Function Gradients #\n", "#######################################################################\n", "\n", "\n", "def test_squared_error_grad(N=15):\n", " from numpy_ml.neural_nets.losses import SquaredError\n", " from numpy_ml.neural_nets.activations import Tanh\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SquaredError()\n", " gold = torch_mse_grad\n", " act = Tanh()\n", "\n", " i = 1\n", " while i < N:\n", " n_dims = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", " y = random_tensor((n_examples, n_dims))\n", "\n", " # raw inputs\n", " z = random_tensor((n_examples, n_dims))\n", " y_pred = act.fn(z)\n", "\n", " assert_almost_equal(\n", " mine.grad(y, y_pred, z, act), 0.5 * gold(y, z, torch.tanh), decimal=4\n", " )\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_cross_entropy_grad(N=15):\n", " from numpy_ml.neural_nets.losses import CrossEntropy\n", " from numpy_ml.neural_nets.layers import Softmax\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = CrossEntropy()\n", " gold = torch_xe_grad\n", " sm = Softmax()\n", "\n", " i = 1\n", " while i < N:\n", " n_classes = np.random.randint(2, 100)\n", " n_examples = np.random.randint(1, 1000)\n", "\n", " y = random_one_hot_matrix(n_examples, n_classes)\n", "\n", " # the cross_entropy_gradient returns the gradient wrt. 
z (NOT softmax(z))\n", " z = random_tensor((n_examples, n_classes))\n", " y_pred = sm.forward(z)\n", "\n", " assert_almost_equal(mine.grad(y, y_pred), gold(y, z), decimal=5)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "def test_sigmoid_activation(N=15):\n", " from numpy_ml.neural_nets.activations import Sigmoid\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Sigmoid()\n", " gold = expit\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((1, n_dims))\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_elu_activation(N=15):\n", " from numpy_ml.neural_nets.activations import ELU\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 10)\n", " z = random_tensor((1, n_dims))\n", "\n", " alpha = np.random.uniform(0, 10)\n", "\n", " mine = ELU(alpha)\n", " gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy()\n", "\n", " assert_almost_equal(mine.fn(z), gold(z, alpha))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_softmax_activation(N=15):\n", " from numpy_ml.neural_nets.layers import Softmax\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Softmax()\n", " gold = lambda z: F.softmax(torch.FloatTensor(z), dim=1).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.forward(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_relu_activation(N=15):\n", " from numpy_ml.neural_nets.activations import ReLU\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = ReLU()\n", " gold = lambda z: F.relu(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_softplus_activation(N=15):\n", " from numpy_ml.neural_nets.activations import SoftPlus\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SoftPlus()\n", " gold = lambda z: F.softplus(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Activation Gradients #\n", "#######################################################################\n", "\n", "\n", "def test_sigmoid_grad(N=15):\n", " from numpy_ml.neural_nets.activations import Sigmoid\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Sigmoid()\n", " gold = torch_gradient_generator(torch.sigmoid)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def 
test_elu_grad(N=15):\n", " from numpy_ml.neural_nets.activations import ELU\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " n_dims = np.random.randint(1, 10)\n", " alpha = np.random.uniform(0, 10)\n", " z = random_tensor((n_ex, n_dims))\n", "\n", " mine = ELU(alpha)\n", " gold = torch_gradient_generator(F.elu, alpha=alpha)\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=5)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_tanh_grad(N=15):\n", " from numpy_ml.neural_nets.activations import Tanh\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Tanh()\n", " gold = torch_gradient_generator(torch.tanh)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_relu_grad(N=15):\n", " from numpy_ml.neural_nets.activations import ReLU\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = ReLU()\n", " gold = torch_gradient_generator(F.relu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_softmax_grad(N=15):\n", " from numpy_ml.neural_nets.layers import Softmax\n", " from functools import partial\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", " p_soft = partial(F.softmax, dim=1)\n", " gold = torch_gradient_generator(p_soft)\n", "\n", " i = 0\n", " while i < N:\n", " mine = Softmax()\n", " n_ex = np.random.randint(1, 3)\n", " n_dims = np.random.randint(1, 50)\n", " z = random_tensor((n_ex, n_dims), standardize=True)\n", " out = mine.forward(z)\n", "\n", " assert_almost_equal(\n", " gold(z),\n", " mine.backward(np.ones_like(out)),\n", " err_msg=\"Theirs:\\n{}\\n\\nMine:\\n{}\\n\".format(\n", " gold(z), mine.backward(np.ones_like(out))\n", " ),\n", " decimal=3,\n", " )\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_softplus_grad(N=15):\n", " from numpy_ml.neural_nets.activations import SoftPlus\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SoftPlus()\n", " gold = torch_gradient_generator(F.softplus)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims), standardize=True)\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Layers #\n", "#######################################################################\n", "\n", "\n", "def test_FullyConnected(N=15):\n", " from numpy_ml.neural_nets.layers import FullyConnected\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = 
np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " n_out = np.random.randint(1, 100)\n", " X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize FC layer\n", " L1 = FullyConnected(n_out=n_out, act_fn=act_fn)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchFCLayer(n_in, n_out, torch_fn, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"].T, \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (dLdy, \"dLdy\"),\n", " (L1.gradients[\"W\"].T, \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\\nact_fn={}\".format(i, act_fn_name))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_Embedding(N=15):\n", " from numpy_ml.neural_nets.layers import Embedding\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " vocab_size = np.random.randint(1, 2000)\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " emb_dim = np.random.randint(1, 100)\n", "\n", " X = np.random.randint(0, vocab_size, (n_ex, n_in))\n", "\n", " # initialize Embedding layer\n", " L1 = Embedding(n_out=emb_dim, vocab_size=vocab_size)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " # dLdX = L1.backward(dLdy)\n", " L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchEmbeddingLayer(vocab_size, emb_dim, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (dLdy, \"dLdy\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " # (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_BatchNorm1D(N=15):\n", " from numpy_ml.neural_nets.layers import BatchNorm1D\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 1000)\n", " n_in = np.random.randint(1, 1000)\n", " X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", " # initialize BatchNorm1D layer\n", " L1 = BatchNorm1D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchBatchNormLayer(\n", " n_in, L1.parameters, \"1D\", epsilon=L1.epsilon, momentum=L1.momentum\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"scaler\"].T, \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", 
" (L1.parameters[\"running_mean\"], \"running_mean\"),\n", " # (L1.parameters[\"running_var\"], \"running_var\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_LayerNorm1D(N=15):\n", " from numpy_ml.neural_nets.layers import LayerNorm1D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 1000)\n", " n_in = np.random.randint(1, 1000)\n", " X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", " # initialize BatchNorm1D layer\n", " L1 = LayerNorm1D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchLayerNormLayer(n_in, L1.parameters, \"1D\", epsilon=L1.epsilon)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"scaler\"].T, \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_LayerNorm2D(N=15):\n", " from numpy_ml.neural_nets.layers import LayerNorm2D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 3)\n", "\n", " # initialize LayerNorm2D layer\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", " L1 = LayerNorm2D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # standard sum loss\n", " dLdy = np.ones_like(X)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchLayerNormLayer(\n", " [n_in, in_rows, in_cols], L1.parameters, mode=\"2D\", epsilon=L1.epsilon\n", " )\n", " golds = gold_mod.extract_grads(X, Y_true=None)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (L1.hyperparameters[\"epsilon\"], \"epsilon\"),\n", " (L1.parameters[\"scaler\"], \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", " (y_pred, \"y\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", "\n", " i += 1\n", "\n", "\n", "def test_MultiplyLayer(N=15):\n", " from numpy_ml.neural_nets.layers import Multiply\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", 
"\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " Xs = []\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " n_entries = np.random.randint(2, 5)\n", " for _ in range(n_entries):\n", " Xs.append(random_tensor((n_ex, n_in), standardize=True))\n", "\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Add layer\n", " L1 = Multiply(act_fn)\n", "\n", " # forward prop\n", " y_pred = L1.forward(Xs)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdXs = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchMultiplyLayer(torch_fn)\n", " golds = gold_mod.extract_grads(Xs)\n", "\n", " params = [(Xs, \"Xs\"), (y_pred, \"Y\")]\n", " params.extend(\n", " [(dldxi, \"dLdX{}\".format(i + 1)) for i, dldxi in enumerate(dLdXs)]\n", " )\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"n_ex={}, n_in={}\".format(n_ex, n_in))\n", " print(\"n_entries={}, act_fn={}\".format(n_entries, str(act_fn)))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_AddLayer(N=15):\n", " from numpy_ml.neural_nets.layers import Add\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " Xs = []\n", " n_ex = np.random.randint(1, 100)\n", " n_in = np.random.randint(1, 100)\n", " n_entries = np.random.randint(2, 5)\n", " for _ in range(n_entries):\n", " Xs.append(random_tensor((n_ex, n_in), standardize=True))\n", "\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Add layer\n", " L1 = Add(act_fn)\n", "\n", " # forward prop\n", " y_pred = L1.forward(Xs)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdXs = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchAddLayer(torch_fn)\n", " golds = gold_mod.extract_grads(Xs)\n", "\n", " params = [(Xs, \"Xs\"), (y_pred, \"Y\")]\n", " params.extend(\n", " [(dldxi, \"dLdX{}\".format(i + 1)) for i, dldxi in enumerate(dLdXs)]\n", " )\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"n_ex={}, n_in={}\".format(n_ex, n_in))\n", " print(\"n_entries={}, act_fn={}\".format(n_entries, str(act_fn)))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=1\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_BatchNorm2D(N=15):\n", " from numpy_ml.neural_nets.layers import BatchNorm2D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 3)\n", "\n", " # initialize BatchNorm2D 
layer\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", " L1 = BatchNorm2D()\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # standard sum loss\n", " dLdy = np.ones_like(X)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchBatchNormLayer(\n", " n_in, L1.parameters, mode=\"2D\", epsilon=L1.epsilon, momentum=L1.momentum\n", " )\n", " golds = gold_mod.extract_grads(X, Y_true=None)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (L1.hyperparameters[\"momentum\"], \"momentum\"),\n", " (L1.hyperparameters[\"epsilon\"], \"epsilon\"),\n", " (L1.parameters[\"scaler\"].T, \"scaler\"),\n", " (L1.parameters[\"intercept\"], \"intercept\"),\n", " (L1.parameters[\"running_mean\"], \"running_mean\"),\n", " # (L1.parameters[\"running_var\"], \"running_var\"),\n", " (y_pred, \"y\"),\n", " (L1.gradients[\"scaler\"], \"dLdScaler\"),\n", " (L1.gradients[\"intercept\"], \"dLdIntercept\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", "\n", " i += 1\n", "\n", "\n", "def test_RNNCell(N=15):\n", " from numpy_ml.neural_nets.layers import RNNCell\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 10)\n", " n_out = np.random.randint(1, 10)\n", " n_t = np.random.randint(1, 10)\n", " X = random_tensor((n_ex, n_in, n_t), standardize=True)\n", "\n", " # initialize RNN layer\n", " L1 = RNNCell(n_out=n_out)\n", "\n", " # forward prop\n", " y_preds = []\n", " for t in range(n_t):\n", " y_pred = L1.forward(X[:, :, t])\n", " y_preds += [y_pred]\n", "\n", " # backprop\n", " dLdX = []\n", " dLdAt = np.ones_like(y_preds[t])\n", " for t in reversed(range(n_t)):\n", " dLdXt = L1.backward(dLdAt)\n", " dLdX.insert(0, dLdXt)\n", " dLdX = np.dstack(dLdX)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchRNNCell(n_in, n_out, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (X, \"X\"),\n", " (np.array(y_preds), \"y\"),\n", " (L1.parameters[\"ba\"].T, \"ba\"),\n", " (L1.parameters[\"bx\"].T, \"bx\"),\n", " (L1.parameters[\"Wax\"].T, \"Wax\"),\n", " (L1.parameters[\"Waa\"].T, \"Waa\"),\n", " (L1.gradients[\"ba\"].T, \"dLdBa\"),\n", " (L1.gradients[\"bx\"].T, \"dLdBx\"),\n", " (L1.gradients[\"Wax\"].T, \"dLdWax\"),\n", " (L1.gradients[\"Waa\"].T, \"dLdWaa\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Trial {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " atol=1e-3,\n", " rtol=1e-3,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_Conv2D(N=15):\n", " from numpy_ml.neural_nets.layers import Conv2D\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " in_rows = 
np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_shape = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " p, s = np.random.randint(0, 5), np.random.randint(1, 3)\n", " d = np.random.randint(0, 5)\n", "\n", " fr, fc = f_shape[0] * (d + 1) - d, f_shape[1] * (d + 1) - d\n", " out_rows = int(1 + (in_rows + 2 * p - fr) / s)\n", " out_cols = int(1 + (in_cols + 2 * p - fc) / s)\n", "\n", " if out_rows <= 0 or out_cols <= 0:\n", " continue\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Conv2D layer\n", " L1 = Conv2D(\n", " out_ch=n_out,\n", " kernel_shape=f_shape,\n", " act_fn=act_fn,\n", " pad=p,\n", " stride=s,\n", " dilation=d,\n", " )\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchConv2DLayer(\n", " n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"pad={}, stride={}, f_shape={}, n_ex={}\".format(p, s, f_shape, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"out_rows={}, out_cols={}, n_out={}\".format(out_rows, out_cols, n_out))\n", " print(\"dilation={}\".format(d))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_DPAttention(N=15):\n", " from numpy_ml.neural_nets.layers import DotProductAttention\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " d_k = np.random.randint(1, 100)\n", " d_v = np.random.randint(1, 100)\n", "\n", " Q = random_tensor((n_ex, d_k), standardize=True)\n", " K = random_tensor((n_ex, d_k), standardize=True)\n", " V = random_tensor((n_ex, d_v), standardize=True)\n", "\n", " # initialize DotProductAttention layer\n", " mine = DotProductAttention(scale=True, dropout_p=0)\n", "\n", " # forward prop\n", " y_pred = mine.forward(Q, K, V)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdQ, dLdK, dLdV = mine.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchSDPAttentionLayer()\n", " golds = gold_mod.extract_grads(Q, K, V)\n", "\n", " params = [\n", " (mine.X[0][0], \"Q\"),\n", " (mine.X[0][1], \"K\"),\n", " (mine.X[0][2], \"V\"),\n", " (y_pred, \"Y\"),\n", " (dLdV, \"dLdV\"),\n", " (dLdK, \"dLdK\"),\n", " (dLdQ, \"dLdQ\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"n_ex={} d_k={} d_v={}\".format(n_ex, d_k, d_v))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", 
" i += 1\n", "\n", "\n", "def test_Conv1D(N=15):\n", " from numpy_ml.neural_nets.layers import Conv1D\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " l_in = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_width = min(l_in, np.random.randint(1, 5))\n", " p, s = np.random.randint(0, 5), np.random.randint(1, 3)\n", " d = np.random.randint(0, 5)\n", "\n", " fc = f_width * (d + 1) - d\n", " l_out = int(1 + (l_in + 2 * p - fc) / s)\n", "\n", " if l_out <= 0:\n", " continue\n", "\n", " X = random_tensor((n_ex, l_in, n_in), standardize=True)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Conv2D layer\n", " L1 = Conv1D(\n", " out_ch=n_out,\n", " kernel_width=f_width,\n", " act_fn=act_fn,\n", " pad=p,\n", " stride=s,\n", " dilation=d,\n", " )\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchConv1DLayer(\n", " n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (y_pred, \"y\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"pad={}, stride={}, f_width={}, n_ex={}\".format(p, s, f_width, n_ex))\n", " print(\"l_in={}, n_in={}\".format(l_in, n_in))\n", " print(\"l_out={}, n_out={}\".format(l_out, n_out))\n", " print(\"dilation={}\".format(d))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_Deconv2D(N=15):\n", " from numpy_ml.neural_nets.layers import Deconv2D\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_shape = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " p, s = np.random.randint(0, 5), np.random.randint(1, 3)\n", "\n", " out_rows = s * (in_rows - 1) - 2 * p + f_shape[0]\n", " out_cols = s * (in_cols - 1) - 2 * p + f_shape[1]\n", "\n", " if out_rows <= 0 or out_cols <= 0:\n", " continue\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " # randomly select an activation 
function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " # initialize Deconv2D layer\n", " L1 = Deconv2D(\n", " out_ch=n_out, kernel_shape=f_shape, act_fn=act_fn, pad=p, stride=s\n", " )\n", "\n", " # forward prop\n", " try:\n", " y_pred = L1.forward(X)\n", " except ValueError:\n", " print(\"Improper dimensions; retrying\")\n", " continue\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchDeconv2DLayer(\n", " n_in, n_out, torch_fn, L1.parameters, L1.hyperparameters\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (L1.X[0], \"X\"),\n", " (L1.parameters[\"W\"], \"W\"),\n", " (L1.parameters[\"b\"], \"b\"),\n", " (y_pred, \"y\"),\n", " (L1.gradients[\"W\"], \"dLdW\"),\n", " (L1.gradients[\"b\"], \"dLdB\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"pad={}, stride={}, f_shape={}, n_ex={}\".format(p, s, f_shape, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"out_rows={}, out_cols={}, n_out={}\".format(out_rows, out_cols, n_out))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_Pool2D(N=15):\n", " from numpy_ml.neural_nets.layers import Pool2D\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " in_rows = np.random.randint(1, 10)\n", " in_cols = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 3)\n", " f_shape = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " p, s = np.random.randint(0, max(1, min(f_shape) // 2)), np.random.randint(1, 3)\n", " # mode = [\"max\", \"average\"][np.random.randint(0, 2)]\n", " mode = \"average\"\n", " out_rows = int(1 + (in_rows + 2 * p - f_shape[0]) / s)\n", " out_cols = int(1 + (in_cols + 2 * p - f_shape[1]) / s)\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", " print(\"\\nmode: {}\".format(mode))\n", " print(\"pad={}, stride={}, f_shape={}, n_ex={}\".format(p, s, f_shape, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"out_rows={}, out_cols={}, n_out={}\".format(out_rows, out_cols, n_in))\n", "\n", " # initialize Pool2D layer\n", " L1 = Pool2D(kernel_shape=f_shape, pad=p, stride=s, mode=mode)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchPool2DLayer(n_in, L1.hyperparameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [(L1.X[0], \"X\"), (y_pred, \"y\"), (dLdX, \"dLdX\")]\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_LSTMCell(N=15):\n", " from numpy_ml.neural_nets.layers import LSTMCell\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 10)\n", " 
n_out = np.random.randint(1, 10)\n", " n_t = np.random.randint(1, 10)\n", " X = random_tensor((n_ex, n_in, n_t), standardize=True)\n", "\n", " # initialize LSTM layer\n", " L1 = LSTMCell(n_out=n_out)\n", "\n", " # forward prop\n", " Cs = []\n", " y_preds = []\n", " for t in range(n_t):\n", " y_pred, Ct = L1.forward(X[:, :, t])\n", " y_preds.append(y_pred)\n", " Cs.append(Ct)\n", "\n", " # backprop\n", " dLdX = []\n", " dLdAt = np.ones_like(y_preds[t])\n", " for t in reversed(range(n_t)):\n", " dLdXt = L1.backward(dLdAt)\n", " dLdX.insert(0, dLdXt)\n", " dLdX = np.dstack(dLdX)\n", " y_preds = np.dstack(y_preds)\n", " Cs = np.array(Cs)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchLSTMCell(n_in, n_out, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = [\n", " (X, \"X\"),\n", " (np.array(Cs), \"C\"),\n", " (y_preds, \"y\"),\n", " (L1.parameters[\"bo\"].T, \"bo\"),\n", " (L1.parameters[\"bu\"].T, \"bu\"),\n", " (L1.parameters[\"bf\"].T, \"bf\"),\n", " (L1.parameters[\"bc\"].T, \"bc\"),\n", " (L1.parameters[\"Wo\"], \"Wo\"),\n", " (L1.parameters[\"Wu\"], \"Wu\"),\n", " (L1.parameters[\"Wf\"], \"Wf\"),\n", " (L1.parameters[\"Wc\"], \"Wc\"),\n", " (L1.gradients[\"bo\"].T, \"dLdBo\"),\n", " (L1.gradients[\"bu\"].T, \"dLdBu\"),\n", " (L1.gradients[\"bf\"].T, \"dLdBf\"),\n", " (L1.gradients[\"bc\"].T, \"dLdBc\"),\n", " (L1.gradients[\"Wo\"], \"dLdWo\"),\n", " (L1.gradients[\"Wu\"], \"dLdWu\"),\n", " (L1.gradients[\"Wf\"], \"dLdWf\"),\n", " (L1.gradients[\"Wc\"], \"dLdWc\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Case {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " atol=1e-4,\n", " rtol=1e-4,\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def grad_check_RNN(model, loss_func, param_name, n_t, X, epsilon=1e-7):\n", " \"\"\"\n", " Manual gradient calc for vanilla RNN parameters\n", " \"\"\"\n", " if param_name in [\"Ba\", \"Bx\"]:\n", " param_name = param_name.lower()\n", " elif param_name in [\"X\", \"y\"]:\n", " return None\n", "\n", " param_orig = model.parameters[param_name]\n", " model.flush_gradients()\n", " grads = np.zeros_like(param_orig)\n", "\n", " for flat_ix, val in enumerate(param_orig.flat):\n", " param = deepcopy(param_orig)\n", " md_ix = np.unravel_index(flat_ix, param.shape)\n", "\n", " # plus\n", " y_preds_plus = []\n", " param[md_ix] = val + epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_plus = model.forward(X[:, :, t])\n", " y_preds_plus += [y_pred_plus]\n", " loss_plus = loss_func(y_preds_plus)\n", " model.flush_gradients()\n", "\n", " # minus\n", " y_preds_minus = []\n", " param[md_ix] = val - epsilon\n", " model.parameters[param_name] = param\n", " for t in range(n_t):\n", " y_pred_minus = model.forward(X[:, :, t])\n", " y_preds_minus += [y_pred_minus]\n", " loss_minus = loss_func(y_preds_minus)\n", " model.flush_gradients()\n", "\n", " grad = (loss_plus - loss_minus) / (2 * epsilon)\n", " grads[md_ix] = grad\n", " return grads.T\n", "\n", "\n", "#######################################################################\n", "# Modules #\n", "#######################################################################\n", "\n", "\n", "def test_MultiHeadedAttentionModule(N=15):\n", " from numpy_ml.neural_nets.modules import MultiHeadedAttentionModule\n", "\n", " N = np.inf if N is None else N\n", " 
np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " latent_dim = np.random.randint(1, 20)\n", " n_heads = np.random.randint(2, 10)\n", " d_k = d_v = n_heads * latent_dim\n", "\n", " Q = random_tensor((n_ex, d_k), standardize=True)\n", " K = random_tensor((n_ex, d_k), standardize=True)\n", " V = random_tensor((n_ex, d_v), standardize=True)\n", "\n", " mine = MultiHeadedAttentionModule(n_heads=n_heads, dropout_p=0)\n", "\n", " # forward prop\n", " y_pred = mine.forward(Q, K, V)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdQ, dLdK, dLdV = mine.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " params = mine.parameters\n", " hparams = mine.hyperparameters\n", " gold_mod = TorchMultiHeadedAttentionModule(params, hparams)\n", " golds = gold_mod.extract_grads(Q, K, V)\n", "\n", " dv = mine.derived_variables\n", " params = mine.parameters[\"components\"]\n", " grads = mine.gradients[\"components\"]\n", " params = [\n", " (Q, \"Q\"),\n", " (K, \"K\"),\n", " (V, \"V\"),\n", " (mine.n_heads, \"n_heads\"),\n", " (mine.latent_dim, \"latent_dim\"),\n", " (params[\"O\"][\"W\"], \"O_W\"),\n", " (params[\"K\"][\"W\"], \"K_W\"),\n", " (params[\"V\"][\"W\"], \"V_W\"),\n", " (params[\"Q\"][\"W\"], \"Q_W\"),\n", " (params[\"O\"][\"b\"], \"O_b\"),\n", " (params[\"K\"][\"b\"], \"K_b\"),\n", " (params[\"V\"][\"b\"], \"V_b\"),\n", " (params[\"Q\"][\"b\"], \"Q_b\"),\n", " (dv[\"Q_proj\"], \"Q_proj\"),\n", " (dv[\"K_proj\"], \"K_proj\"),\n", " (dv[\"V_proj\"], \"V_proj\"),\n", " (dv[\"attention_weights\"][0], \"weights\"),\n", " (dv[\"attention_out\"], \"attn_out\"),\n", " (y_pred, \"Y\"),\n", " (dLdy, \"dLdy\"),\n", " (dv[\"dQ_proj\"], \"dQ_proj\"),\n", " (dv[\"dK_proj\"], \"dK_proj\"),\n", " (dv[\"dV_proj\"], \"dV_proj\"),\n", " (grads[\"O\"][\"W\"], \"dO_W\"),\n", " (grads[\"V\"][\"W\"], \"dV_W\"),\n", " (grads[\"K\"][\"W\"], \"dK_W\"),\n", " (grads[\"Q\"][\"W\"], \"dQ_W\"),\n", " (grads[\"O\"][\"b\"], \"dO_b\"),\n", " (grads[\"V\"][\"b\"], \"dV_b\"),\n", " (grads[\"K\"][\"b\"], \"dK_b\"),\n", " (grads[\"Q\"][\"b\"], \"dQ_b\"),\n", " (dLdQ, \"dQ\"),\n", " (dLdK, \"dK\"),\n", " (dLdV, \"dV\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\n", " \"n_ex={} d_k=d_v={} latent_dim={} n_heads={}\".format(\n", " n_ex, d_k, latent_dim, n_heads\n", " )\n", " )\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_SkipConnectionIdentityModule(N=15):\n", " from numpy_ml.neural_nets.modules import SkipConnectionIdentityModule\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(2, 25)\n", " in_cols = np.random.randint(2, 25)\n", " n_in = np.random.randint(2, 5)\n", " n_out = n_in\n", " f_shape1 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " f_shape2 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " s1 = 
np.random.randint(1, 5)\n", " s2 = np.random.randint(1, 5)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " p1 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape1, s1)\n", " if p1[0] != p1[1] or p1[2] != p1[3]:\n", " continue\n", "\n", " p2 = calc_pad_dims_2D(X.shape, X.shape[1:3], f_shape2, s2)\n", " if p2[0] != p2[1] or p2[2] != p2[3]:\n", " continue\n", "\n", " p1 = (p1[0], p1[2])\n", " p2 = (p2[0], p2[2])\n", "\n", " # initialize SkipConnectionIdentity module\n", " L1 = SkipConnectionIdentityModule(\n", " out_ch=n_out,\n", " kernel_shape1=f_shape1,\n", " kernel_shape2=f_shape2,\n", " stride1=s1,\n", " stride2=s2,\n", " act_fn=act_fn,\n", " epsilon=1e-5,\n", " momentum=0.9,\n", " )\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchSkipConnectionIdentity(\n", " torch_fn,\n", " p1,\n", " p2,\n", " L1.parameters,\n", " L1.hyperparameters,\n", " momentum=L1.momentum,\n", " epsilon=L1.epsilon,\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = L1.parameters[\"components\"]\n", " grads = L1.gradients[\"components\"]\n", " params = [\n", " (X, \"X\"),\n", " (params[\"conv1\"][\"W\"], \"conv1_W\"),\n", " (params[\"conv1\"][\"b\"], \"conv1_b\"),\n", " (params[\"batchnorm1\"][\"scaler\"].T, \"bn1_scaler\"),\n", " (params[\"batchnorm1\"][\"intercept\"], \"bn1_intercept\"),\n", " (params[\"batchnorm1\"][\"running_mean\"], \"bn1_running_mean\"),\n", " # (params[\"batchnorm1\"][\"running_var\"], \"bn1_running_var\"),\n", " (params[\"conv2\"][\"W\"], \"conv2_W\"),\n", " (params[\"conv2\"][\"b\"], \"conv2_b\"),\n", " (params[\"batchnorm2\"][\"scaler\"].T, \"bn2_scaler\"),\n", " (params[\"batchnorm2\"][\"intercept\"], \"bn2_intercept\"),\n", " (params[\"batchnorm2\"][\"running_mean\"], \"bn2_running_mean\"),\n", " # (params[\"batchnorm2\"][\"running_var\"], \"bn2_running_var\"),\n", " (L1._dv[\"conv1_out\"], \"act1_out\"),\n", " (L1._dv[\"batchnorm1_out\"], \"bn1_out\"),\n", " (L1._dv[\"conv2_out\"], \"conv2_out\"),\n", " (L1._dv[\"batchnorm2_out\"], \"bn2_out\"),\n", " (y_pred, \"Y\"),\n", " (dLdy, \"dLdY\"),\n", " (L1.derived_variables[\"dLdBn2\"], \"dLdBn2_out\"),\n", " (L1.derived_variables[\"dLdConv2\"], \"dLdConv2_out\"),\n", " (L1.derived_variables[\"dLdBn1\"], \"dLdBn1_out\"),\n", " (L1.derived_variables[\"dLdConv1\"], \"dLdActFn1_out\"),\n", " (dLdX, \"dLdX\"),\n", " (grads[\"batchnorm2\"][\"scaler\"].T, \"dLdBn2_scaler\"),\n", " (grads[\"batchnorm2\"][\"intercept\"], \"dLdBn2_intercept\"),\n", " (grads[\"conv2\"][\"W\"], \"dLdConv2_W\"),\n", " (grads[\"conv2\"][\"b\"], \"dLdConv2_b\"),\n", " (grads[\"batchnorm1\"][\"scaler\"].T, \"dLdBn1_scaler\"),\n", " (grads[\"batchnorm1\"][\"intercept\"], \"dLdBn1_intercept\"),\n", " (grads[\"conv1\"][\"W\"], \"dLdConv1_W\"),\n", " (grads[\"conv1\"][\"b\"], \"dLdConv1_b\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"act_fn={}, n_ex={}\".format(act_fn, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"pad1={}, stride1={}, f_shape1={}\".format(p1, s1, f_shape1))\n", " print(\"pad2={}, stride2={}, f_shape2={}\".format(p2, s2, f_shape2))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], 
err_msg=err_fmt(params, golds, ix), decimal=2\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_SkipConnectionConvModule(N=15):\n", " from numpy_ml.neural_nets.modules import SkipConnectionConvModule\n", " from numpy_ml.neural_nets.activations import Tanh, ReLU, Sigmoid, Affine\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " acts = [\n", " (Tanh(), nn.Tanh(), \"Tanh\"),\n", " (Sigmoid(), nn.Sigmoid(), \"Sigmoid\"),\n", " (ReLU(), nn.ReLU(), \"ReLU\"),\n", " (Affine(), TorchLinearActivation(), \"Affine\"),\n", " ]\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(2, 10)\n", " in_rows = np.random.randint(2, 10)\n", " in_cols = np.random.randint(2, 10)\n", " n_in = np.random.randint(2, 5)\n", " n_out1 = np.random.randint(2, 5)\n", " n_out2 = np.random.randint(2, 5)\n", " f_shape1 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " f_shape2 = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", " f_shape_skip = (\n", " min(in_rows, np.random.randint(1, 5)),\n", " min(in_cols, np.random.randint(1, 5)),\n", " )\n", "\n", " s1 = np.random.randint(1, 5)\n", " s2 = np.random.randint(1, 5)\n", " s_skip = np.random.randint(1, 5)\n", "\n", " # randomly select an activation function\n", " act_fn, torch_fn, act_fn_name = acts[np.random.randint(0, len(acts))]\n", "\n", " X = random_tensor((n_ex, in_rows, in_cols, n_in), standardize=True)\n", "\n", " p1 = (np.random.randint(1, 5), np.random.randint(1, 5))\n", " p2 = (np.random.randint(1, 5), np.random.randint(1, 5))\n", "\n", " # initialize SkipConnectionConv module\n", " L1 = SkipConnectionConvModule(\n", " out_ch1=n_out1,\n", " out_ch2=n_out2,\n", " kernel_shape1=f_shape1,\n", " kernel_shape2=f_shape2,\n", " kernel_shape_skip=f_shape_skip,\n", " stride1=s1,\n", " stride2=s2,\n", " stride_skip=s_skip,\n", " pad1=p1,\n", " pad2=p2,\n", " act_fn=act_fn,\n", " epsilon=1e-5,\n", " momentum=0.9,\n", " )\n", "\n", " # forward prop\n", " try:\n", " y_pred = L1.forward(X)\n", " except (ValueError, AssertionError):\n", " print(\"Invalid padding; Retrying\")\n", " continue\n", "\n", " ps = L1.hyperparameters[\"pad_skip\"]\n", " if ps[0] != ps[1] or ps[2] != ps[3]:\n", " continue\n", " pad_skip = (ps[0], ps[2])\n", "\n", " # backprop\n", " dLdy = np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdy)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchSkipConnectionConv(\n", " torch_fn,\n", " p1,\n", " p2,\n", " pad_skip,\n", " L1.parameters,\n", " L1.hyperparameters,\n", " momentum=L1.momentum,\n", " epsilon=L1.epsilon,\n", " )\n", " golds = gold_mod.extract_grads(X)\n", "\n", " params = L1.parameters[\"components\"]\n", " grads = L1.gradients[\"components\"]\n", " params = [\n", " (X, \"X\"),\n", " (params[\"conv1\"][\"W\"], \"conv1_W\"),\n", " (params[\"conv1\"][\"b\"], \"conv1_b\"),\n", " (params[\"batchnorm1\"][\"scaler\"].T, \"bn1_scaler\"),\n", " (params[\"batchnorm1\"][\"intercept\"], \"bn1_intercept\"),\n", " (params[\"batchnorm1\"][\"running_mean\"], \"bn1_running_mean\"),\n", " # (params[\"batchnorm1\"][\"running_var\"], \"bn1_running_var\"),\n", " (params[\"conv2\"][\"W\"], \"conv2_W\"),\n", " (params[\"conv2\"][\"b\"], \"conv2_b\"),\n", " (params[\"batchnorm2\"][\"scaler\"].T, \"bn2_scaler\"),\n", " (params[\"batchnorm2\"][\"intercept\"], \"bn2_intercept\"),\n", " (params[\"batchnorm2\"][\"running_mean\"], \"bn2_running_mean\"),\n", " # 
(params[\"batchnorm2\"][\"running_var\"], \"bn2_running_var\"),\n", " (params[\"conv_skip\"][\"W\"], \"conv_skip_W\"),\n", " (params[\"conv_skip\"][\"b\"], \"conv_skip_b\"),\n", " (params[\"batchnorm_skip\"][\"scaler\"].T, \"bn_skip_scaler\"),\n", " (params[\"batchnorm_skip\"][\"intercept\"], \"bn_skip_intercept\"),\n", " (params[\"batchnorm_skip\"][\"running_mean\"], \"bn_skip_running_mean\"),\n", " # (params[\"batchnorm_skip\"][\"running_var\"], \"bn_skip_running_var\"),\n", " (L1._dv[\"conv1_out\"], \"act1_out\"),\n", " (L1._dv[\"batchnorm1_out\"], \"bn1_out\"),\n", " (L1._dv[\"conv2_out\"], \"conv2_out\"),\n", " (L1._dv[\"batchnorm2_out\"], \"bn2_out\"),\n", " (L1._dv[\"conv_skip_out\"], \"conv_skip_out\"),\n", " (L1._dv[\"batchnorm_skip_out\"], \"bn_skip_out\"),\n", " (y_pred, \"Y\"),\n", " (dLdy, \"dLdY\"),\n", " (L1.derived_variables[\"dLdBn2\"], \"dLdBn2_out\"),\n", " (L1.derived_variables[\"dLdConv2\"], \"dLdConv2_out\"),\n", " (L1.derived_variables[\"dLdBnSkip\"], \"dLdBnSkip_out\"),\n", " (L1.derived_variables[\"dLdConvSkip\"], \"dLdConvSkip_out\"),\n", " (L1.derived_variables[\"dLdBn1\"], \"dLdBn1_out\"),\n", " (L1.derived_variables[\"dLdConv1\"], \"dLdActFn1_out\"),\n", " (dLdX, \"dLdX\"),\n", " (grads[\"batchnorm_skip\"][\"scaler\"].T, \"dLdBnSkip_scaler\"),\n", " (grads[\"batchnorm_skip\"][\"intercept\"], \"dLdBnSkip_intercept\"),\n", " (grads[\"conv_skip\"][\"W\"], \"dLdConvSkip_W\"),\n", " (grads[\"conv_skip\"][\"b\"], \"dLdConvSkip_b\"),\n", " (grads[\"batchnorm2\"][\"scaler\"].T, \"dLdBn2_scaler\"),\n", " (grads[\"batchnorm2\"][\"intercept\"], \"dLdBn2_intercept\"),\n", " (grads[\"conv2\"][\"W\"], \"dLdConv2_W\"),\n", " (grads[\"conv2\"][\"b\"], \"dLdConv2_b\"),\n", " (grads[\"batchnorm1\"][\"scaler\"].T, \"dLdBn1_scaler\"),\n", " (grads[\"batchnorm1\"][\"intercept\"], \"dLdBn1_intercept\"),\n", " (grads[\"conv1\"][\"W\"], \"dLdConv1_W\"),\n", " (grads[\"conv1\"][\"b\"], \"dLdConv1_b\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"act_fn={}, n_ex={}\".format(act_fn, n_ex))\n", " print(\"in_rows={}, in_cols={}, n_in={}\".format(in_rows, in_cols, n_in))\n", " print(\"pad1={}, stride1={}, f_shape1={}\".format(p1, s1, f_shape1))\n", " print(\"pad2={}, stride2={}, f_shape2={}\".format(p2, s2, f_shape2))\n", " print(\"stride_skip={}, f_shape_skip={}\".format(s_skip, f_shape_skip))\n", " warn_str = (\n", " \"\\n[NOTE] The tests in this module can fail sometimes during \"\n", " \"backprop due to the ReLU issue: while the difference in the forward pass \"\n", " \"between z=-1e-9 and z=1e-9 is miniscule, the difference during the backward \"\n", " \"pass is significant due to ReLU's kink about 0.\"\n", " )\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix, warn_str),\n", " decimal=2,\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_BidirectionalLSTM(N=15):\n", " from numpy_ml.neural_nets.modules import BidirectionalLSTM\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " n_in = np.random.randint(1, 10)\n", " n_out = np.random.randint(1, 10)\n", " n_t = np.random.randint(1, 10)\n", " X = random_tensor((n_ex, n_in, n_t), standardize=True)\n", "\n", " # initialize LSTM layer\n", " L1 = BidirectionalLSTM(n_out=n_out)\n", "\n", " # forward prop\n", " y_pred = L1.forward(X)\n", "\n", " # backprop\n", " dLdA = 
np.ones_like(y_pred)\n", " dLdX = L1.backward(dLdA)\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchBidirectionalLSTM(n_in, n_out, L1.parameters)\n", " golds = gold_mod.extract_grads(X)\n", "\n", " pms, grads = L1.parameters[\"components\"], L1.gradients[\"components\"]\n", " params = [\n", " (X, \"X\"),\n", " (y_pred, \"y\"),\n", " (pms[\"cell_fwd\"][\"bo\"].T, \"bo_f\"),\n", " (pms[\"cell_fwd\"][\"bu\"].T, \"bu_f\"),\n", " (pms[\"cell_fwd\"][\"bf\"].T, \"bf_f\"),\n", " (pms[\"cell_fwd\"][\"bc\"].T, \"bc_f\"),\n", " (pms[\"cell_fwd\"][\"Wo\"], \"Wo_f\"),\n", " (pms[\"cell_fwd\"][\"Wu\"], \"Wu_f\"),\n", " (pms[\"cell_fwd\"][\"Wf\"], \"Wf_f\"),\n", " (pms[\"cell_fwd\"][\"Wc\"], \"Wc_f\"),\n", " (pms[\"cell_bwd\"][\"bo\"].T, \"bo_b\"),\n", " (pms[\"cell_bwd\"][\"bu\"].T, \"bu_b\"),\n", " (pms[\"cell_bwd\"][\"bf\"].T, \"bf_b\"),\n", " (pms[\"cell_bwd\"][\"bc\"].T, \"bc_b\"),\n", " (pms[\"cell_bwd\"][\"Wo\"], \"Wo_b\"),\n", " (pms[\"cell_bwd\"][\"Wu\"], \"Wu_b\"),\n", " (pms[\"cell_bwd\"][\"Wf\"], \"Wf_b\"),\n", " (pms[\"cell_bwd\"][\"Wc\"], \"Wc_b\"),\n", " (grads[\"cell_fwd\"][\"bo\"].T, \"dLdBo_f\"),\n", " (grads[\"cell_fwd\"][\"bu\"].T, \"dLdBu_f\"),\n", " (grads[\"cell_fwd\"][\"bf\"].T, \"dLdBf_f\"),\n", " (grads[\"cell_fwd\"][\"bc\"].T, \"dLdBc_f\"),\n", " (grads[\"cell_fwd\"][\"Wo\"], \"dLdWo_f\"),\n", " (grads[\"cell_fwd\"][\"Wu\"], \"dLdWu_f\"),\n", " (grads[\"cell_fwd\"][\"Wf\"], \"dLdWf_f\"),\n", " (grads[\"cell_fwd\"][\"Wc\"], \"dLdWc_f\"),\n", " (grads[\"cell_bwd\"][\"bo\"].T, \"dLdBo_b\"),\n", " (grads[\"cell_bwd\"][\"bu\"].T, \"dLdBu_b\"),\n", " (grads[\"cell_bwd\"][\"bf\"].T, \"dLdBf_b\"),\n", " (grads[\"cell_bwd\"][\"bc\"].T, \"dLdBc_b\"),\n", " (grads[\"cell_bwd\"][\"Wo\"], \"dLdWo_b\"),\n", " (grads[\"cell_bwd\"][\"Wu\"], \"dLdWu_b\"),\n", " (grads[\"cell_bwd\"][\"Wf\"], \"dLdWf_b\"),\n", " (grads[\"cell_bwd\"][\"Wc\"], \"dLdWc_b\"),\n", " (dLdX, \"dLdX\"),\n", " ]\n", "\n", " print(\"Case {}\".format(i))\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_allclose(\n", " mine,\n", " golds[label],\n", " err_msg=err_fmt(params, golds, ix),\n", " atol=1e-4,\n", " rtol=1e-4,\n", " )\n", "\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "def test_WaveNetModule(N=10):\n", " from numpy_ml.neural_nets.modules import WavenetResidualModule\n", "\n", " N = np.inf if N is None else N\n", "\n", " np.random.seed(12345)\n", "\n", " i = 1\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 10)\n", " l_in = np.random.randint(1, 10)\n", " ch_residual, ch_dilation = np.random.randint(1, 5), np.random.randint(1, 5)\n", " f_width = min(l_in, np.random.randint(1, 5))\n", " d = np.random.randint(0, 5)\n", "\n", " X_main = np.zeros_like(\n", " random_tensor((n_ex, l_in, ch_residual), standardize=True)\n", " )\n", " X_main[0][0][0] = 1.0\n", " X_skip = np.zeros_like(\n", " random_tensor((n_ex, l_in, ch_residual), standardize=True)\n", " )\n", "\n", " # initialize Conv2D layer\n", " L1 = WavenetResidualModule(\n", " ch_residual=ch_residual,\n", " ch_dilation=ch_dilation,\n", " kernel_width=f_width,\n", " dilation=d,\n", " )\n", "\n", " # forward prop\n", " Y_main, Y_skip = L1.forward(X_main, X_skip)\n", "\n", " # backprop\n", " dLdY_skip = np.ones_like(Y_skip)\n", " dLdY_main = np.ones_like(Y_main)\n", " dLdX_main, dLdX_skip = L1.backward(dLdY_skip, dLdY_main)\n", "\n", " _, conv_1x1_pad = pad1D(\n", " L1._dv[\"multiply_gate_out\"], \"same\", kernel_width=1, stride=1, dilation=0\n", " )\n", " if conv_1x1_pad[0] != 
conv_1x1_pad[1]:\n", " print(\"Skipping\")\n", " continue\n", "\n", " conv_1x1_pad = conv_1x1_pad[0]\n", "\n", " # get gold standard gradients\n", " gold_mod = TorchWavenetModule(L1.parameters, L1.hyperparameters, conv_1x1_pad)\n", " golds = gold_mod.extract_grads(X_main, X_skip)\n", "\n", " dv = L1.derived_variables\n", " pc = L1.parameters[\"components\"]\n", " gr = L1.gradients[\"components\"]\n", "\n", " params = [\n", " (L1.X_main, \"X_main\"),\n", " (L1.X_skip, \"X_skip\"),\n", " (pc[\"conv_dilation\"][\"W\"], \"conv_dilation_W\"),\n", " (pc[\"conv_dilation\"][\"b\"], \"conv_dilation_b\"),\n", " (pc[\"conv_1x1\"][\"W\"], \"conv_1x1_W\"),\n", " (pc[\"conv_1x1\"][\"b\"], \"conv_1x1_b\"),\n", " (dv[\"conv_dilation_out\"], \"conv_dilation_out\"),\n", " (dv[\"tanh_out\"], \"tanh_out\"),\n", " (dv[\"sigm_out\"], \"sigm_out\"),\n", " (dv[\"multiply_gate_out\"], \"multiply_gate_out\"),\n", " (dv[\"conv_1x1_out\"], \"conv_1x1_out\"),\n", " (Y_main, \"Y_main\"),\n", " (Y_skip, \"Y_skip\"),\n", " (dLdY_skip, \"dLdY_skip\"),\n", " (dLdY_main, \"dLdY_main\"),\n", " (dv[\"dLdConv_1x1\"], \"dLdConv_1x1_out\"),\n", " (gr[\"conv_1x1\"][\"W\"], \"dLdConv_1x1_W\"),\n", " (gr[\"conv_1x1\"][\"b\"], \"dLdConv_1x1_b\"),\n", " (dv[\"dLdMultiply\"], \"dLdMultiply_out\"),\n", " (dv[\"dLdTanh\"], \"dLdTanh_out\"),\n", " (dv[\"dLdSigmoid\"], \"dLdSigm_out\"),\n", " (dv[\"dLdConv_dilation\"], \"dLdConv_dilation_out\"),\n", " (gr[\"conv_dilation\"][\"W\"], \"dLdConv_dilation_W\"),\n", " (gr[\"conv_dilation\"][\"b\"], \"dLdConv_dilation_b\"),\n", " (dLdX_main, \"dLdX_main\"),\n", " (dLdX_skip, \"dLdX_skip\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"f_width={}, n_ex={}\".format(f_width, n_ex))\n", " print(\"l_in={}, ch_residual={}\".format(l_in, ch_residual))\n", " print(\"ch_dilation={} dilation={}\".format(ch_dilation, d))\n", " for ix, (mine, label) in enumerate(params):\n", " assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=4\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Utils #\n", "#######################################################################\n", "\n", "\n", "def test_pad1D(N=15):\n", " from numpy_ml.neural_nets.layers import Conv1D\n", " from .nn_torch_models import TorchCausalConv1d, torchify\n", "\n", " np.random.seed(12345)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " p = np.random.choice([\"same\", \"causal\"])\n", " n_ex = np.random.randint(1, 10)\n", " l_in = np.random.randint(1, 10)\n", " n_in, n_out = np.random.randint(1, 3), np.random.randint(1, 3)\n", " f_width = min(l_in, np.random.randint(1, 5))\n", " s = np.random.randint(1, 3)\n", " d = np.random.randint(0, 5)\n", "\n", " X = random_tensor((n_ex, l_in, n_in), standardize=True)\n", " X_pad, _ = pad1D(X, p, kernel_width=f_width, stride=s, dilation=d)\n", "\n", " # initialize Conv2D layer\n", " L1 = Conv1D(out_ch=n_out, kernel_width=f_width, pad=0, stride=s, dilation=d)\n", "\n", " # forward prop\n", " try:\n", " y_pred = L1.forward(X_pad)\n", " except ValueError:\n", " continue\n", "\n", " # ignore n. 
output channels\n",
"        # (the conv changes the channel dim, so only the first two dims of X and\n",
"        #  y_pred are compared below)\n",
"        print(\"Trial {}\".format(i))\n",
"        print(\"p={} d={} s={} l_in={} f_width={}\".format(p, d, s, l_in, f_width))\n",
"        print(\"n_ex={} n_in={} n_out={}\".format(n_ex, n_in, n_out))\n",
"        assert y_pred.shape[:2] == X.shape[:2], \"y_pred.shape={} X.shape={}\".format(\n",
"            y_pred.shape, X.shape\n",
"        )\n",
"\n",
"        if p == \"causal\":\n",
"            gold = TorchCausalConv1d(\n",
"                in_channels=n_in,\n",
"                out_channels=n_out,\n",
"                kernel_size=f_width,\n",
"                stride=s,\n",
"                dilation=d + 1,\n",
"                bias=True,\n",
"            )\n",
"            if s != 1:\n",
"                print(\n",
"                    \"TorchCausalConv1D does not do `same` padding for stride > 1. Skipping\"\n",
"                )\n",
"                continue\n",
"\n",
"            XT = torchify(np.moveaxis(X, [0, 1, 2], [0, -1, -2]))\n",
"        else:\n",
"            gold = nn.Conv1d(\n",
"                in_channels=n_in,\n",
"                out_channels=n_out,\n",
"                kernel_size=f_width,\n",
"                padding=0,\n",
"                stride=s,\n",
"                dilation=d + 1,\n",
"                bias=True,\n",
"            )\n",
"            XT = torchify(np.moveaxis(X_pad, [0, 1, 2], [0, -1, -2]))\n",
"\n",
"        # import weights and biases\n",
"        # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n",
"        b = L1.parameters[\"b\"]\n",
"        W = np.moveaxis(L1.parameters[\"W\"], [0, 1, 2], [-1, -2, -3])\n",
"        assert gold.weight.shape == W.shape\n",
"        assert gold.bias.shape == b.flatten().shape\n",
"\n",
"        gold.weight = nn.Parameter(torch.FloatTensor(W))\n",
"        gold.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n",
"\n",
"        outT = gold(XT)\n",
"        # the gold-standard output for a batched 1D conv should always be 3D;\n",
"        # fail loudly rather than dropping into a debugger here\n",
"        assert outT.ndimension() == 3, \"unexpected gold output shape: {}\".format(\n",
"            tuple(outT.shape)\n",
"        )\n",
"\n",
"        gold_out = np.moveaxis(outT.detach().numpy(), [0, 1, 2], [0, -1, -2])\n",
"        assert gold_out.shape[:2] == X.shape[:2]\n",
"\n",
"        np.testing.assert_almost_equal(\n",
"            y_pred,\n",
"            gold_out,\n",
"            err_msg=err_fmt(\n",
"                [(y_pred.shape, \"out.shape\"), (y_pred, \"out\")],\n",
"                {\"out.shape\": gold_out.shape, \"out\": gold_out},\n",
"                1,\n",
"            ),\n",
"            decimal=4,\n",
"        )\n",
"        print(\"PASSED\\n\")\n",
"        i += 1\n",
"\n",
"\n",
"def test_conv(N=15):\n",
"    np.random.seed(12345)\n",
"    N = np.inf if N is None else N\n",
"    i = 0\n",
"    while i < N:\n",
"        n_ex = np.random.randint(2, 15)\n",
"        in_rows = np.random.randint(2, 15)\n",
"        in_cols = np.random.randint(2, 15)\n",
"        in_ch = np.random.randint(2, 15)\n",
"        out_ch = np.random.randint(2, 15)\n",
"        f_shape = (\n",
"            min(in_rows, np.random.randint(2, 10)),\n",
"            min(in_cols, np.random.randint(2, 10)),\n",
"        )\n",
"        s = np.random.randint(1, 3)\n",
"        p = np.random.randint(0, 5)\n",
"\n",
"        X = np.random.rand(n_ex, in_rows, in_cols, in_ch)\n",
"        X_pad, p = pad2D(X, p)\n",
"        W = np.random.randn(f_shape[0], f_shape[1], in_ch, out_ch)\n",
"\n",
"        gold = conv2D_naive(X, W, s, p)\n",
"        mine = conv2D(X, W, s, p)\n",
"\n",
"        np.testing.assert_almost_equal(mine, gold)\n",
"        print(\"PASSED\")\n",
"        i += 1\n",
"\n",
"\n",
"#######################################################################\n",
"#                                Models                               #\n",
"#######################################################################\n",
"\n",
"\n",
"def fit_VAE():\n",
"    # for testing\n",
"    import tensorflow.keras.datasets.mnist as mnist\n",
"    from numpy_ml.neural_nets.models.vae import BernoulliVAE\n",
"\n",
"    np.random.seed(12345)\n",
"\n",
"    (X_train, y_train), (X_test, y_test) = mnist.load_data()\n",
"\n",
"    # scale pixel intensities to [0, 1]\n",
"    X_train = np.expand_dims(X_train.astype(\"float32\") / 255.0, 3)\n",
"    X_test = np.expand_dims(X_test.astype(\"float32\") / 255.0, 3)\n",
"\n",
"    X_train = X_train[: 128 * 1]  # 1 batch\n",
"\n",
"    BV = BernoulliVAE()\n",
"    BV.fit(X_train, n_epochs=1, verbose=False)\n",
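"    # NOTE: fit_VAE is a manual smoke test rather than an automated unit test:\n",
"    # it only checks that one epoch of training on a single MNIST batch runs to\n",
"    # completion; no outputs or gradients are compared against a gold standard.\n",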
"\n", "\n", "def test_WGAN_GP(N=1):\n", " from numpy_ml.neural_nets.models.wgan_gp import WGAN_GP\n", "\n", " np.random.seed(12345)\n", "\n", " ss = np.random.randint(0, 1000)\n", " np.random.seed(ss)\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " while i < N + 1:\n", " c_updates_per_epoch, n_steps = 1, 1\n", " n_ex = np.random.randint(1, 500)\n", " n_in = np.random.randint(1, 100)\n", " lambda_ = np.random.randint(0, 20)\n", " g_hidden = np.random.randint(2, 500)\n", " X = random_tensor((n_ex, n_in), standardize=True)\n", "\n", " # initialize WGAN_GP model\n", " L1 = WGAN_GP(g_hidden=g_hidden, debug=True)\n", "\n", " # forward prop\n", " batchsize = n_ex\n", " L1.fit(\n", " X,\n", " lambda_=lambda_,\n", " c_updates_per_epoch=c_updates_per_epoch,\n", " n_steps=n_steps,\n", " batchsize=batchsize,\n", " )\n", "\n", " # backprop\n", " dv = L1.derived_variables\n", " params = L1.parameters[\"components\"]\n", " grads = L1.gradients[\"components\"]\n", " params[\"noise\"] = dv[\"noise\"]\n", " params[\"alpha\"] = dv[\"alpha\"]\n", " params[\"n_in\"] = n_in\n", " params[\"g_hidden\"] = g_hidden\n", " params[\"c_updates_per_epoch\"] = c_updates_per_epoch\n", " params[\"n_steps\"] = n_steps\n", "\n", " # get gold standard gradients\n", " golds = WGAN_GP_tf(X, lambda_=lambda_, batch_size=batchsize, params=params)\n", "\n", " params = [\n", " (dv[\"X_real\"], \"X_real\"),\n", " (params[\"generator\"][\"FC1\"][\"W\"], \"G_weights_FC1\"),\n", " (params[\"generator\"][\"FC2\"][\"W\"], \"G_weights_FC2\"),\n", " (params[\"generator\"][\"FC3\"][\"W\"], \"G_weights_FC3\"),\n", " (params[\"generator\"][\"FC4\"][\"W\"], \"G_weights_FC4\"),\n", " (dv[\"G_fwd_X_fake\"][\"FC1\"], \"G_fwd_X_fake_FC1\"),\n", " (dv[\"G_fwd_X_fake\"][\"FC2\"], \"G_fwd_X_fake_FC2\"),\n", " (dv[\"G_fwd_X_fake\"][\"FC3\"], \"G_fwd_X_fake_FC3\"),\n", " (dv[\"G_fwd_X_fake\"][\"FC4\"], \"G_fwd_X_fake_FC4\"),\n", " (dv[\"X_fake\"], \"X_fake\"),\n", " (dv[\"X_interp\"], \"X_interp\"),\n", " (params[\"critic\"][\"FC1\"][\"W\"], \"C_weights_Y_real_FC1\"),\n", " (params[\"critic\"][\"FC2\"][\"W\"], \"C_weights_Y_real_FC2\"),\n", " (params[\"critic\"][\"FC3\"][\"W\"], \"C_weights_Y_real_FC3\"),\n", " (params[\"critic\"][\"FC4\"][\"W\"], \"C_weights_Y_real_FC4\"),\n", " (dv[\"C_fwd_Y_real\"][\"FC1\"], \"C_fwd_Y_real_FC1\"),\n", " (dv[\"C_fwd_Y_real\"][\"FC2\"], \"C_fwd_Y_real_FC2\"),\n", " (dv[\"C_fwd_Y_real\"][\"FC3\"], \"C_fwd_Y_real_FC3\"),\n", " (dv[\"C_fwd_Y_real\"][\"FC4\"], \"C_fwd_Y_real_FC4\"),\n", " (dv[\"Y_real\"].flatten(), \"Y_real\"),\n", " (params[\"critic\"][\"FC1\"][\"W\"], \"C_weights_Y_fake_FC1\"),\n", " (params[\"critic\"][\"FC2\"][\"W\"], \"C_weights_Y_fake_FC2\"),\n", " (params[\"critic\"][\"FC3\"][\"W\"], \"C_weights_Y_fake_FC3\"),\n", " (params[\"critic\"][\"FC4\"][\"W\"], \"C_weights_Y_fake_FC4\"),\n", " (dv[\"C_fwd_Y_fake\"][\"FC1\"], \"C_fwd_Y_fake_FC1\"),\n", " (dv[\"C_fwd_Y_fake\"][\"FC2\"], \"C_fwd_Y_fake_FC2\"),\n", " (dv[\"C_fwd_Y_fake\"][\"FC3\"], \"C_fwd_Y_fake_FC3\"),\n", " (dv[\"C_fwd_Y_fake\"][\"FC4\"], \"C_fwd_Y_fake_FC4\"),\n", " (dv[\"Y_fake\"].flatten(), \"Y_fake\"),\n", " (params[\"critic\"][\"FC1\"][\"W\"], \"C_weights_Y_interp_FC1\"),\n", " (params[\"critic\"][\"FC2\"][\"W\"], \"C_weights_Y_interp_FC2\"),\n", " (params[\"critic\"][\"FC3\"][\"W\"], \"C_weights_Y_interp_FC3\"),\n", " (params[\"critic\"][\"FC4\"][\"W\"], \"C_weights_Y_interp_FC4\"),\n", " (dv[\"C_fwd_Y_interp\"][\"FC1\"], \"C_fwd_Y_interp_FC1\"),\n", " (dv[\"C_fwd_Y_interp\"][\"FC2\"], \"C_fwd_Y_interp_FC2\"),\n", " 
(dv[\"C_fwd_Y_interp\"][\"FC3\"], \"C_fwd_Y_interp_FC3\"),\n", " (dv[\"C_fwd_Y_interp\"][\"FC4\"], \"C_fwd_Y_interp_FC4\"),\n", " (dv[\"Y_interp\"].flatten(), \"Y_interp\"),\n", " (dv[\"C_dY_interp_wrt\"][\"FC4\"], \"dY_interp_wrt_FC4\"),\n", " (dv[\"C_dY_interp_wrt\"][\"FC3\"], \"dY_interp_wrt_FC3\"),\n", " (dv[\"C_dY_interp_wrt\"][\"FC2\"], \"dY_interp_wrt_FC2\"),\n", " (dv[\"C_dY_interp_wrt\"][\"FC1\"], \"dY_interp_wrt_FC1\"),\n", " (dv[\"gradInterp\"], \"gradInterp\"),\n", " (dv[\"C_loss\"], \"C_loss\"),\n", " (dv[\"G_loss\"], \"G_loss\"),\n", " (grads[\"critic\"][\"FC1\"][\"W\"], \"dC_loss_dW_FC1\"),\n", " (grads[\"critic\"][\"FC1\"][\"b\"].flatten(), \"dC_loss_db_FC1\"),\n", " (grads[\"critic\"][\"FC2\"][\"W\"], \"dC_loss_dW_FC2\"),\n", " (grads[\"critic\"][\"FC2\"][\"b\"].flatten(), \"dC_loss_db_FC2\"),\n", " (grads[\"critic\"][\"FC3\"][\"W\"], \"dC_loss_dW_FC3\"),\n", " (grads[\"critic\"][\"FC3\"][\"b\"].flatten(), \"dC_loss_db_FC3\"),\n", " (grads[\"critic\"][\"FC4\"][\"W\"], \"dC_loss_dW_FC4\"),\n", " (grads[\"critic\"][\"FC4\"][\"b\"].flatten(), \"dC_loss_db_FC4\"),\n", " (dv[\"dG_Y_fake\"].flatten(), \"dG_Y_fake\"),\n", " (dv[\"dY_real\"].flatten(), \"dC_Y_real\"),\n", " (dv[\"dC_Y_fake\"].flatten(), \"dC_Y_fake\"),\n", " (dv[\"dGrad_interp\"], \"dC_gradInterp\"),\n", " ]\n", "\n", " print(\"\\nTrial {}\".format(i))\n", " print(\"Seed: {} g_hidden={}\".format(ss, g_hidden))\n", " print(\"lambda={} n_ex={} n_in={}\".format(lambda_, n_ex, n_in))\n", " print(\n", " \"c_updates_per_epoch={}, n_steps={} batchsize={}\".format(\n", " c_updates_per_epoch, n_steps, batchsize\n", " )\n", " )\n", "\n", " for ix, (mine, label) in enumerate(params):\n", " np.testing.assert_almost_equal(\n", " mine, golds[label], err_msg=err_fmt(params, golds, ix), decimal=3\n", " )\n", " print(\"\\tPASSED {}\".format(label))\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/test_trees.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n", "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn.datasets import make_regression, make_blobs\n", "from sklearn.model_selection import train_test_split\n", "\n", "from numpy_ml.trees.gbdt import GradientBoostedDecisionTree\n", "from numpy_ml.trees.dt import DecisionTree, Node, Leaf\n", "from numpy_ml.trees.rf import RandomForest\n", "from numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def clone_tree(dtree):\n", " children_left = dtree.tree_.children_left\n", " children_right = dtree.tree_.children_right\n", " feature = dtree.tree_.feature\n", " threshold = dtree.tree_.threshold\n", " values = dtree.tree_.value\n", "\n", " def grow(node_id):\n", " l, r = children_left[node_id], children_right[node_id]\n", " if l == r:\n", " return Leaf(values[node_id].argmax())\n", " n = Node(None, None, (feature[node_id], threshold[node_id]))\n", " n.left = grow(l)\n", " n.right = grow(r)\n", " return n\n", "\n", " node_id = 0\n", " root = Node(None, None, (feature[node_id], threshold[node_id]))\n", " root.left = grow(children_left[node_id])\n", " root.right = grow(children_right[node_id])\n", " return root\n", "\n", "\n", "def compare_trees(mine, gold):\n", " clone = clone_tree(gold)\n", " mine = mine.root\n", "\n", " def test(mine, clone):\n", " if isinstance(clone, Node) and isinstance(mine, Node):\n", " assert mine.feature == clone.feature, \"Node {} not equal\".format(depth)\n", 
" np.testing.assert_allclose(mine.threshold, clone.threshold)\n", " test(mine.left, clone.left, depth + 1)\n", " test(mine.right, clone.right, depth + 1)\n", " elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", " np.testing.assert_allclose(mine.value, clone.value)\n", " return\n", " else:\n", " raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", " depth = 0\n", " ok = True\n", " while ok:\n", " if isinstance(clone, Node) and isinstance(mine, Node):\n", " assert mine.feature == clone.feature\n", " np.testing.assert_allclose(mine.threshold, clone.threshold)\n", " test(mine.left, clone.left, depth + 1)\n", " test(mine.right, clone.right, depth + 1)\n", " elif isinstance(clone, Leaf) and isinstance(mine, Leaf):\n", " np.testing.assert_allclose(mine.value, clone.value)\n", " return\n", " else:\n", " raise ValueError(\"Nodes at depth {} are not equal\".format(depth))\n", "\n", "\n", "def test_DecisionTree(N=1):\n", " i = 1\n", " np.random.seed(12345)\n", " while i <= N:\n", " n_ex = np.random.randint(2, 100)\n", " n_feats = np.random.randint(2, 100)\n", " max_depth = np.random.randint(1, 5)\n", "\n", " classifier = np.random.choice([True, False])\n", " if classifier:\n", " # create classification problem\n", " n_classes = np.random.randint(2, 10)\n", " X, Y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i\n", " )\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " def loss(yp, y):\n", " return 1 - accuracy_score(yp, y)\n", "\n", " criterion = np.random.choice([\"entropy\", \"gini\"])\n", " mine = DecisionTree(\n", " classifier=classifier, max_depth=max_depth, criterion=criterion\n", " )\n", " gold = DecisionTreeClassifier(\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " splitter=\"best\",\n", " random_state=i,\n", " )\n", " else:\n", " # create regeression problem\n", " X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " criterion = \"mse\"\n", " loss = mean_squared_error\n", " mine = DecisionTree(\n", " criterion=criterion, max_depth=max_depth, classifier=classifier\n", " )\n", " gold = DecisionTreeRegressor(\n", " criterion=criterion, max_depth=max_depth, splitter=\"best\"\n", " )\n", "\n", " print(\"Trial {}\".format(i))\n", " print(\"\\tClassifier={}, criterion={}\".format(classifier, criterion))\n", " print(\"\\tmax_depth={}, n_feats={}, n_ex={}\".format(max_depth, n_feats, n_ex))\n", " if classifier:\n", " print(\"\\tn_classes: {}\".format(n_classes))\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " gold.fit(X, Y)\n", "\n", " # get preds on training set\n", " y_pred_mine = mine.predict(X)\n", " y_pred_gold = gold.predict(X)\n", "\n", " loss_mine = loss(y_pred_mine, Y)\n", " loss_gold = loss(y_pred_gold, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_gold_test = gold.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_gold_test = loss(y_pred_gold_test, Y_test)\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine, loss_gold)\n", " print(\"\\tLoss on training: {}\".format(loss_mine))\n", " except AssertionError as e:\n", " print(\"\\tTraining losses not equal:\\n{}\".format(e))\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)\n", " print(\"\\tLoss on test: 
{}\".format(loss_mine_test))\n", " except AssertionError as e:\n", " print(\"\\tTest losses not equal:\\n{}\".format(e))\n", " i += 1\n", "\n", "\n", "def test_RandomForest(N=1):\n", " np.random.seed(12345)\n", " i = 1\n", " while i <= N:\n", " n_ex = np.random.randint(2, 100)\n", " n_feats = np.random.randint(2, 100)\n", " n_trees = np.random.randint(2, 100)\n", " max_depth = np.random.randint(1, 5)\n", "\n", " classifier = np.random.choice([True, False])\n", " if classifier:\n", " # create classification problem\n", " n_classes = np.random.randint(2, 10)\n", " X, Y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i\n", " )\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " def loss(yp, y):\n", " return 1 - accuracy_score(yp, y)\n", "\n", " # initialize model\n", " criterion = np.random.choice([\"entropy\", \"gini\"])\n", " mine = RandomForest(\n", " classifier=classifier,\n", " n_feats=n_feats,\n", " n_trees=n_trees,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " )\n", " gold = RandomForestClassifier(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", " else:\n", " # create regeression problem\n", " X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " criterion = \"mse\"\n", " loss = mean_squared_error\n", " mine = RandomForest(\n", " criterion=criterion,\n", " n_feats=n_feats,\n", " n_trees=n_trees,\n", " max_depth=max_depth,\n", " classifier=classifier,\n", " )\n", " gold = RandomForestRegressor(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", "\n", " print(\"Trial {}\".format(i))\n", " print(\"\\tClassifier={}, criterion={}\".format(classifier, criterion))\n", " print(\"\\tmax_depth={}, n_feats={}, n_ex={}\".format(max_depth, n_feats, n_ex))\n", " if classifier:\n", " print(\"\\tn_classes: {}\".format(n_classes))\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " gold.fit(X, Y)\n", "\n", " # get preds\n", " y_pred_mine = mine.predict(X)\n", " y_pred_gold = gold.predict(X)\n", "\n", " loss_mine = loss(y_pred_mine, Y)\n", " loss_gold = loss(y_pred_gold, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_gold_test = gold.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_gold_test = loss(y_pred_gold_test, Y_test)\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine, loss_gold)\n", " print(\"\\tLoss on training: {}\".format(loss_mine))\n", " except AssertionError as e:\n", " print(\"\\tTraining losses not equal:\\n{}\".format(e))\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)\n", " print(\"\\tLoss on test: {}\".format(loss_mine_test))\n", " except AssertionError as e:\n", " print(\"\\tTest losses not equal:\\n{}\".format(e))\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_gbdt(N=1):\n", " np.random.seed(12345)\n", " i = 1\n", " while i <= N:\n", " n_ex = np.random.randint(2, 100)\n", " n_feats = np.random.randint(2, 100)\n", " n_trees = np.random.randint(2, 100)\n", " max_depth = np.random.randint(1, 5)\n", "\n", " classifier = np.random.choice([True, False])\n", " if classifier:\n", " # create 
classification problem\n", " n_classes = np.random.randint(2, 10)\n", " X, Y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i\n", " )\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " def loss(yp, y):\n", " return 1 - accuracy_score(yp, y)\n", "\n", " # initialize model\n", " criterion = np.random.choice([\"entropy\", \"gini\"])\n", " mine = GradientBoostedDecisionTree(\n", " n_iter=n_trees,\n", " classifier=classifier,\n", " max_depth=max_depth,\n", " learning_rate=0.1,\n", " loss=\"crossentropy\",\n", " step_size=\"constant\",\n", " )\n", " gold = RandomForestClassifier(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", " else:\n", " # create regeression problem\n", " X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)\n", "\n", " # initialize model\n", " criterion = \"mse\"\n", " loss = mean_squared_error\n", " mine = GradientBoostedDecisionTree(\n", " n_iter=n_trees,\n", " max_depth=max_depth,\n", " classifier=classifier,\n", " learning_rate=0.1,\n", " loss=\"mse\",\n", " step_size=\"constant\",\n", " )\n", " gold = RandomForestRegressor(\n", " n_estimators=n_trees,\n", " max_features=n_feats,\n", " criterion=criterion,\n", " max_depth=max_depth,\n", " bootstrap=True,\n", " )\n", "\n", " print(\"Trial {}\".format(i))\n", " print(\"\\tClassifier={}, criterion={}\".format(classifier, criterion))\n", " print(\"\\tmax_depth={}, n_feats={}, n_ex={}\".format(max_depth, n_feats, n_ex))\n", " if classifier:\n", " print(\"\\tn_classes: {}\".format(n_classes))\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " gold.fit(X, Y)\n", "\n", " # get preds\n", " y_pred_mine = mine.predict(X)\n", " y_pred_gold = gold.predict(X)\n", "\n", " loss_mine = loss(y_pred_mine, Y)\n", " loss_gold = loss(y_pred_gold, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_gold_test = gold.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_gold_test = loss(y_pred_gold_test, Y_test)\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine, loss_gold)\n", " print(\"\\tLoss on training: {}\".format(loss_mine))\n", " except AssertionError as e:\n", " print(\"\\tTraining losses not equal:\\n{}\".format(e))\n", "\n", " try:\n", " np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)\n", " print(\"\\tLoss on test: {}\".format(loss_mine_test))\n", " except AssertionError as e:\n", " print(\"\\tTest losses not equal:\\n{}\".format(e))\n", "\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/test_preprocessing.py", "content": ["# flake8: noqa\n", "from collections import Counter\n", "\n", "# gold-standard imports\n", "import huffman\n", "import numpy as np\n", "\n", "from scipy.fftpack import dct\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "try:\n", " from librosa.core.time_frequency import fft_frequencies\n", "except ImportError:\n", " # for librosa >= 0.8.0\n", " from librosa import fft_frequencies\n", "from librosa.feature import mfcc as lr_mfcc\n", "from librosa.util import frame\n", "from librosa.filters import mel\n", "\n", "# numpy-ml implementations\n", "from numpy_ml.preprocessing.general import Standardizer\n", 
"from numpy_ml.preprocessing.nlp import HuffmanEncoder, TFIDFEncoder\n", "from numpy_ml.preprocessing.dsp import (\n", " DCT,\n", " DFT,\n", " mfcc,\n", " to_frames,\n", " mel_filterbank,\n", " dft_bins,\n", ")\n", "from numpy_ml.utils.testing import random_paragraph\n", "\n", "\n", "def test_huffman(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " n_words = np.random.randint(1, 100)\n", " para = random_paragraph(n_words)\n", " HT = HuffmanEncoder()\n", " HT.fit(para)\n", " my_dict = HT._item2code\n", " their_dict = huffman.codebook(Counter(para).items())\n", "\n", " for k, v in their_dict.items():\n", " fstr = \"their_dict['{}'] = {}, but my_dict['{}'] = {}\"\n", " assert k in my_dict, \"key `{}` not in my_dict\".format(k)\n", " assert my_dict[k] == v, fstr.format(k, v, k, my_dict[k])\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_standardizer(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " mean = bool(np.random.randint(2))\n", " std = bool(np.random.randint(2))\n", " N = np.random.randint(2, 100)\n", " M = np.random.randint(2, 100)\n", " X = np.random.rand(N, M)\n", "\n", " S = Standardizer(with_mean=mean, with_std=std)\n", " S.fit(X)\n", " mine = S.transform(X)\n", "\n", " theirs = StandardScaler(with_mean=mean, with_std=std)\n", " gold = theirs.fit_transform(X)\n", "\n", " np.testing.assert_almost_equal(mine, gold)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_tfidf(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " docs = []\n", " n_docs = np.random.randint(1, 10)\n", " for d in range(n_docs):\n", " n_lines = np.random.randint(1, 1000)\n", " lines = [random_paragraph(np.random.randint(1, 10)) for _ in range(n_lines)]\n", " docs.append(\"\\n\".join([\" \".join(l) for l in lines]))\n", "\n", " smooth = bool(np.random.randint(2))\n", "\n", " tfidf = TFIDFEncoder(\n", " lowercase=True,\n", " min_count=0,\n", " smooth_idf=smooth,\n", " max_tokens=None,\n", " input_type=\"strings\",\n", " filter_stopwords=False,\n", " )\n", " gold = TfidfVectorizer(\n", " input=\"content\",\n", " norm=None,\n", " use_idf=True,\n", " lowercase=True,\n", " smooth_idf=smooth,\n", " sublinear_tf=False,\n", " )\n", "\n", " tfidf.fit(docs)\n", " mine = tfidf.transform(ignore_special_chars=True)\n", " theirs = gold.fit_transform(docs).toarray()\n", "\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_dct(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " signal = np.random.rand(N)\n", " ortho = bool(np.random.randint(2))\n", " mine = DCT(signal, orthonormal=ortho)\n", " theirs = dct(signal, norm=\"ortho\" if ortho else None)\n", "\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_dft(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(2, 100)\n", " signal = np.random.rand(N)\n", " mine = DFT(signal)\n", " theirs = np.fft.rfft(signal)\n", "\n", " np.testing.assert_almost_equal(mine.real, theirs.real)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_mfcc(N=1):\n", " \"\"\"Broken\"\"\"\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(500, 1000)\n", " fs = np.random.randint(50, 100)\n", " n_mfcc = 12\n", " window_len = 100\n", " stride_len = 50\n", " n_filters = 20\n", " window_dur = window_len / fs\n", " 
stride_dur = stride_len / fs\n", " signal = np.random.rand(N)\n", "\n", " mine = mfcc(\n", " signal,\n", " fs=fs,\n", " window=\"hann\",\n", " window_duration=window_dur,\n", " stride_duration=stride_dur,\n", " lifter_coef=0,\n", " alpha=0,\n", " n_mfccs=n_mfcc,\n", " normalize=False,\n", " center=True,\n", " n_filters=n_filters,\n", " replace_intercept=False,\n", " )\n", "\n", " theirs = lr_mfcc(\n", " signal,\n", " sr=fs,\n", " n_mels=n_filters,\n", " n_mfcc=n_mfcc,\n", " n_fft=window_len,\n", " hop_length=stride_len,\n", " htk=True,\n", " ).T\n", "\n", " np.testing.assert_almost_equal(mine, theirs, decimal=4)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_framing(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(500, 100000)\n", " window_len = np.random.randint(10, 100)\n", " stride_len = np.random.randint(1, 50)\n", " signal = np.random.rand(N)\n", "\n", " mine = to_frames(signal, window_len, stride_len, writeable=False)\n", " theirs = frame(signal, frame_length=window_len, hop_length=stride_len).T\n", "\n", " assert len(mine) == len(theirs), \"len(mine) = {}, len(theirs) = {}\".format(\n", " len(mine), len(theirs)\n", " )\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_dft_bins(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " N = np.random.randint(500, 100000)\n", " fs = np.random.randint(50, 1000)\n", "\n", " mine = dft_bins(N, fs=fs, positive_only=True)\n", " theirs = fft_frequencies(fs, N)\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_mel_filterbank(N=15):\n", " np.random.seed(12345)\n", "\n", " i = 0\n", " while i < N:\n", " fs = np.random.randint(50, 10000)\n", " n_filters = np.random.randint(2, 20)\n", " window_len = np.random.randint(10, 100)\n", " norm = np.random.randint(2)\n", "\n", " mine = mel_filterbank(\n", " window_len, n_filters, fs, min_freq=0, max_freq=None, normalize=bool(norm)\n", " )\n", "\n", " theirs = mel(\n", " fs,\n", " n_fft=window_len,\n", " n_mels=n_filters,\n", " htk=True,\n", " norm=\"slaney\" if norm == 1 else None,\n", " )\n", "\n", " np.testing.assert_almost_equal(mine, theirs)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/test_ngram.py", "content": ["# flake8: noqa\n", "import tempfile\n", "\n", "import nltk\n", "import numpy as np\n", "\n", "from ..preprocessing.nlp import tokenize_words\n", "from ..ngram import AdditiveNGram, MLENGram\n", "from ..utils.testing import random_paragraph\n", "\n", "\n", "class MLEGold:\n", " def __init__(\n", " self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True\n", " ):\n", " self.N = N\n", " self.K = K\n", " self.unk = unk\n", " self.filter_stopwords = filter_stopwords\n", " self.filter_punctuation = filter_punctuation\n", "\n", " self.hyperparameters = {\n", " \"N\": N,\n", " \"K\": K,\n", " \"unk\": unk,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " }\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " N = self.N\n", " H = self.hyperparameters\n", " models, counts = {}, {}\n", " grams = {n: [] for n in range(1, N + 1)}\n", " gg = {n: [] for n in range(1, N + 1)}\n", " filter_punc, filter_stop = H[\"filter_punctuation\"], H[\"filter_stopwords\"]\n", "\n", " n_words = 0\n", " tokens = set([])\n", "\n", " with open(corpus_fp, \"r\", encoding=encoding) as text:\n", " for line 
in text:\n", " words = tokenize_words(line, filter_punc, filter_stop)\n", "\n", " if vocab is not None:\n", " words = vocab.filter(words, H[\"unk\"])\n", "\n", " if len(words) == 0:\n", " continue\n", "\n", " n_words += len(words)\n", " tokens.update(words)\n", "\n", " # calculate n, n-1, ... 1-grams\n", " for n in range(1, N + 1):\n", " grams[n].append(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", "\n", " gg[n].extend(\n", " list(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", " )\n", "\n", " for n in range(1, N + 1):\n", " counts[n] = nltk.FreqDist(gg[n])\n", " models[n] = nltk.lm.MLE(order=n)\n", " models[n].fit(grams[n], tokens)\n", "\n", " self.counts = counts\n", " self.n_words = n_words\n", " self._models = models\n", " self.n_tokens = len(vocab) if vocab is not None else len(tokens)\n", "\n", " def log_prob(self, words, N):\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", " if N > len(words):\n", " err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", " raise ValueError(err)\n", "\n", " total_prob = 0\n", " for ngram in nltk.ngrams(words, N):\n", " total_prob += self._log_ngram_prob(ngram)\n", " return total_prob\n", "\n", " def _log_ngram_prob(self, ngram):\n", " N = len(ngram)\n", " return self._models[N].logscore(ngram[-1], ngram[:-1])\n", "\n", "\n", "class AdditiveGold:\n", " def __init__(\n", " self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True\n", " ):\n", " self.N = N\n", " self.K = K\n", " self.unk = unk\n", " self.filter_stopwords = filter_stopwords\n", " self.filter_punctuation = filter_punctuation\n", "\n", " self.hyperparameters = {\n", " \"N\": N,\n", " \"K\": K,\n", " \"unk\": unk,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " }\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " N = self.N\n", " H = self.hyperparameters\n", " models, counts = {}, {}\n", " grams = {n: [] for n in range(1, N + 1)}\n", " gg = {n: [] for n in range(1, N + 1)}\n", " filter_punc, filter_stop = H[\"filter_punctuation\"], H[\"filter_stopwords\"]\n", "\n", " n_words = 0\n", " tokens = set()\n", "\n", " with open(corpus_fp, \"r\", encoding=encoding) as text:\n", " for line in text:\n", " words = tokenize_words(line, filter_punc, filter_stop)\n", "\n", " if vocab is not None:\n", " words = vocab.filter(words, H[\"unk\"])\n", "\n", " if len(words) == 0:\n", " continue\n", "\n", " n_words += len(words)\n", " tokens.update(words)\n", "\n", " # calculate n, n-1, ... 
1-grams\n", " for n in range(1, N + 1):\n", " grams[n].append(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", "\n", " gg[n].extend(\n", " list(\n", " nltk.ngrams(\n", " words,\n", " n,\n", " pad_left=True,\n", " pad_right=True,\n", " left_pad_symbol=\"\",\n", " right_pad_symbol=\"\",\n", " )\n", " )\n", " )\n", "\n", " for n in range(1, N + 1):\n", " counts[n] = nltk.FreqDist(gg[n])\n", " models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)\n", " models[n].fit(grams[n], tokens)\n", "\n", " self.counts = counts\n", " self._models = models\n", " self.n_words = n_words\n", " self.n_tokens = len(vocab) if vocab is not None else len(tokens)\n", "\n", " def log_prob(self, words, N):\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", " if N > len(words):\n", " err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", " raise ValueError(err)\n", "\n", " total_prob = 0\n", " for ngram in nltk.ngrams(words, N):\n", " total_prob += self._log_ngram_prob(ngram)\n", " return total_prob\n", "\n", " def _log_ngram_prob(self, ngram):\n", " N = len(ngram)\n", " return self._models[N].logscore(ngram[-1], ngram[:-1])\n", "\n", "\n", "def test_mle():\n", " N = np.random.randint(2, 5)\n", " gold = MLEGold(N, unk=True, filter_stopwords=False, filter_punctuation=False)\n", " mine = MLENGram(N, unk=True, filter_stopwords=False, filter_punctuation=False)\n", "\n", " with tempfile.NamedTemporaryFile() as temp:\n", " temp.write(bytes(\" \".join(random_paragraph(1000)), encoding=\"utf-8-sig\"))\n", " gold.train(temp.name, encoding=\"utf-8-sig\")\n", " mine.train(temp.name, encoding=\"utf-8-sig\")\n", "\n", " for k in mine.counts[N].keys():\n", " if k[0] == k[1] and k[0] in (\"\", \"\"):\n", " continue\n", "\n", " err_str = \"{}, mine: {}, gold: {}\"\n", " assert mine.counts[N][k] == gold.counts[N][k], err_str.format(\n", " k, mine.counts[N][k], gold.counts[N][k]\n", " )\n", "\n", " M = mine.log_prob(k, N)\n", " G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e\n", " np.testing.assert_allclose(M, G)\n", " print(\"PASSED\")\n", "\n", "\n", "def test_additive():\n", " K = np.random.rand()\n", " N = np.random.randint(2, 5)\n", " gold = AdditiveGold(\n", " N, K, unk=True, filter_stopwords=False, filter_punctuation=False\n", " )\n", " mine = AdditiveNGram(\n", " N, K, unk=True, filter_stopwords=False, filter_punctuation=False\n", " )\n", "\n", " with tempfile.NamedTemporaryFile() as temp:\n", " temp.write(bytes(\" \".join(random_paragraph(1000)), encoding=\"utf-8-sig\"))\n", " gold.train(temp.name, encoding=\"utf-8-sig\")\n", " mine.train(temp.name, encoding=\"utf-8-sig\")\n", "\n", " for k in mine.counts[N].keys():\n", " if k[0] == k[1] and k[0] in (\"\", \"\"):\n", " continue\n", "\n", " err_str = \"{}, mine: {}, gold: {}\"\n", " assert mine.counts[N][k] == gold.counts[N][k], err_str.format(\n", " k, mine.counts[N][k], gold.counts[N][k]\n", " )\n", "\n", " M = mine.log_prob(k, N)\n", " G = gold.log_prob(k, N) / np.log2(np.e) # convert to log base e\n", " np.testing.assert_allclose(M, G)\n", " print(\"PASSED\")\n"]} {"path": "numpy_ml/o_tests/test_naive_bayes.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "from sklearn import datasets\n", "from sklearn.model_selection import train_test_split\n", "\n", "from sklearn import naive_bayes\n", "\n", "from numpy_ml.linear_models import GaussianNBClassifier\n", "from 
numpy_ml.utils.testing import random_tensor\n", "\n", "\n", "def test_GaussianNB(N=10):\n", " np.random.seed(12345)\n", " N = np.inf if N is None else N\n", "\n", " i = 1\n", " eps = np.finfo(float).eps\n", " while i < N + 1:\n", " n_ex = np.random.randint(1, 300)\n", " n_feats = np.random.randint(1, 100)\n", " n_classes = np.random.randint(2, 10)\n", "\n", " X = random_tensor((n_ex, n_feats), standardize=True)\n", " y = np.random.randint(0, n_classes, size=n_ex)\n", "\n", " X_test = random_tensor((n_ex, n_feats), standardize=True)\n", "\n", " NB = GaussianNBClassifier(eps=1e-09)\n", " NB.fit(X, y)\n", "\n", " preds = NB.predict(X_test)\n", "\n", " sklearn_NB = naive_bayes.GaussianNB()\n", " sklearn_NB.fit(X, y)\n", "\n", " sk_preds = sklearn_NB.predict(X_test)\n", "\n", " for j in range(len(NB.labels)):\n", " P = NB.parameters\n", " jointi = np.log(sklearn_NB.class_prior_[j])\n", " jointi_mine = np.log(P[\"prior\"][j])\n", "\n", " np.testing.assert_almost_equal(jointi, jointi_mine)\n", "\n", " n_jk = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[j, :] + eps))\n", " n_jk_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P[\"sigma\"][j] + eps))\n", "\n", " np.testing.assert_almost_equal(n_jk_mine, n_jk)\n", "\n", " n_jk2 = n_jk - 0.5 * np.sum(\n", " ((X_test - sklearn_NB.theta_[j, :]) ** 2) / (sklearn_NB.sigma_[j, :]), 1\n", " )\n", "\n", " n_jk2_mine = n_jk_mine - 0.5 * np.sum(\n", " ((X_test - P[\"mean\"][j]) ** 2) / (P[\"sigma\"][j]), 1\n", " )\n", " np.testing.assert_almost_equal(n_jk2_mine, n_jk2, decimal=4)\n", "\n", " llh = jointi + n_jk2\n", " llh_mine = jointi_mine + n_jk2_mine\n", "\n", " np.testing.assert_almost_equal(llh_mine, llh, decimal=4)\n", "\n", " np.testing.assert_almost_equal(P[\"prior\"], sklearn_NB.class_prior_)\n", " np.testing.assert_almost_equal(P[\"mean\"], sklearn_NB.theta_)\n", " np.testing.assert_almost_equal(P[\"sigma\"], sklearn_NB.sigma_)\n", " np.testing.assert_almost_equal(\n", " sklearn_NB._joint_log_likelihood(X_test),\n", " NB._log_posterior(X_test),\n", " decimal=4,\n", " )\n", " np.testing.assert_almost_equal(preds, sk_preds)\n", " print(\"PASSED\")\n", " i += 1\n"]} {"path": "numpy_ml/o_tests/nn_torch_models.py", "content": ["# flake8: noqa\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "import tensorflow as tf\n", "\n", "import numpy as np\n", "\n", "#######################################################################\n", "# Gold-standard implementations for testing custom layers #\n", "# (Requires Pytorch) #\n", "#######################################################################\n", "\n", "\n", "def torchify(var, requires_grad=True):\n", " return torch.autograd.Variable(torch.FloatTensor(var), requires_grad=requires_grad)\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "def torch_xe_grad(y, z):\n", " z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)\n", " y = torch.LongTensor(y.argmax(axis=1))\n", " loss = F.cross_entropy(z, y, reduction=\"sum\")\n", " loss.backward()\n", " grad = z.grad.numpy()\n", " return grad\n", "\n", "\n", "def torch_mse_grad(y, z, act_fn):\n", " y = torch.FloatTensor(y)\n", " z = torch.autograd.Variable(torch.FloatTensor(z), requires_grad=True)\n", " y_pred = act_fn(z)\n", " loss = 
F.mse_loss(y_pred, y, reduction=\"sum\") # size_average=False).sum()\n", " loss.backward()\n", " grad = z.grad.numpy()\n", " return grad\n", "\n", "\n", "class TorchVAELoss(nn.Module):\n", " def __init__(self):\n", " super(TorchVAELoss, self).__init__()\n", "\n", " def extract_grads(self, X, X_recon, t_mean, t_log_var):\n", " eps = np.finfo(float).eps\n", " X = torchify(X, requires_grad=False)\n", " X_recon = torchify(np.clip(X_recon, eps, 1 - eps))\n", " t_mean = torchify(t_mean)\n", " t_log_var = torchify(t_log_var)\n", "\n", " BCE = torch.sum(F.binary_cross_entropy(X_recon, X, reduction=\"none\"), dim=1)\n", "\n", " # see Appendix B from VAE paper:\n", " # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014\n", " # https://arxiv.org/abs/1312.6114\n", " # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)\n", " KLD = -0.5 * torch.sum(1 + t_log_var - t_mean.pow(2) - t_log_var.exp(), dim=1)\n", "\n", " loss = torch.mean(BCE + KLD)\n", " loss.backward()\n", "\n", " grads = {\n", " \"loss\": loss.detach().numpy(),\n", " \"dX_recon\": X_recon.grad.numpy(),\n", " \"dt_mean\": t_mean.grad.numpy(),\n", " \"dt_log_var\": t_log_var.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchWGANGPLoss(nn.Module):\n", " def __init__(self, lambda_=10):\n", " self.lambda_ = torchify([lambda_])\n", " super(TorchWGANGPLoss, self).__init__()\n", "\n", " def forward(self, Y_real, Y_fake, gradInterp):\n", " GY_fake = Y_fake.copy()\n", " self.Y_real = torchify(Y_real)\n", " self.Y_fake = torchify(Y_fake)\n", " self.GY_fake = torchify(GY_fake)\n", " self.gradInterp = torchify(gradInterp)\n", "\n", " # calc grad penalty\n", " norm = self.gradInterp.norm(2, dim=1)\n", " self.norm1 = torch.sqrt(torch.sum(self.gradInterp.pow(2), dim=1))\n", " assert torch.allclose(norm, self.norm1)\n", "\n", " self.gpenalty = self.lambda_ * ((self.norm1 - 1).pow(2)).mean()\n", " self.C_loss = self.Y_fake.mean() - self.Y_real.mean() + self.gpenalty\n", " self.G_loss = -self.GY_fake.mean()\n", "\n", " def extract_grads(self, Y_real, Y_fake, gradInterp):\n", " self.forward(Y_real, Y_fake, gradInterp)\n", "\n", " self.C_loss.backward()\n", " self.G_loss.backward()\n", "\n", " grads = {\n", " \"Y_real\": self.Y_real.detach().numpy(),\n", " \"Y_fake\": self.Y_fake.detach().numpy(),\n", " \"gradInterp\": self.gradInterp.detach().numpy(),\n", " \"GP\": self.gpenalty.detach().numpy(),\n", " \"C_loss\": self.C_loss.detach().numpy(),\n", " \"G_loss\": self.G_loss.detach().numpy(),\n", " \"C_dY_real\": self.Y_real.grad.numpy(),\n", " \"C_dGradInterp\": self.gradInterp.grad.numpy(),\n", " \"C_dY_fake\": self.Y_fake.grad.numpy(),\n", " \"G_dY_fake\": self.GY_fake.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchLinearActivation(nn.Module):\n", " def __init__(self):\n", " super(TorchLinearActivation, self).__init__()\n", " pass\n", "\n", " @staticmethod\n", " def forward(input):\n", " return input\n", "\n", " @staticmethod\n", " def backward(grad_output):\n", " return torch.ones_like(grad_output)\n", "\n", "\n", "class TorchBatchNormLayer(nn.Module):\n", " def __init__(self, n_in, params, mode, momentum=0.9, epsilon=1e-5):\n", " super(TorchBatchNormLayer, self).__init__()\n", "\n", " scaler = params[\"scaler\"]\n", " intercept = params[\"intercept\"]\n", "\n", " if mode == \"1D\":\n", " self.layer1 = nn.BatchNorm1d(\n", " num_features=n_in, momentum=1 - momentum, eps=epsilon, affine=True\n", " )\n", " elif mode == \"2D\":\n", " self.layer1 = nn.BatchNorm2d(\n", " num_features=n_in, momentum=1 - momentum, 
eps=epsilon, affine=True\n", " )\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " if X.ndim == 4:\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", "\n", " if not isinstance(X, torch.Tensor):\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.Y = self.layer1(self.X)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X, Y_true=None):\n", " self.forward(X)\n", "\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3])\n", " self.loss1 = (\n", " 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum()\n", " )\n", " else:\n", " self.loss1 = self.Y.sum()\n", "\n", " self.loss1.backward()\n", "\n", " X_np = self.X.detach().numpy()\n", " Y_np = self.Y.detach().numpy()\n", " dX_np = self.X.grad.numpy()\n", " dY_np = self.Y.grad.numpy()\n", "\n", " if self.X.dim() == 4:\n", " orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, orig, X_swap)\n", " X_np = np.moveaxis(X_np, orig, X_swap)\n", " Y_np = np.moveaxis(Y_np, orig, X_swap)\n", " dX_np = np.moveaxis(dX_np, orig, X_swap)\n", " dY_np = np.moveaxis(dY_np, orig, X_swap)\n", "\n", " grads = {\n", " \"loss\": self.loss1.detach().numpy(),\n", " \"X\": X_np,\n", " \"momentum\": 1 - self.layer1.momentum,\n", " \"epsilon\": self.layer1.eps,\n", " \"intercept\": self.layer1.bias.detach().numpy(),\n", " \"scaler\": self.layer1.weight.detach().numpy(),\n", " \"running_mean\": self.layer1.running_mean.detach().numpy(),\n", " \"running_var\": self.layer1.running_var.detach().numpy(),\n", " \"y\": Y_np,\n", " \"dLdy\": dY_np,\n", " \"dLdIntercept\": self.layer1.bias.grad.numpy(),\n", " \"dLdScaler\": self.layer1.weight.grad.numpy(),\n", " \"dLdX\": dX_np,\n", " }\n", " if isinstance(Y_true, np.ndarray):\n", " grads[\"Y_true\"] = Y_true\n", " return grads\n", "\n", "\n", "class TorchLayerNormLayer(nn.Module):\n", " def __init__(self, feat_dims, params, mode, epsilon=1e-5):\n", " super(TorchLayerNormLayer, self).__init__()\n", "\n", " self.layer1 = nn.LayerNorm(\n", " normalized_shape=feat_dims, eps=epsilon, elementwise_affine=True\n", " )\n", "\n", " scaler = params[\"scaler\"]\n", " intercept = params[\"intercept\"]\n", "\n", " if mode == \"2D\":\n", " scaler = np.moveaxis(scaler, [0, 1, 2], [-2, -1, -3])\n", " intercept = np.moveaxis(intercept, [0, 1, 2], [-2, -1, -3])\n", "\n", " assert scaler.shape == self.layer1.weight.shape\n", " assert intercept.shape == self.layer1.bias.shape\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " if X.ndim == 4:\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", "\n", " if not isinstance(X, torch.Tensor):\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.Y = self.layer1(self.X)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X, Y_true=None):\n", " self.forward(X)\n", "\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, [0, 1, 2, 3], [0, -2, -1, -3])\n", " self.loss1 = (\n", " 0.5 * F.mse_loss(self.Y, torchify(Y_true), size_average=False).sum()\n", " )\n", " else:\n", " self.loss1 = self.Y.sum()\n", "\n", " self.loss1.backward()\n", "\n", " X_np = self.X.detach().numpy()\n", " Y_np = 
self.Y.detach().numpy()\n", " dX_np = self.X.grad.numpy()\n", " dY_np = self.Y.grad.numpy()\n", " intercept_np = self.layer1.bias.detach().numpy()\n", " scaler_np = self.layer1.weight.detach().numpy()\n", " dIntercept_np = self.layer1.bias.grad.numpy()\n", " dScaler_np = self.layer1.weight.grad.numpy()\n", "\n", " if self.X.dim() == 4:\n", " orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]\n", " orig_p, p_swap = [0, 1, 2], [-1, -3, -2]\n", " if isinstance(Y_true, np.ndarray):\n", " Y_true = np.moveaxis(Y_true, orig, X_swap)\n", " X_np = np.moveaxis(X_np, orig, X_swap)\n", " Y_np = np.moveaxis(Y_np, orig, X_swap)\n", " dX_np = np.moveaxis(dX_np, orig, X_swap)\n", " dY_np = np.moveaxis(dY_np, orig, X_swap)\n", " scaler_np = np.moveaxis(scaler_np, orig_p, p_swap)\n", " intercept_np = np.moveaxis(intercept_np, orig_p, p_swap)\n", " dScaler_np = np.moveaxis(dScaler_np, orig_p, p_swap)\n", " dIntercept_np = np.moveaxis(dIntercept_np, orig_p, p_swap)\n", "\n", " grads = {\n", " \"loss\": self.loss1.detach().numpy(),\n", " \"X\": X_np,\n", " \"epsilon\": self.layer1.eps,\n", " \"intercept\": intercept_np,\n", " \"scaler\": scaler_np,\n", " \"y\": Y_np,\n", " \"dLdy\": dY_np,\n", " \"dLdIntercept\": dIntercept_np,\n", " \"dLdScaler\": dScaler_np,\n", " \"dLdX\": dX_np,\n", " }\n", " if isinstance(Y_true, np.ndarray):\n", " grads[\"Y_true\"] = Y_true\n", " return grads\n", "\n", "\n", "class TorchAddLayer(nn.Module):\n", " def __init__(self, act_fn, **kwargs):\n", " super(TorchAddLayer, self).__init__()\n", " self.act_fn = act_fn\n", "\n", " def forward(self, Xs):\n", " self.Xs = []\n", " x = Xs[0].copy()\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " self.sum = x.clone()\n", " x.retain_grad()\n", " self.Xs.append(x)\n", "\n", " for i in range(1, len(Xs)):\n", " x = Xs[i]\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " x.retain_grad()\n", " self.Xs.append(x)\n", " self.sum += x\n", "\n", " self.sum.retain_grad()\n", " self.Y = self.act_fn(self.sum)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", " grads = {\n", " \"Xs\": X,\n", " \"Sum\": self.sum.detach().numpy(),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dLdY\": self.Y.grad.numpy(),\n", " \"dLdSum\": self.sum.grad.numpy(),\n", " }\n", " grads.update(\n", " {\"dLdX{}\".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)}\n", " )\n", " return grads\n", "\n", "\n", "class TorchMultiplyLayer(nn.Module):\n", " def __init__(self, act_fn, **kwargs):\n", " super(TorchMultiplyLayer, self).__init__()\n", " self.act_fn = act_fn\n", "\n", " def forward(self, Xs):\n", " self.Xs = []\n", " x = Xs[0].copy()\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " self.prod = x.clone()\n", " x.retain_grad()\n", " self.Xs.append(x)\n", "\n", " for i in range(1, len(Xs)):\n", " x = Xs[i]\n", " if not isinstance(x, torch.Tensor):\n", " x = torchify(x)\n", "\n", " x.retain_grad()\n", " self.Xs.append(x)\n", " self.prod *= x\n", "\n", " self.prod.retain_grad()\n", " self.Y = self.act_fn(self.prod)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", " grads = {\n", " \"Xs\": X,\n", " \"Prod\": self.prod.detach().numpy(),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dLdY\": self.Y.grad.numpy(),\n", " \"dLdProd\": self.prod.grad.numpy(),\n", " 
}\n", " grads.update(\n", " {\"dLdX{}\".format(i + 1): xi.grad.numpy() for i, xi in enumerate(self.Xs)}\n", " )\n", " return grads\n", "\n", "\n", "class TorchSkipConnectionIdentity(nn.Module):\n", " def __init__(self, act_fn, pad1, pad2, params, hparams, momentum=0.9, epsilon=1e-5):\n", " super(TorchSkipConnectionIdentity, self).__init__()\n", "\n", " self.conv1 = nn.Conv2d(\n", " hparams[\"in_ch\"],\n", " hparams[\"out_ch\"],\n", " hparams[\"kernel_shape1\"],\n", " padding=pad1,\n", " stride=hparams[\"stride1\"],\n", " bias=True,\n", " )\n", "\n", " self.act_fn = act_fn\n", "\n", " self.batchnorm1 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " self.conv2 = nn.Conv2d(\n", " hparams[\"out_ch\"],\n", " hparams[\"out_ch\"],\n", " hparams[\"kernel_shape2\"],\n", " padding=pad2,\n", " stride=hparams[\"stride2\"],\n", " bias=True,\n", " )\n", "\n", " self.batchnorm2 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv1\"][\"W\"]\n", " b = params[\"components\"][\"conv1\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv1.weight.shape == W.shape\n", " assert self.conv1.bias.shape == b.flatten().shape\n", " self.conv1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm1\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm1\"][\"intercept\"]\n", " self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv2\"][\"W\"]\n", " b = params[\"components\"][\"conv2\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv2.weight.shape == W.shape\n", " assert self.conv2.bias.shape == b.flatten().shape\n", " self.conv2.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm2\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm2\"][\"intercept\"]\n", " self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " if not isinstance(X, torch.Tensor):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.X.retain_grad()\n", "\n", " self.conv1_out = self.conv1(self.X)\n", " self.conv1_out.retain_grad()\n", "\n", " self.act_fn1_out = self.act_fn(self.conv1_out)\n", " self.act_fn1_out.retain_grad()\n", "\n", " self.batchnorm1_out = self.batchnorm1(self.act_fn1_out)\n", " self.batchnorm1_out.retain_grad()\n", "\n", " self.conv2_out = self.conv2(self.batchnorm1_out)\n", " self.conv2_out.retain_grad()\n", "\n", " self.batchnorm2_out = self.batchnorm2(self.conv2_out)\n", " self.batchnorm2_out.retain_grad()\n", "\n", " self.layer3_in = self.batchnorm2_out + self.X\n", " self.layer3_in.retain_grad()\n", "\n", " self.Y = self.act_fn(self.layer3_in)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, 
X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]\n", " grads = {\n", " # layer parameters\n", " \"conv1_W\": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap),\n", " \"conv1_b\": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn1_intercept\": self.batchnorm1.bias.detach().numpy(),\n", " \"bn1_scaler\": self.batchnorm1.weight.detach().numpy(),\n", " \"bn1_running_mean\": self.batchnorm1.running_mean.detach().numpy(),\n", " \"bn1_running_var\": self.batchnorm1.running_var.detach().numpy(),\n", " \"conv2_W\": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap),\n", " \"conv2_b\": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn2_intercept\": self.batchnorm2.bias.detach().numpy(),\n", " \"bn2_scaler\": self.batchnorm2.weight.detach().numpy(),\n", " \"bn2_running_mean\": self.batchnorm2.running_mean.detach().numpy(),\n", " \"bn2_running_var\": self.batchnorm2.running_var.detach().numpy(),\n", " # layer inputs/outputs (forward step)\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"conv1_out\": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap),\n", " \"act1_out\": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap),\n", " \"bn1_out\": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap),\n", " \"conv2_out\": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap),\n", " \"bn2_out\": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap),\n", " \"add_out\": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap),\n", " \"Y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " # layer gradients (backward step)\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdAdd\": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap),\n", " \"dLdBn2_out\": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv2_out\": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap),\n", " \"dLdBn1_out\": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap),\n", " \"dLdActFn1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " # layer parameter gradients (backward step)\n", " \"dLdBn2_intercept\": self.batchnorm2.bias.grad.numpy(),\n", " \"dLdBn2_scaler\": self.batchnorm2.weight.grad.numpy(),\n", " \"dLdConv2_W\": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv2_b\": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdBn1_intercept\": self.batchnorm1.bias.grad.numpy(),\n", " \"dLdBn1_scaler\": self.batchnorm1.weight.grad.numpy(),\n", " \"dLdConv1_W\": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv1_b\": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " }\n", " return grads\n", "\n", "\n", "class TorchCausalConv1d(torch.nn.Conv1d):\n", " \"\"\"https://github.com/pytorch/pytorch/issues/1333\n", "\n", " NB: this is only ensures that the convolution out length is the same as\n", " the input length IFF stride = 1. 
Otherwise, in/out lengths will differ.\n", " \"\"\"\n", "\n", " def __init__(\n", " self,\n", " in_channels,\n", " out_channels,\n", " kernel_size,\n", " stride=1,\n", " dilation=1,\n", " groups=1,\n", " bias=True,\n", " ):\n", " self.__padding = (kernel_size - 1) * dilation\n", "\n", " super(TorchCausalConv1d, self).__init__(\n", " in_channels,\n", " out_channels,\n", " kernel_size=kernel_size,\n", " stride=stride,\n", " padding=self.__padding,\n", " dilation=dilation,\n", " groups=groups,\n", " bias=bias,\n", " )\n", "\n", " def forward(self, input):\n", " result = super(TorchCausalConv1d, self).forward(input)\n", " if self.__padding != 0:\n", " return result[:, :, : -self.__padding]\n", " return result\n", "\n", "\n", "class TorchWavenetModule(nn.Module):\n", " def __init__(self, params, hparams, conv_1x1_pad):\n", " super(TorchWavenetModule, self).__init__()\n", " self.conv_dilation = TorchCausalConv1d(\n", " in_channels=hparams[\"components\"][\"conv_dilation\"][\"in_ch\"],\n", " out_channels=hparams[\"components\"][\"conv_dilation\"][\"out_ch\"],\n", " kernel_size=hparams[\"components\"][\"conv_dilation\"][\"kernel_width\"],\n", " stride=hparams[\"components\"][\"conv_dilation\"][\"stride\"],\n", " dilation=hparams[\"components\"][\"conv_dilation\"][\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " self.conv_1x1 = nn.Conv1d(\n", " in_channels=hparams[\"components\"][\"conv_1x1\"][\"in_ch\"],\n", " out_channels=hparams[\"components\"][\"conv_1x1\"][\"out_ch\"],\n", " kernel_size=hparams[\"components\"][\"conv_1x1\"][\"kernel_width\"],\n", " stride=hparams[\"components\"][\"conv_1x1\"][\"stride\"],\n", " padding=conv_1x1_pad,\n", " dilation=hparams[\"components\"][\"conv_1x1\"][\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " W = params[\"components\"][\"conv_dilation\"][\"W\"]\n", " b = params[\"components\"][\"conv_dilation\"][\"b\"]\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])\n", " self.conv_dilation.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv_dilation.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", " assert self.conv_dilation.weight.shape == W.shape\n", " assert self.conv_dilation.bias.shape == b.flatten().shape\n", "\n", " W = params[\"components\"][\"conv_1x1\"][\"W\"]\n", " b = params[\"components\"][\"conv_1x1\"][\"b\"]\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])\n", " self.conv_1x1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv_1x1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", " assert self.conv_1x1.weight.shape == W.shape\n", " assert self.conv_1x1.bias.shape == b.flatten().shape\n", "\n", " def forward(self, X_main, X_skip):\n", " # (N, W, C) -> (N, C, W)\n", " self.X_main = np.moveaxis(X_main, [0, 1, 2], [0, -1, -2])\n", " self.X_main = torchify(self.X_main)\n", " self.X_main.retain_grad()\n", "\n", " self.conv_dilation_out = self.conv_dilation(self.X_main)\n", " self.conv_dilation_out.retain_grad()\n", "\n", " self.tanh_out = torch.tanh(self.conv_dilation_out)\n", " self.sigm_out = torch.sigmoid(self.conv_dilation_out)\n", "\n", " self.tanh_out.retain_grad()\n", " self.sigm_out.retain_grad()\n", "\n", " self.multiply_gate_out = self.tanh_out * self.sigm_out\n", " self.multiply_gate_out.retain_grad()\n", "\n", " self.conv_1x1_out = self.conv_1x1(self.multiply_gate_out)\n", " self.conv_1x1_out.retain_grad()\n", "\n", " self.X_skip = torch.zeros_like(self.conv_1x1_out)\n", " if X_skip is not None:\n", " 
self.X_skip = torchify(np.moveaxis(X_skip, [0, 1, 2], [0, -1, -2]))\n", " self.X_skip.retain_grad()\n", "\n", " self.Y_skip = self.X_skip + self.conv_1x1_out\n", " self.Y_main = self.X_main + self.conv_1x1_out\n", "\n", " self.Y_skip.retain_grad()\n", " self.Y_main.retain_grad()\n", "\n", " def extract_grads(self, X_main, X_skip):\n", " self.forward(X_main, X_skip)\n", " self.loss = (self.Y_skip + self.Y_main).sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out)\n", " # X (theirs): (N, C, W) -> X (mine): (N, W, C)\n", " # Y (theirs): (N, C, W) -> Y (mine): (N, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3]\n", " grads = {\n", " \"X_main\": np.moveaxis(self.X_main.detach().numpy(), orig, X_swap),\n", " \"X_skip\": np.moveaxis(self.X_skip.detach().numpy(), orig, X_swap),\n", " \"conv_dilation_W\": np.moveaxis(\n", " self.conv_dilation.weight.detach().numpy(), orig, W_swap\n", " ),\n", " \"conv_dilation_b\": self.conv_dilation.bias.detach()\n", " .numpy()\n", " .reshape(1, 1, -1),\n", " \"conv_1x1_W\": np.moveaxis(\n", " self.conv_1x1.weight.detach().numpy(), orig, W_swap\n", " ),\n", " \"conv_1x1_b\": self.conv_1x1.bias.detach().numpy().reshape(1, 1, -1),\n", " \"conv_dilation_out\": np.moveaxis(\n", " self.conv_dilation_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"tanh_out\": np.moveaxis(self.tanh_out.detach().numpy(), orig, X_swap),\n", " \"sigm_out\": np.moveaxis(self.sigm_out.detach().numpy(), orig, X_swap),\n", " \"multiply_gate_out\": np.moveaxis(\n", " self.multiply_gate_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"conv_1x1_out\": np.moveaxis(\n", " self.conv_1x1_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"Y_main\": np.moveaxis(self.Y_main.detach().numpy(), orig, X_swap),\n", " \"Y_skip\": np.moveaxis(self.Y_skip.detach().numpy(), orig, X_swap),\n", " \"dLdY_skip\": np.moveaxis(self.Y_skip.grad.numpy(), orig, X_swap),\n", " \"dLdY_main\": np.moveaxis(self.Y_main.grad.numpy(), orig, X_swap),\n", " \"dLdConv_1x1_out\": np.moveaxis(\n", " self.conv_1x1_out.grad.numpy(), orig, X_swap\n", " ),\n", " \"dLdConv_1x1_W\": np.moveaxis(\n", " self.conv_1x1.weight.grad.numpy(), orig, W_swap\n", " ),\n", " \"dLdConv_1x1_b\": self.conv_1x1.bias.grad.numpy().reshape(1, 1, -1),\n", " \"dLdMultiply_out\": np.moveaxis(\n", " self.multiply_gate_out.grad.numpy(), orig, X_swap\n", " ),\n", " \"dLdTanh_out\": np.moveaxis(self.tanh_out.grad.numpy(), orig, X_swap),\n", " \"dLdSigm_out\": np.moveaxis(self.sigm_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv_dilation_out\": np.moveaxis(\n", " self.conv_dilation_out.grad.numpy(), orig, X_swap\n", " ),\n", " \"dLdConv_dilation_W\": np.moveaxis(\n", " self.conv_dilation.weight.grad.numpy(), orig, W_swap\n", " ),\n", " \"dLdConv_dilation_b\": self.conv_dilation.bias.grad.numpy().reshape(\n", " 1, 1, -1\n", " ),\n", " \"dLdX_main\": np.moveaxis(self.X_main.grad.numpy(), orig, X_swap),\n", " \"dLdX_skip\": np.moveaxis(self.X_skip.grad.numpy(), orig, X_swap),\n", " }\n", "\n", " return grads\n", "\n", "\n", "class TorchSkipConnectionConv(nn.Module):\n", " def __init__(\n", " self, act_fn, pad1, pad2, pad_skip, params, hparams, momentum=0.9, epsilon=1e-5\n", " ):\n", " super(TorchSkipConnectionConv, self).__init__()\n", "\n", " self.conv1 = nn.Conv2d(\n", " hparams[\"in_ch\"],\n", " hparams[\"out_ch1\"],\n", " hparams[\"kernel_shape1\"],\n", " padding=pad1,\n", " stride=hparams[\"stride1\"],\n", " bias=True,\n", " )\n", "\n", " self.act_fn = act_fn\n", "\n", " 
self.batchnorm1 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch1\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " self.conv2 = nn.Conv2d(\n", " hparams[\"out_ch1\"],\n", " hparams[\"out_ch2\"],\n", " hparams[\"kernel_shape2\"],\n", " padding=pad2,\n", " stride=hparams[\"stride2\"],\n", " bias=True,\n", " )\n", "\n", " self.batchnorm2 = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch2\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " self.conv_skip = nn.Conv2d(\n", " hparams[\"in_ch\"],\n", " hparams[\"out_ch2\"],\n", " hparams[\"kernel_shape_skip\"],\n", " padding=pad_skip,\n", " stride=hparams[\"stride_skip\"],\n", " bias=True,\n", " )\n", "\n", " self.batchnorm_skip = nn.BatchNorm2d(\n", " num_features=hparams[\"out_ch2\"],\n", " momentum=1 - momentum,\n", " eps=epsilon,\n", " affine=True,\n", " )\n", "\n", " orig, W_swap = [0, 1, 2, 3], [-2, -1, -3, -4]\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv1\"][\"W\"]\n", " b = params[\"components\"][\"conv1\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv1.weight.shape == W.shape\n", " assert self.conv1.bias.shape == b.flatten().shape\n", " self.conv1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm1\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm1\"][\"intercept\"]\n", " self.batchnorm1.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm1.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = params[\"components\"][\"conv2\"][\"W\"]\n", " b = params[\"components\"][\"conv2\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv2.weight.shape == W.shape\n", " assert self.conv2.bias.shape == b.flatten().shape\n", " self.conv2.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv2.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm2\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm2\"][\"intercept\"]\n", " self.batchnorm2.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm2.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " W = params[\"components\"][\"conv_skip\"][\"W\"]\n", " b = params[\"components\"][\"conv_skip\"][\"b\"]\n", " W = np.moveaxis(W, orig, W_swap)\n", " assert self.conv_skip.weight.shape == W.shape\n", " assert self.conv_skip.bias.shape == b.flatten().shape\n", " self.conv_skip.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.conv_skip.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " scaler = params[\"components\"][\"batchnorm_skip\"][\"scaler\"]\n", " intercept = params[\"components\"][\"batchnorm_skip\"][\"intercept\"]\n", " self.batchnorm_skip.weight = nn.Parameter(torch.FloatTensor(scaler))\n", " self.batchnorm_skip.bias = nn.Parameter(torch.FloatTensor(intercept))\n", "\n", " def forward(self, X):\n", " if not isinstance(X, torch.Tensor):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " X = torchify(X)\n", "\n", " self.X = X\n", " self.X.retain_grad()\n", "\n", " self.conv1_out = self.conv1(self.X)\n", " self.conv1_out.retain_grad()\n", "\n", " self.act_fn1_out = self.act_fn(self.conv1_out)\n", " self.act_fn1_out.retain_grad()\n", 
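" # remainder of the main path: batchnorm1 -> conv2 -> batchnorm2; skip path: conv_skip -> batchnorm_skip; the two branches are summed, then passed through act_fn\n",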
"\n", " self.batchnorm1_out = self.batchnorm1(self.act_fn1_out)\n", " self.batchnorm1_out.retain_grad()\n", "\n", " self.conv2_out = self.conv2(self.batchnorm1_out)\n", " self.conv2_out.retain_grad()\n", "\n", " self.batchnorm2_out = self.batchnorm2(self.conv2_out)\n", " self.batchnorm2_out.retain_grad()\n", "\n", " self.c_skip_out = self.conv_skip(self.X)\n", " self.c_skip_out.retain_grad()\n", "\n", " self.bn_skip_out = self.batchnorm_skip(self.c_skip_out)\n", " self.bn_skip_out.retain_grad()\n", "\n", " self.layer3_in = self.batchnorm2_out + self.bn_skip_out\n", " self.layer3_in.retain_grad()\n", "\n", " self.Y = self.act_fn(self.layer3_in)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]\n", " grads = {\n", " # layer parameters\n", " \"conv1_W\": np.moveaxis(self.conv1.weight.detach().numpy(), orig, W_swap),\n", " \"conv1_b\": self.conv1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn1_intercept\": self.batchnorm1.bias.detach().numpy(),\n", " \"bn1_scaler\": self.batchnorm1.weight.detach().numpy(),\n", " \"bn1_running_mean\": self.batchnorm1.running_mean.detach().numpy(),\n", " \"bn1_running_var\": self.batchnorm1.running_var.detach().numpy(),\n", " \"conv2_W\": np.moveaxis(self.conv2.weight.detach().numpy(), orig, W_swap),\n", " \"conv2_b\": self.conv2.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn2_intercept\": self.batchnorm2.bias.detach().numpy(),\n", " \"bn2_scaler\": self.batchnorm2.weight.detach().numpy(),\n", " \"bn2_running_mean\": self.batchnorm2.running_mean.detach().numpy(),\n", " \"bn2_running_var\": self.batchnorm2.running_var.detach().numpy(),\n", " \"conv_skip_W\": np.moveaxis(\n", " self.conv_skip.weight.detach().numpy(), orig, W_swap\n", " ),\n", " \"conv_skip_b\": self.conv_skip.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"bn_skip_intercept\": self.batchnorm_skip.bias.detach().numpy(),\n", " \"bn_skip_scaler\": self.batchnorm_skip.weight.detach().numpy(),\n", " \"bn_skip_running_mean\": self.batchnorm_skip.running_mean.detach().numpy(),\n", " \"bn_skip_running_var\": self.batchnorm_skip.running_var.detach().numpy(),\n", " # layer inputs/outputs (forward step)\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"conv1_out\": np.moveaxis(self.conv1_out.detach().numpy(), orig, X_swap),\n", " \"act1_out\": np.moveaxis(self.act_fn1_out.detach().numpy(), orig, X_swap),\n", " \"bn1_out\": np.moveaxis(self.batchnorm1_out.detach().numpy(), orig, X_swap),\n", " \"conv2_out\": np.moveaxis(self.conv2_out.detach().numpy(), orig, X_swap),\n", " \"bn2_out\": np.moveaxis(self.batchnorm2_out.detach().numpy(), orig, X_swap),\n", " \"conv_skip_out\": np.moveaxis(\n", " self.c_skip_out.detach().numpy(), orig, X_swap\n", " ),\n", " \"bn_skip_out\": np.moveaxis(self.bn_skip_out.detach().numpy(), orig, X_swap),\n", " \"add_out\": np.moveaxis(self.layer3_in.detach().numpy(), orig, X_swap),\n", " \"Y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " # layer gradients (backward step)\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdAdd\": np.moveaxis(self.layer3_in.grad.numpy(), orig, X_swap),\n", " \"dLdBnSkip_out\": np.moveaxis(self.bn_skip_out.grad.numpy(), orig, X_swap),\n", " \"dLdConvSkip_out\": np.moveaxis(self.c_skip_out.grad.numpy(), orig, X_swap),\n", " \"dLdBn2_out\": np.moveaxis(self.batchnorm2_out.grad.numpy(), orig, 
X_swap),\n", " \"dLdConv2_out\": np.moveaxis(self.conv2_out.grad.numpy(), orig, X_swap),\n", " \"dLdBn1_out\": np.moveaxis(self.batchnorm1_out.grad.numpy(), orig, X_swap),\n", " \"dLdActFn1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdConv1_out\": np.moveaxis(self.act_fn1_out.grad.numpy(), orig, X_swap),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " # layer parameter gradients (backward step)\n", " \"dLdBnSkip_intercept\": self.batchnorm_skip.bias.grad.numpy(),\n", " \"dLdBnSkip_scaler\": self.batchnorm_skip.weight.grad.numpy(),\n", " \"dLdConvSkip_W\": np.moveaxis(\n", " self.conv_skip.weight.grad.numpy(), orig, W_swap\n", " ),\n", " \"dLdConvSkip_b\": self.conv_skip.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdBn2_intercept\": self.batchnorm2.bias.grad.numpy(),\n", " \"dLdBn2_scaler\": self.batchnorm2.weight.grad.numpy(),\n", " \"dLdConv2_W\": np.moveaxis(self.conv2.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv2_b\": self.conv2.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdBn1_intercept\": self.batchnorm1.bias.grad.numpy(),\n", " \"dLdBn1_scaler\": self.batchnorm1.weight.grad.numpy(),\n", " \"dLdConv1_W\": np.moveaxis(self.conv1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdConv1_b\": self.conv1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " }\n", " return grads\n", "\n", "\n", "class TorchBidirectionalLSTM(nn.Module):\n", " def __init__(self, n_in, n_out, params, **kwargs):\n", " super(TorchBidirectionalLSTM, self).__init__()\n", "\n", " self.layer1 = nn.LSTM(\n", " input_size=n_in,\n", " hidden_size=n_out,\n", " num_layers=1,\n", " bidirectional=True,\n", " bias=True,\n", " )\n", "\n", " Wiu = params[\"components\"][\"cell_fwd\"][\"Wu\"][n_out:, :].T\n", " Wif = params[\"components\"][\"cell_fwd\"][\"Wf\"][n_out:, :].T\n", " Wic = params[\"components\"][\"cell_fwd\"][\"Wc\"][n_out:, :].T\n", " Wio = params[\"components\"][\"cell_fwd\"][\"Wo\"][n_out:, :].T\n", " W_ih_f = np.vstack([Wiu, Wif, Wic, Wio])\n", "\n", " Whu = params[\"components\"][\"cell_fwd\"][\"Wu\"][:n_out, :].T\n", " Whf = params[\"components\"][\"cell_fwd\"][\"Wf\"][:n_out, :].T\n", " Whc = params[\"components\"][\"cell_fwd\"][\"Wc\"][:n_out, :].T\n", " Who = params[\"components\"][\"cell_fwd\"][\"Wo\"][:n_out, :].T\n", " W_hh_f = np.vstack([Whu, Whf, Whc, Who])\n", "\n", " assert self.layer1.weight_ih_l0.shape == W_ih_f.shape\n", " assert self.layer1.weight_hh_l0.shape == W_hh_f.shape\n", "\n", " self.layer1.weight_ih_l0 = nn.Parameter(torch.FloatTensor(W_ih_f))\n", " self.layer1.weight_hh_l0 = nn.Parameter(torch.FloatTensor(W_hh_f))\n", "\n", " Wiu = params[\"components\"][\"cell_bwd\"][\"Wu\"][n_out:, :].T\n", " Wif = params[\"components\"][\"cell_bwd\"][\"Wf\"][n_out:, :].T\n", " Wic = params[\"components\"][\"cell_bwd\"][\"Wc\"][n_out:, :].T\n", " Wio = params[\"components\"][\"cell_bwd\"][\"Wo\"][n_out:, :].T\n", " W_ih_b = np.vstack([Wiu, Wif, Wic, Wio])\n", "\n", " Whu = params[\"components\"][\"cell_bwd\"][\"Wu\"][:n_out, :].T\n", " Whf = params[\"components\"][\"cell_bwd\"][\"Wf\"][:n_out, :].T\n", " Whc = params[\"components\"][\"cell_bwd\"][\"Wc\"][:n_out, :].T\n", " Who = params[\"components\"][\"cell_bwd\"][\"Wo\"][:n_out, :].T\n", " W_hh_b = np.vstack([Whu, Whf, Whc, Who])\n", "\n", " assert self.layer1.weight_ih_l0_reverse.shape == W_ih_b.shape\n", " assert self.layer1.weight_hh_l0_reverse.shape == W_hh_b.shape\n", "\n", " self.layer1.weight_ih_l0_reverse = nn.Parameter(torch.FloatTensor(W_ih_b))\n", " 
self.layer1.weight_hh_l0_reverse = nn.Parameter(torch.FloatTensor(W_hh_b))\n", "\n", " b_f = np.concatenate(\n", " [\n", " params[\"components\"][\"cell_fwd\"][\"bu\"],\n", " params[\"components\"][\"cell_fwd\"][\"bf\"],\n", " params[\"components\"][\"cell_fwd\"][\"bc\"],\n", " params[\"components\"][\"cell_fwd\"][\"bo\"],\n", " ],\n", " axis=-1,\n", " ).flatten()\n", "\n", " assert self.layer1.bias_ih_l0.shape == b_f.shape\n", " assert self.layer1.bias_hh_l0.shape == b_f.shape\n", "\n", " self.layer1.bias_ih_l0 = nn.Parameter(torch.FloatTensor(b_f))\n", " self.layer1.bias_hh_l0 = nn.Parameter(torch.FloatTensor(b_f))\n", "\n", " b_b = np.concatenate(\n", " [\n", " params[\"components\"][\"cell_bwd\"][\"bu\"],\n", " params[\"components\"][\"cell_bwd\"][\"bf\"],\n", " params[\"components\"][\"cell_bwd\"][\"bc\"],\n", " params[\"components\"][\"cell_bwd\"][\"bo\"],\n", " ],\n", " axis=-1,\n", " ).flatten()\n", "\n", " assert self.layer1.bias_ih_l0_reverse.shape == b_b.shape\n", " assert self.layer1.bias_hh_l0_reverse.shape == b_b.shape\n", "\n", " self.layer1.bias_ih_l0_reverse = nn.Parameter(torch.FloatTensor(b_b))\n", " self.layer1.bias_hh_l0_reverse = nn.Parameter(torch.FloatTensor(b_b))\n", "\n", " def forward(self, X):\n", " # (batch, input_size, seq_len) -> (seq_len, batch, input_size)\n", " self.X = np.moveaxis(X, [0, 1, 2], [-2, -1, -3])\n", "\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " # initial hidden state is 0\n", " n_ex, n_in, n_timesteps = self.X.shape\n", " n_out, n_out = self.layer1.weight_hh_l0.shape\n", "\n", " # forward pass\n", " self.A, (At, Ct) = self.layer1(self.X)\n", " self.A.retain_grad()\n", " return self.A\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.A.sum()\n", " self.loss.backward()\n", "\n", " # forward\n", " w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0.chunk(4, 0)\n", " w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0.chunk(4, 0)\n", " bu_f, bf_f, bc_f, bo_f = self.layer1.bias_ih_l0.chunk(4, 0)\n", "\n", " Wu_f = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0)\n", " Wf_f = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0)\n", " Wc_f = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0)\n", " Wo_f = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0)\n", "\n", " dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0.grad.chunk(4, 0)\n", " dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0.grad.chunk(4, 0)\n", " dbu_f, dbf_f, dbc_f, dbo_f = self.layer1.bias_ih_l0.grad.chunk(4, 0)\n", "\n", " dWu_f = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0)\n", " dWf_f = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0)\n", " dWc_f = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0)\n", " dWo_f = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0)\n", "\n", " # backward\n", " w_ii, w_if, w_ic, w_io = self.layer1.weight_ih_l0_reverse.chunk(4, 0)\n", " w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh_l0_reverse.chunk(4, 0)\n", " bu_b, bf_b, bc_b, bo_b = self.layer1.bias_ih_l0_reverse.chunk(4, 0)\n", "\n", " Wu_b = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0)\n", " Wf_b = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0)\n", " Wc_b = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0)\n", " Wo_b = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0)\n", "\n", " dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih_l0_reverse.grad.chunk(4, 0)\n", " dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh_l0_reverse.grad.chunk(4, 0)\n", " dbu_b, dbf_b, dbc_b, dbo_b = 
self.layer1.bias_ih_l0_reverse.grad.chunk(4, 0)\n", "\n", " dWu_b = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0)\n", " dWf_b = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0)\n", " dWc_b = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0)\n", " dWo_b = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0)\n", "\n", " orig, X_swap = [0, 1, 2], [-1, -3, -2]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"Wu_f\": Wu_f.detach().numpy(),\n", " \"Wf_f\": Wf_f.detach().numpy(),\n", " \"Wc_f\": Wc_f.detach().numpy(),\n", " \"Wo_f\": Wo_f.detach().numpy(),\n", " \"bu_f\": bu_f.detach().numpy().reshape(-1, 1),\n", " \"bf_f\": bf_f.detach().numpy().reshape(-1, 1),\n", " \"bc_f\": bc_f.detach().numpy().reshape(-1, 1),\n", " \"bo_f\": bo_f.detach().numpy().reshape(-1, 1),\n", " \"Wu_b\": Wu_b.detach().numpy(),\n", " \"Wf_b\": Wf_b.detach().numpy(),\n", " \"Wc_b\": Wc_b.detach().numpy(),\n", " \"Wo_b\": Wo_b.detach().numpy(),\n", " \"bu_b\": bu_b.detach().numpy().reshape(-1, 1),\n", " \"bf_b\": bf_b.detach().numpy().reshape(-1, 1),\n", " \"bc_b\": bc_b.detach().numpy().reshape(-1, 1),\n", " \"bo_b\": bo_b.detach().numpy().reshape(-1, 1),\n", " \"y\": np.moveaxis(self.A.detach().numpy(), orig, X_swap),\n", " \"dLdA\": self.A.grad.numpy(),\n", " \"dLdWu_f\": dWu_f.numpy(),\n", " \"dLdWf_f\": dWf_f.numpy(),\n", " \"dLdWc_f\": dWc_f.numpy(),\n", " \"dLdWo_f\": dWo_f.numpy(),\n", " \"dLdBu_f\": dbu_f.numpy().reshape(-1, 1),\n", " \"dLdBf_f\": dbf_f.numpy().reshape(-1, 1),\n", " \"dLdBc_f\": dbc_f.numpy().reshape(-1, 1),\n", " \"dLdBo_f\": dbo_f.numpy().reshape(-1, 1),\n", " \"dLdWu_b\": dWu_b.numpy(),\n", " \"dLdWf_b\": dWf_b.numpy(),\n", " \"dLdWc_b\": dWc_b.numpy(),\n", " \"dLdWo_b\": dWo_b.numpy(),\n", " \"dLdBu_b\": dbu_b.numpy().reshape(-1, 1),\n", " \"dLdBf_b\": dbf_b.numpy().reshape(-1, 1),\n", " \"dLdBc_b\": dbc_b.numpy().reshape(-1, 1),\n", " \"dLdBo_b\": dbo_b.numpy().reshape(-1, 1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchPool2DLayer(nn.Module):\n", " def __init__(self, in_channels, hparams, **kwargs):\n", " super(TorchPool2DLayer, self).__init__()\n", "\n", " if hparams[\"mode\"] == \"max\":\n", " self.layer1 = nn.MaxPool2d(\n", " kernel_size=hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " )\n", " elif hparams[\"mode\"] == \"average\":\n", " self.layer1 = nn.AvgPool2d(\n", " kernel_size=hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " )\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", " self.Y = self.layer1(self.X)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)\n", " # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C)\n", " # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C)\n", " orig, X_swap = [0, 1, 2, 3], [0, -1, -3, -2]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdX\": 
np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchConv2DLayer(nn.Module):\n", " def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):\n", " super(TorchConv2DLayer, self).__init__()\n", "\n", " W = params[\"W\"]\n", " b = params[\"b\"]\n", " self.act_fn = act_fn\n", "\n", " self.layer1 = nn.Conv2d(\n", " in_channels,\n", " out_channels,\n", " hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " dilation=hparams[\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_out, n_in, f[0], f[1])\n", " W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -3, -4])\n", " assert self.layer1.weight.shape == W.shape\n", " assert self.layer1.bias.shape == b.flatten().shape\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " self.Z = self.layer1(self.X)\n", " self.Z.retain_grad()\n", "\n", " self.Y = self.act_fn(self.Z)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)\n", " # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C)\n", " # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-1, -2, -4, -3]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"W\": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap),\n", " \"b\": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdZ\": np.moveaxis(self.Z.grad.numpy(), orig, X_swap),\n", " \"dLdW\": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdB\": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchConv1DLayer(nn.Module):\n", " def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):\n", " super(TorchConv1DLayer, self).__init__()\n", "\n", " W = params[\"W\"]\n", " b = params[\"b\"]\n", " self.act_fn = act_fn\n", "\n", " self.layer1 = nn.Conv1d(\n", " in_channels,\n", " out_channels,\n", " hparams[\"kernel_width\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " dilation=hparams[\"dilation\"] + 1,\n", " bias=True,\n", " )\n", "\n", " # (f[0], n_in, n_out) -> (n_out, n_in, f[0])\n", " W = np.moveaxis(W, [0, 1, 2], [-1, -2, -3])\n", " assert self.layer1.weight.shape == W.shape\n", " assert self.layer1.bias.shape == b.flatten().shape\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " def forward(self, X):\n", " # (N, W, C) -> (N, C, W)\n", " self.X = np.moveaxis(X, [0, 1, 2], [0, -1, -2])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " self.Z = self.layer1(self.X)\n", " 
self.Z.retain_grad()\n", "\n", " self.Y = self.act_fn(self.Z)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_out, n_in, f[0]) -> W (mine): (f[0], n_in, n_out)\n", " # X (theirs): (N, C, W) -> X (mine): (N, W, C)\n", " # Y (theirs): (N, C, W) -> Y (mine): (N, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2], [0, -1, -2], [-1, -2, -3]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"W\": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap),\n", " \"b\": self.layer1.bias.detach().numpy().reshape(1, 1, -1),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdZ\": np.moveaxis(self.Z.grad.numpy(), orig, X_swap),\n", " \"dLdW\": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdB\": self.layer1.bias.grad.numpy().reshape(1, 1, -1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchDeconv2DLayer(nn.Module):\n", " def __init__(self, in_channels, out_channels, act_fn, params, hparams, **kwargs):\n", " super(TorchDeconv2DLayer, self).__init__()\n", "\n", " W = params[\"W\"]\n", " b = params[\"b\"]\n", " self.act_fn = act_fn\n", "\n", " self.layer1 = nn.ConvTranspose2d(\n", " in_channels,\n", " out_channels,\n", " hparams[\"kernel_shape\"],\n", " padding=hparams[\"pad\"],\n", " stride=hparams[\"stride\"],\n", " dilation=1,\n", " bias=True,\n", " )\n", "\n", " # (f[0], f[1], n_in, n_out) -> (n_in, n_out, f[0], f[1])\n", " W = np.moveaxis(W, [0, 1, 2, 3], [-2, -1, -4, -3])\n", " assert self.layer1.weight.shape == W.shape\n", " assert self.layer1.bias.shape == b.flatten().shape\n", "\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(W))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(b.flatten()))\n", "\n", " def forward(self, X):\n", " # (N, H, W, C) -> (N, C, H, W)\n", " self.X = np.moveaxis(X, [0, 1, 2, 3], [0, -2, -1, -3])\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " self.Z = self.layer1(self.X)\n", " self.Z.retain_grad()\n", "\n", " self.Y = self.act_fn(self.Z)\n", " self.Y.retain_grad()\n", " return self.Y\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = self.Y.sum()\n", " self.loss.backward()\n", "\n", " # W (theirs): (n_in, n_out, f[0], f[1]) -> W (mine): (f[0], f[1], n_in, n_out)\n", " # X (theirs): (N, C, H, W) -> X (mine): (N, H, W, C)\n", " # Y (theirs): (N, C, H, W) -> Y (mine): (N, H, W, C)\n", " orig, X_swap, W_swap = [0, 1, 2, 3], [0, -1, -3, -2], [-2, -1, -4, -3]\n", " grads = {\n", " \"X\": np.moveaxis(self.X.detach().numpy(), orig, X_swap),\n", " \"W\": np.moveaxis(self.layer1.weight.detach().numpy(), orig, W_swap),\n", " \"b\": self.layer1.bias.detach().numpy().reshape(1, 1, 1, -1),\n", " \"y\": np.moveaxis(self.Y.detach().numpy(), orig, X_swap),\n", " \"dLdY\": np.moveaxis(self.Y.grad.numpy(), orig, X_swap),\n", " \"dLdZ\": np.moveaxis(self.Z.grad.numpy(), orig, X_swap),\n", " \"dLdW\": np.moveaxis(self.layer1.weight.grad.numpy(), orig, W_swap),\n", " \"dLdB\": self.layer1.bias.grad.numpy().reshape(1, 1, 1, -1),\n", " \"dLdX\": np.moveaxis(self.X.grad.numpy(), orig, X_swap),\n", " }\n", " return grads\n", "\n", "\n", "class TorchLSTMCell(nn.Module):\n", " def __init__(self, n_in, n_out, params, 
**kwargs):\n", " super(TorchLSTMCell, self).__init__()\n", "\n", " Wiu = params[\"Wu\"][n_out:, :].T\n", " Wif = params[\"Wf\"][n_out:, :].T\n", " Wic = params[\"Wc\"][n_out:, :].T\n", " Wio = params[\"Wo\"][n_out:, :].T\n", " W_ih = np.vstack([Wiu, Wif, Wic, Wio])\n", "\n", " Whu = params[\"Wu\"][:n_out, :].T\n", " Whf = params[\"Wf\"][:n_out, :].T\n", " Whc = params[\"Wc\"][:n_out, :].T\n", " Who = params[\"Wo\"][:n_out, :].T\n", " W_hh = np.vstack([Whu, Whf, Whc, Who])\n", "\n", " self.layer1 = nn.LSTMCell(input_size=n_in, hidden_size=n_out, bias=True)\n", " assert self.layer1.weight_ih.shape == W_ih.shape\n", " assert self.layer1.weight_hh.shape == W_hh.shape\n", " self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(W_ih))\n", " self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(W_hh))\n", "\n", " b = np.concatenate(\n", " [params[\"bu\"], params[\"bf\"], params[\"bc\"], params[\"bo\"]], axis=-1\n", " ).flatten()\n", " assert self.layer1.bias_ih.shape == b.shape\n", " assert self.layer1.bias_hh.shape == b.shape\n", " self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(b))\n", " self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(b))\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " # initial hidden state is 0\n", " n_ex, n_in, n_timesteps = self.X.shape\n", " n_out, n_out = self.layer1.weight_hh.shape\n", "\n", " # initialize hidden states\n", " a0 = torchify(np.zeros((n_ex, n_out)))\n", " c0 = torchify(np.zeros((n_ex, n_out)))\n", " a0.retain_grad()\n", " c0.retain_grad()\n", "\n", " # forward pass\n", " A, C = [], []\n", " at = a0\n", " ct = c0\n", " for t in range(n_timesteps):\n", " A.append(at)\n", " C.append(ct)\n", " at1, ct1 = self.layer1(self.X[:, :, t], (at, ct))\n", " at.retain_grad()\n", " ct.retain_grad()\n", " at = at1\n", " ct = ct1\n", "\n", " at.retain_grad()\n", " ct.retain_grad()\n", " A.append(at)\n", " C.append(ct)\n", "\n", " # don't inclue a0 in our outputs\n", " self.A = A[1:]\n", " self.C = C[1:]\n", " return self.A, self.C\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = torch.stack(self.A).sum()\n", " self.loss.backward()\n", "\n", " w_ii, w_if, w_ic, w_io = self.layer1.weight_ih.chunk(4, 0)\n", " w_hi, w_hf, w_hc, w_ho = self.layer1.weight_hh.chunk(4, 0)\n", " bu, bf, bc, bo = self.layer1.bias_ih.chunk(4, 0)\n", "\n", " Wu = torch.cat([torch.t(w_hi), torch.t(w_ii)], dim=0)\n", " Wf = torch.cat([torch.t(w_hf), torch.t(w_if)], dim=0)\n", " Wc = torch.cat([torch.t(w_hc), torch.t(w_ic)], dim=0)\n", " Wo = torch.cat([torch.t(w_ho), torch.t(w_io)], dim=0)\n", "\n", " dw_ii, dw_if, dw_ic, dw_io = self.layer1.weight_ih.grad.chunk(4, 0)\n", " dw_hi, dw_hf, dw_hc, dw_ho = self.layer1.weight_hh.grad.chunk(4, 0)\n", " dbu, dbf, dbc, dbo = self.layer1.bias_ih.grad.chunk(4, 0)\n", "\n", " dWu = torch.cat([torch.t(dw_hi), torch.t(dw_ii)], dim=0)\n", " dWf = torch.cat([torch.t(dw_hf), torch.t(dw_if)], dim=0)\n", " dWc = torch.cat([torch.t(dw_hc), torch.t(dw_ic)], dim=0)\n", " dWo = torch.cat([torch.t(dw_ho), torch.t(dw_io)], dim=0)\n", "\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"Wu\": Wu.detach().numpy(),\n", " \"Wf\": Wf.detach().numpy(),\n", " \"Wc\": Wc.detach().numpy(),\n", " \"Wo\": Wo.detach().numpy(),\n", " \"bu\": bu.detach().numpy().reshape(-1, 1),\n", " \"bf\": bf.detach().numpy().reshape(-1, 1),\n", " \"bc\": bc.detach().numpy().reshape(-1, 1),\n", " \"bo\": 
bo.detach().numpy().reshape(-1, 1),\n", " \"C\": torch.stack(self.C).detach().numpy(),\n", " \"y\": np.swapaxes(\n", " np.swapaxes(torch.stack(self.A).detach().numpy(), 1, 0), 1, 2\n", " ),\n", " \"dLdA\": np.array([a.grad.numpy() for a in self.A]),\n", " \"dLdWu\": dWu.numpy(),\n", " \"dLdWf\": dWf.numpy(),\n", " \"dLdWc\": dWc.numpy(),\n", " \"dLdWo\": dWo.numpy(),\n", " \"dLdBu\": dbu.numpy().reshape(-1, 1),\n", " \"dLdBf\": dbf.numpy().reshape(-1, 1),\n", " \"dLdBc\": dbc.numpy().reshape(-1, 1),\n", " \"dLdBo\": dbo.numpy().reshape(-1, 1),\n", " \"dLdX\": self.X.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchRNNCell(nn.Module):\n", " def __init__(self, n_in, n_hid, params, **kwargs):\n", " super(TorchRNNCell, self).__init__()\n", "\n", " self.layer1 = nn.RNNCell(n_in, n_hid, bias=True, nonlinearity=\"tanh\")\n", "\n", " # set weights and bias to match those of RNNCell\n", " # NB: we pass the *transpose* of the RNNCell weights and biases to\n", " # pytorch, meaning we need to check against the *transpose* of our\n", " # outputs for any function of the weights\n", " self.layer1.weight_ih = nn.Parameter(torch.FloatTensor(params[\"Wax\"].T))\n", " self.layer1.weight_hh = nn.Parameter(torch.FloatTensor(params[\"Waa\"].T))\n", " self.layer1.bias_ih = nn.Parameter(torch.FloatTensor(params[\"bx\"].T))\n", " self.layer1.bias_hh = nn.Parameter(torch.FloatTensor(params[\"ba\"].T))\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(self.X, torch.Tensor):\n", " self.X = torchify(self.X)\n", "\n", " self.X.retain_grad()\n", "\n", " # initial hidden state is 0\n", " n_ex, n_in, n_timesteps = self.X.shape\n", " n_out, n_out = self.layer1.weight_hh.shape\n", "\n", " # initialize hidden states\n", " a0 = torchify(np.zeros((n_ex, n_out)))\n", " a0.retain_grad()\n", "\n", " # forward pass\n", " A = []\n", " at = a0\n", " for t in range(n_timesteps):\n", " A += [at]\n", " at1 = self.layer1(self.X[:, :, t], at)\n", " at.retain_grad()\n", " at = at1\n", "\n", " at.retain_grad()\n", " A += [at]\n", "\n", " # don't inclue a0 in our outputs\n", " self.A = A[1:]\n", " return self.A\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss = torch.stack(self.A).sum()\n", " self.loss.backward()\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"ba\": self.layer1.bias_hh.detach().numpy(),\n", " \"bx\": self.layer1.bias_ih.detach().numpy(),\n", " \"Wax\": self.layer1.weight_ih.detach().numpy(),\n", " \"Waa\": self.layer1.weight_hh.detach().numpy(),\n", " \"y\": torch.stack(self.A).detach().numpy(),\n", " \"dLdA\": np.array([a.grad.numpy() for a in self.A]),\n", " \"dLdWaa\": self.layer1.weight_hh.grad.numpy(),\n", " \"dLdWax\": self.layer1.weight_ih.grad.numpy(),\n", " \"dLdBa\": self.layer1.bias_hh.grad.numpy(),\n", " \"dLdBx\": self.layer1.bias_ih.grad.numpy(),\n", " \"dLdX\": self.X.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchFCLayer(nn.Module):\n", " def __init__(self, n_in, n_hid, act_fn, params, **kwargs):\n", " super(TorchFCLayer, self).__init__()\n", " self.layer1 = nn.Linear(n_in, n_hid)\n", "\n", " # explicitly set weights and bias\n", " # NB: we pass the *transpose* of the weights to pytorch, meaning\n", " # we'll need to check against the *transpose* of our outputs for\n", " # any function of the weights\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(params[\"W\"].T))\n", " self.layer1.bias = nn.Parameter(torch.FloatTensor(params[\"b\"]))\n", "\n", " self.act_fn = act_fn\n", " self.model = 
nn.Sequential(self.layer1, self.act_fn)\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(X, torch.Tensor):\n", " self.X = torchify(X)\n", "\n", " self.z1 = self.layer1(self.X)\n", " self.z1.retain_grad()\n", "\n", " self.out1 = self.act_fn(self.z1)\n", " self.out1.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss1 = self.out1.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"b\": self.layer1.bias.detach().numpy(),\n", " \"W\": self.layer1.weight.detach().numpy(),\n", " \"y\": self.out1.detach().numpy(),\n", " \"dLdy\": self.out1.grad.numpy(),\n", " \"dLdZ\": self.z1.grad.numpy(),\n", " \"dLdB\": self.layer1.bias.grad.numpy(),\n", " \"dLdW\": self.layer1.weight.grad.numpy(),\n", " \"dLdX\": self.X.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchEmbeddingLayer(nn.Module):\n", " def __init__(self, vocab_size, n_out, params, **kwargs):\n", " super(TorchEmbeddingLayer, self).__init__()\n", " self.layer1 = nn.Embedding(vocab_size, n_out)\n", "\n", " # explicitly set embedding weights\n", " self.layer1.weight = nn.Parameter(torch.FloatTensor(params[\"W\"]))\n", " self.model = nn.Sequential(self.layer1)\n", "\n", " def forward(self, X):\n", " self.X = X\n", " if not isinstance(X, torch.Tensor):\n", " self.X = torch.from_numpy(X)\n", "\n", " self.out1 = self.layer1(self.X)\n", " self.out1.retain_grad()\n", "\n", " def extract_grads(self, X):\n", " self.forward(X)\n", " self.loss1 = self.out1.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"X\": self.X.detach().numpy(),\n", " \"W\": self.layer1.weight.detach().numpy(),\n", " \"y\": self.out1.detach().numpy(),\n", " \"dLdy\": self.out1.grad.numpy(),\n", " \"dLdW\": self.layer1.weight.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class TorchSDPAttentionLayer(nn.Module):\n", " def __init__(self):\n", " super(TorchSDPAttentionLayer, self).__init__()\n", "\n", " def forward(self, Q, K, V, mask=None):\n", " self.Q = Q\n", " self.K = K\n", " self.V = V\n", "\n", " if not isinstance(self.Q, torch.Tensor):\n", " self.Q = torchify(self.Q)\n", " if not isinstance(self.K, torch.Tensor):\n", " self.K = torchify(self.K)\n", " if not isinstance(self.V, torch.Tensor):\n", " self.V = torchify(self.V)\n", "\n", " self.Q.retain_grad()\n", " self.K.retain_grad()\n", " self.V.retain_grad()\n", "\n", " self.d_k = self.Q.size(-1)\n", " self.scores = torch.matmul(self.Q, self.K.transpose(-2, -1)) / np.sqrt(self.d_k)\n", " if mask is not None:\n", " self.scores = self.scores.masked_fill(mask == 0, -1e9)\n", " self.scores.retain_grad()\n", "\n", " self.weights = F.softmax(self.scores, dim=-1)\n", " self.weights.retain_grad()\n", " self.Y = torch.matmul(self.weights, self.V)\n", " self.Y.retain_grad()\n", " return self.Y, self.weights\n", "\n", " def extract_grads(self, Q, K, V, mask=None):\n", " self.forward(Q, K, V, mask=mask)\n", " self.loss1 = self.Y.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"Q\": self.Q.detach().numpy(),\n", " \"K\": self.K.detach().numpy(),\n", " \"V\": self.V.detach().numpy(),\n", " \"d_k\": self.d_k,\n", " \"scores\": self.scores.detach().numpy(),\n", " \"weights\": self.weights.detach().numpy(),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dLdV\": self.V.grad.numpy(),\n", " \"dWeights\": self.weights.grad.numpy(),\n", " \"dScores\": self.scores.grad.numpy(),\n", " \"dLdQ\": self.Q.grad.numpy(),\n", " \"dLdK\": self.K.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "class 
TorchMultiHeadedAttentionModule(nn.Module):\n", " def __init__(self, params, hparams):\n", " \"Take in model size and number of heads.\"\n", " super(TorchMultiHeadedAttentionModule, self).__init__()\n", " assert hparams[\"kqv_dim\"] % hparams[\"n_heads\"] == 0\n", " self.n_heads = hparams[\"n_heads\"]\n", " self.latent_dim = hparams[\"kqv_dim\"] // hparams[\"n_heads\"]\n", " self.p_dropout = hparams[\"dropout_p\"]\n", " self.projections = {\n", " \"Q\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " \"K\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " \"V\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " \"O\": nn.Linear(hparams[\"kqv_dim\"], hparams[\"kqv_dim\"]),\n", " }\n", " self.projections[\"Q\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"Q\"][\"W\"].T)\n", " )\n", " self.projections[\"Q\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"Q\"][\"b\"])\n", " )\n", " self.projections[\"K\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"K\"][\"W\"].T)\n", " )\n", " self.projections[\"K\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"K\"][\"b\"])\n", " )\n", " self.projections[\"V\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"V\"][\"W\"].T)\n", " )\n", " self.projections[\"V\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"V\"][\"b\"])\n", " )\n", " self.projections[\"O\"].weight = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"O\"][\"W\"].T)\n", " )\n", " self.projections[\"O\"].bias = nn.Parameter(\n", " torch.FloatTensor(params[\"components\"][\"O\"][\"b\"])\n", " )\n", "\n", " self.attn = None\n", " self.dropout = nn.Dropout(p=hparams[\"dropout_p\"])\n", "\n", " def forward(self, Q, K, V, mask=None):\n", " self.Q = Q\n", " self.K = K\n", " self.V = V\n", "\n", " if not isinstance(self.Q, torch.Tensor):\n", " self.Q = torchify(self.Q)\n", " if not isinstance(self.K, torch.Tensor):\n", " self.K = torchify(self.K)\n", " if not isinstance(self.V, torch.Tensor):\n", " self.V = torchify(self.V)\n", "\n", " self.Q.retain_grad()\n", " self.K.retain_grad()\n", " self.V.retain_grad()\n", "\n", " if mask is not None:\n", " # Same mask applied to all h heads.\n", " mask = mask.unsqueeze(1)\n", " n_ex = self.Q.size(0)\n", "\n", " self.Q_proj = (\n", " self.projections[\"Q\"](self.Q)\n", " .view(n_ex, -1, self.n_heads, self.latent_dim)\n", " .transpose(1, 2)\n", " )\n", "\n", " self.K_proj = (\n", " self.projections[\"K\"](self.K)\n", " .view(n_ex, -1, self.n_heads, self.latent_dim)\n", " .transpose(1, 2)\n", " )\n", "\n", " self.V_proj = (\n", " self.projections[\"V\"](self.V)\n", " .view(n_ex, -1, self.n_heads, self.latent_dim)\n", " .transpose(1, 2)\n", " )\n", "\n", " self.Q_proj.retain_grad()\n", " self.K_proj.retain_grad()\n", " self.V_proj.retain_grad()\n", "\n", " # 2) Apply attention on all the projected vectors in batch.\n", " self.attn_out, self.attn = TorchSDPAttentionLayer().forward(\n", " self.Q_proj, self.K_proj, self.V_proj, mask=mask\n", " )\n", " self.attn.retain_grad()\n", " self.attn_out.retain_grad()\n", "\n", " # 3) \"Concat\" using a view and apply a final linear transformation\n", " self.attn_out_reshaped = (\n", " self.attn_out.transpose(1, 2)\n", " .contiguous()\n", " .view(n_ex, -1, self.n_heads * self.latent_dim)\n", " )\n", " self.attn_out_reshaped.retain_grad()\n", " print(self.attn_out_reshaped.shape)\n", " self.Y = 
self.projections[\"O\"](self.attn_out_reshaped)\n", " print(self.Y.shape)\n", " self.Y.retain_grad()\n", "\n", " def extract_grads(self, Q, K, V, mask=None):\n", " self.forward(Q, K, V, mask=mask)\n", " self.loss1 = self.Y.sum()\n", " self.loss1.backward()\n", " grads = {\n", " \"Q\": self.Q.detach().numpy(),\n", " \"K\": self.K.detach().numpy(),\n", " \"V\": self.V.detach().numpy(),\n", " \"O_W\": self.projections[\"O\"].weight.detach().numpy().T,\n", " \"V_W\": self.projections[\"V\"].weight.detach().numpy().T,\n", " \"K_W\": self.projections[\"K\"].weight.detach().numpy().T,\n", " \"Q_W\": self.projections[\"Q\"].weight.detach().numpy().T,\n", " \"O_b\": self.projections[\"O\"].bias.detach().numpy(),\n", " \"V_b\": self.projections[\"V\"].bias.detach().numpy(),\n", " \"K_b\": self.projections[\"K\"].bias.detach().numpy(),\n", " \"Q_b\": self.projections[\"Q\"].bias.detach().numpy(),\n", " \"latent_dim\": self.latent_dim,\n", " \"n_heads\": self.n_heads,\n", " \"Q_proj\": self.Q_proj.detach().numpy(), # .reshape(self.Q_proj.shape[0], -1),\n", " \"K_proj\": self.K_proj.detach().numpy(), # .reshape(self.K_proj.shape[0], -1),\n", " \"V_proj\": self.V_proj.detach().numpy(), # .reshape(self.V_proj.shape[0], -1),\n", " \"weights\": self.attn.detach().numpy(),\n", " \"attn_out\": self.attn_out_reshaped.detach().numpy(), # .squeeze(),\n", " # .reshape(self.attn_out_reshaped.shape[0], -1),\n", " \"Y\": self.Y.detach().numpy(),\n", " \"dO_W\": self.projections[\"O\"].weight.grad.numpy().T,\n", " \"dV_W\": self.projections[\"V\"].weight.grad.numpy().T,\n", " \"dK_W\": self.projections[\"K\"].weight.grad.numpy().T,\n", " \"dQ_W\": self.projections[\"Q\"].weight.grad.numpy().T,\n", " \"dO_b\": self.projections[\"O\"].bias.grad.numpy(),\n", " \"dV_b\": self.projections[\"V\"].bias.grad.numpy(),\n", " \"dK_b\": self.projections[\"K\"].bias.grad.numpy(),\n", " \"dQ_b\": self.projections[\"Q\"].bias.grad.numpy(),\n", " \"dLdy\": self.Y.grad.numpy(),\n", " \"dAttn_out\": self.attn_out_reshaped.grad.numpy(),\n", " \"dWeights\": self.attn.grad.numpy(),\n", " \"dQ_proj\": self.Q_proj.grad.numpy(),\n", " \"dK_proj\": self.K_proj.grad.numpy(),\n", " \"dV_proj\": self.V_proj.grad.numpy(),\n", " \"dQ\": self.Q.grad.numpy(),\n", " \"dK\": self.K.grad.numpy(),\n", " \"dV\": self.V.grad.numpy(),\n", " }\n", " return grads\n", "\n", "\n", "#######################################################################\n", "# TF WGAN GP Gold Standard Implementation #\n", "# adapted from: https://github.com/igul222/improved_wgan_training/ #\n", "#######################################################################\n", "\n", "_params = {}\n", "_param_aliases = {}\n", "\n", "\n", "def param(name, *args, **kwargs):\n", " \"\"\"\n", " A wrapper for `tf.Variable` which enables parameter sharing in models.\n", "\n", " Creates and returns theano shared variables similarly to `tf.Variable`,\n", " except if you try to create a param with the same name as a\n", " previously-created one, `param(...)` will just return the old one instead of\n", " making a new one.\n", "\n", " This constructor also adds a `param` attribute to the shared variables it\n", " creates, so that you can easily search a graph for all params.\n", " \"\"\"\n", "\n", " if name not in _params:\n", " kwargs[\"name\"] = name\n", " param = tf.Variable(*args, **kwargs)\n", " param.param = True\n", " _params[name] = param\n", " result = _params[name]\n", " i = 0\n", " while result in _param_aliases:\n", " i += 1\n", " result = _param_aliases[result]\n", " return 
result\n", "\n", "\n", "def params_with_name(name):\n", " return [p for n, p in _params.items() if name in n]\n", "\n", "\n", "def ReLULayer(name, n_in, n_out, inputs, w_initialization):\n", " if isinstance(w_initialization, np.ndarray):\n", " weight_values = w_initialization.astype(\"float32\")\n", "\n", " W = param(name + \".W\", weight_values)\n", " result = tf.matmul(inputs, W)\n", " output = tf.nn.bias_add(\n", " result, param(name + \".b\", np.zeros((n_out,), dtype=\"float32\"))\n", " )\n", " output = tf.nn.relu(output)\n", " return output, W\n", "\n", "\n", "def LinearLayer(name, n_in, n_out, inputs, w_initialization):\n", " if isinstance(w_initialization, np.ndarray):\n", " weight_values = w_initialization.astype(\"float32\")\n", "\n", " W = param(name + \".W\", weight_values)\n", " result = tf.matmul(inputs, W)\n", " output = tf.nn.bias_add(\n", " result, param(name + \".b\", np.zeros((n_out,), dtype=\"float32\"))\n", " )\n", " return output, W\n", "\n", "\n", "def Generator(n_samples, X_real, params=None):\n", " n_feats = 2\n", " W1 = W2 = W3 = W4 = \"he\"\n", " noise = tf.random.normal([n_samples, 2])\n", " if params is not None:\n", " noise = tf.convert_to_tensor(params[\"noise\"], dtype=\"float32\")\n", " W1 = params[\"generator\"][\"FC1\"][\"W\"]\n", " W2 = params[\"generator\"][\"FC2\"][\"W\"]\n", " W3 = params[\"generator\"][\"FC3\"][\"W\"]\n", " W4 = params[\"generator\"][\"FC4\"][\"W\"]\n", " DIM = params[\"g_hidden\"]\n", " n_feats = params[\"n_in\"]\n", "\n", " outs = {}\n", " weights = {}\n", " output, W = ReLULayer(\"Generator.1\", n_feats, DIM, noise, w_initialization=W1)\n", " outs[\"FC1\"] = output\n", " weights[\"FC1\"] = W\n", " output, W = ReLULayer(\"Generator.2\", DIM, DIM, output, w_initialization=W2)\n", " outs[\"FC2\"] = output\n", " weights[\"FC2\"] = W\n", " output, W = ReLULayer(\"Generator.3\", DIM, DIM, output, w_initialization=W3)\n", " outs[\"FC3\"] = output\n", " weights[\"FC3\"] = W\n", " output, W = LinearLayer(\"Generator.4\", DIM, n_feats, output, w_initialization=W4)\n", " outs[\"FC4\"] = output\n", " weights[\"FC4\"] = W\n", " return output, outs, weights\n", "\n", "\n", "def Discriminator(inputs, params=None):\n", " n_feats = 2\n", " W1 = W2 = W3 = W4 = \"he\"\n", " if params is not None:\n", " W1 = params[\"critic\"][\"FC1\"][\"W\"]\n", " W2 = params[\"critic\"][\"FC2\"][\"W\"]\n", " W3 = params[\"critic\"][\"FC3\"][\"W\"]\n", " W4 = params[\"critic\"][\"FC4\"][\"W\"]\n", " DIM = params[\"g_hidden\"]\n", " n_feats = params[\"n_in\"]\n", "\n", " outs = {}\n", " weights = {}\n", " output, W = ReLULayer(\"Discriminator.1\", n_feats, DIM, inputs, w_initialization=W1)\n", " outs[\"FC1\"] = output\n", " weights[\"FC1\"] = W\n", "\n", " output, W = ReLULayer(\"Discriminator.2\", DIM, DIM, output, w_initialization=W2)\n", " outs[\"FC2\"] = output\n", " weights[\"FC2\"] = W\n", "\n", " output, W = ReLULayer(\"Discriminator.3\", DIM, DIM, output, w_initialization=W3)\n", " outs[\"FC3\"] = output\n", " weights[\"FC3\"] = W\n", "\n", " output, W = LinearLayer(\"Discriminator.4\", DIM, 1, output, w_initialization=W4)\n", " outs[\"FC4\"] = output\n", " weights[\"FC4\"] = W\n", "\n", " # get bias\n", " for var in params_with_name(\"Discriminator\"):\n", " if \"1.b:\" in var.name:\n", " weights[\"FC1_b\"] = var\n", " elif \"2.b:\" in var.name:\n", " weights[\"FC2_b\"] = var\n", " elif \"3.b:\" in var.name:\n", " weights[\"FC3_b\"] = var\n", " elif \"4.b:\" in var.name:\n", " weights[\"FC4_b\"] = var\n", "\n", " return tf.reshape(output, [-1]), outs, 
weights\n", "\n", "\n", "def WGAN_GP_tf(X, lambda_, params, batch_size):\n", " tf.compat.v1.disable_eager_execution()\n", "\n", " batch_size = X.shape[0]\n", "\n", " # get alpha value\n", " n_steps = params[\"n_steps\"]\n", " c_updates_per_epoch = params[\"c_updates_per_epoch\"]\n", " alpha = tf.convert_to_tensor(params[\"alpha\"], dtype=\"float32\")\n", "\n", " X_real = tf.compat.v1.placeholder(tf.float32, shape=[None, params[\"n_in\"]])\n", " X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params)\n", "\n", " Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params)\n", " Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params)\n", "\n", " # WGAN loss\n", " mean_fake = tf.reduce_mean(Y_fake)\n", " mean_real = tf.reduce_mean(Y_real)\n", "\n", " C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real)\n", " G_loss = -tf.reduce_mean(Y_fake)\n", "\n", " # WGAN gradient penalty\n", " X_interp = alpha * X_real + ((1 - alpha) * X_fake)\n", " Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params)\n", " gradInterp = tf.gradients(Y_interp, [X_interp])[0]\n", "\n", " norm_gradInterp = tf.sqrt(\n", " tf.compat.v1.reduce_sum(tf.square(gradInterp), reduction_indices=[1])\n", " )\n", " gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2)\n", " C_loss += lambda_ * gradient_penalty\n", "\n", " # extract gradient of Y_interp wrt. each layer output in critic\n", " C_bwd_Y_interp = {}\n", " for k, v in C_out_Y_interp.items():\n", " C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0]\n", "\n", " C_bwd_W = {}\n", " for k, v in C_Y_interp_weights.items():\n", " C_bwd_W[k] = tf.gradients(C_loss, [v])[0]\n", "\n", " # get gradients\n", " dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0]\n", " dC_Y_real = tf.gradients(C_loss, [Y_real])[0]\n", " dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0]\n", " dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0]\n", "\n", " with tf.compat.v1.Session() as session:\n", " session.run(tf.compat.v1.global_variables_initializer())\n", "\n", " for iteration in range(n_steps):\n", " # Train critic\n", " for i in range(c_updates_per_epoch):\n", " _data = X\n", " (\n", " _alpha,\n", " _X_interp,\n", " _Y_interp,\n", " _gradInterp,\n", " _norm_gradInterp,\n", " _gradient_penalty,\n", " _C_loss,\n", " _X_fake,\n", " _Y_fake,\n", " _Y_real,\n", " _dC_Y_fake,\n", " _dC_Y_real,\n", " _dC_gradInterp,\n", " _dG_Y_fake,\n", " _mean_fake,\n", " _mean_real,\n", " _G_weights_FC1,\n", " _G_weights_FC2,\n", " _G_weights_FC3,\n", " _G_weights_FC4,\n", " _G_fwd_X_fake_FC1,\n", " _G_fwd_X_fake_FC2,\n", " _G_fwd_X_fake_FC3,\n", " _G_fwd_X_fake_FC4,\n", " _C_weights_Y_fake_FC1,\n", " _C_weights_Y_fake_FC2,\n", " _C_weights_Y_fake_FC3,\n", " _C_weights_Y_fake_FC4,\n", " _C_fwd_Y_fake_FC1,\n", " _C_fwd_Y_fake_FC2,\n", " _C_fwd_Y_fake_FC3,\n", " _C_fwd_Y_fake_FC4,\n", " _C_weights_Y_real_FC1,\n", " _C_weights_Y_real_FC2,\n", " _C_weights_Y_real_FC3,\n", " _C_weights_Y_real_FC4,\n", " _C_fwd_Y_real_FC1,\n", " _C_fwd_Y_real_FC2,\n", " _C_fwd_Y_real_FC3,\n", " _C_fwd_Y_real_FC4,\n", " _C_weights_Y_interp_FC1,\n", " _C_weights_Y_interp_FC2,\n", " _C_weights_Y_interp_FC3,\n", " _C_weights_Y_interp_FC4,\n", " _C_dY_interp_wrt_FC1,\n", " _C_dY_interp_wrt_FC2,\n", " _C_dY_interp_wrt_FC3,\n", " _C_dY_interp_wrt_FC4,\n", " _C_fwd_Y_interp_FC1,\n", " _C_fwd_Y_interp_FC2,\n", " _C_fwd_Y_interp_FC3,\n", " _C_fwd_Y_interp_FC4,\n", " _C_dW_FC1,\n", " _C_db_FC1,\n", " _C_dW_FC2,\n", " _C_db_FC2,\n", " _C_dW_FC3,\n", " _C_db_FC3,\n", " _C_dW_FC4,\n", " 
_C_db_FC4,\n", " ) = session.run(\n", " [\n", " alpha,\n", " X_interp,\n", " Y_interp,\n", " gradInterp,\n", " norm_gradInterp,\n", " gradient_penalty,\n", " C_loss,\n", " X_fake,\n", " Y_fake,\n", " Y_real,\n", " dC_Y_fake,\n", " dC_Y_real,\n", " dC_gradInterp,\n", " dG_Y_fake,\n", " mean_fake,\n", " mean_real,\n", " G_weights[\"FC1\"],\n", " G_weights[\"FC2\"],\n", " G_weights[\"FC3\"],\n", " G_weights[\"FC4\"],\n", " G_out_X_fake[\"FC1\"],\n", " G_out_X_fake[\"FC2\"],\n", " G_out_X_fake[\"FC3\"],\n", " G_out_X_fake[\"FC4\"],\n", " C_Y_fake_weights[\"FC1\"],\n", " C_Y_fake_weights[\"FC2\"],\n", " C_Y_fake_weights[\"FC3\"],\n", " C_Y_fake_weights[\"FC4\"],\n", " C_out_Y_fake[\"FC1\"],\n", " C_out_Y_fake[\"FC2\"],\n", " C_out_Y_fake[\"FC3\"],\n", " C_out_Y_fake[\"FC4\"],\n", " C_Y_real_weights[\"FC1\"],\n", " C_Y_real_weights[\"FC2\"],\n", " C_Y_real_weights[\"FC3\"],\n", " C_Y_real_weights[\"FC4\"],\n", " C_out_Y_real[\"FC1\"],\n", " C_out_Y_real[\"FC2\"],\n", " C_out_Y_real[\"FC3\"],\n", " C_out_Y_real[\"FC4\"],\n", " C_Y_interp_weights[\"FC1\"],\n", " C_Y_interp_weights[\"FC2\"],\n", " C_Y_interp_weights[\"FC3\"],\n", " C_Y_interp_weights[\"FC4\"],\n", " C_bwd_Y_interp[\"FC1\"],\n", " C_bwd_Y_interp[\"FC2\"],\n", " C_bwd_Y_interp[\"FC3\"],\n", " C_bwd_Y_interp[\"FC4\"],\n", " C_out_Y_interp[\"FC1\"],\n", " C_out_Y_interp[\"FC2\"],\n", " C_out_Y_interp[\"FC3\"],\n", " C_out_Y_interp[\"FC4\"],\n", " C_bwd_W[\"FC1\"],\n", " C_bwd_W[\"FC1_b\"],\n", " C_bwd_W[\"FC2\"],\n", " C_bwd_W[\"FC2_b\"],\n", " C_bwd_W[\"FC3\"],\n", " C_bwd_W[\"FC3_b\"],\n", " C_bwd_W[\"FC4\"],\n", " C_bwd_W[\"FC4_b\"],\n", " ],\n", " feed_dict={X_real: _data},\n", " )\n", "\n", " _G_loss = session.run(G_loss, feed_dict={X_real: _data})\n", "\n", " grads = {\n", " \"X_real\": _data,\n", " \"X_interp\": _X_interp,\n", " \"G_weights_FC1\": _G_weights_FC1,\n", " \"G_weights_FC2\": _G_weights_FC2,\n", " \"G_weights_FC3\": _G_weights_FC3,\n", " \"G_weights_FC4\": _G_weights_FC4,\n", " \"G_fwd_X_fake_FC1\": _G_fwd_X_fake_FC1,\n", " \"G_fwd_X_fake_FC2\": _G_fwd_X_fake_FC2,\n", " \"G_fwd_X_fake_FC3\": _G_fwd_X_fake_FC3,\n", " \"G_fwd_X_fake_FC4\": _G_fwd_X_fake_FC4,\n", " \"X_fake\": _X_fake,\n", " \"C_weights_Y_fake_FC1\": _C_weights_Y_fake_FC1,\n", " \"C_weights_Y_fake_FC2\": _C_weights_Y_fake_FC2,\n", " \"C_weights_Y_fake_FC3\": _C_weights_Y_fake_FC3,\n", " \"C_weights_Y_fake_FC4\": _C_weights_Y_fake_FC4,\n", " \"C_fwd_Y_fake_FC1\": _C_fwd_Y_fake_FC1,\n", " \"C_fwd_Y_fake_FC2\": _C_fwd_Y_fake_FC2,\n", " \"C_fwd_Y_fake_FC3\": _C_fwd_Y_fake_FC3,\n", " \"C_fwd_Y_fake_FC4\": _C_fwd_Y_fake_FC4,\n", " \"Y_fake\": _Y_fake,\n", " \"C_weights_Y_real_FC1\": _C_weights_Y_real_FC1,\n", " \"C_weights_Y_real_FC2\": _C_weights_Y_real_FC2,\n", " \"C_weights_Y_real_FC3\": _C_weights_Y_real_FC3,\n", " \"C_weights_Y_real_FC4\": _C_weights_Y_real_FC4,\n", " \"C_fwd_Y_real_FC1\": _C_fwd_Y_real_FC1,\n", " \"C_fwd_Y_real_FC2\": _C_fwd_Y_real_FC2,\n", " \"C_fwd_Y_real_FC3\": _C_fwd_Y_real_FC3,\n", " \"C_fwd_Y_real_FC4\": _C_fwd_Y_real_FC4,\n", " \"Y_real\": _Y_real,\n", " \"C_weights_Y_interp_FC1\": _C_weights_Y_interp_FC1,\n", " \"C_weights_Y_interp_FC2\": _C_weights_Y_interp_FC2,\n", " \"C_weights_Y_interp_FC3\": _C_weights_Y_interp_FC3,\n", " \"C_weights_Y_interp_FC4\": _C_weights_Y_interp_FC4,\n", " \"C_fwd_Y_interp_FC1\": _C_fwd_Y_interp_FC1,\n", " \"C_fwd_Y_interp_FC2\": _C_fwd_Y_interp_FC2,\n", " \"C_fwd_Y_interp_FC3\": _C_fwd_Y_interp_FC3,\n", " \"C_fwd_Y_interp_FC4\": _C_fwd_Y_interp_FC4,\n", " \"Y_interp\": _Y_interp,\n", " 
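# gradients of the critic output at X_interp w.r.t. each critic layer's\n", "                # forward activations\n", "                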
\"dY_interp_wrt_FC1\": _C_dY_interp_wrt_FC1,\n", " \"dY_interp_wrt_FC2\": _C_dY_interp_wrt_FC2,\n", " \"dY_interp_wrt_FC3\": _C_dY_interp_wrt_FC3,\n", " \"dY_interp_wrt_FC4\": _C_dY_interp_wrt_FC4,\n", " \"gradInterp\": _gradInterp,\n", " \"gradInterp_norm\": _norm_gradInterp,\n", " \"G_loss\": _G_loss,\n", " \"C_loss\": _C_loss,\n", " \"dC_loss_dW_FC1\": _C_dW_FC1,\n", " \"dC_loss_db_FC1\": _C_db_FC1,\n", " \"dC_loss_dW_FC2\": _C_dW_FC2,\n", " \"dC_loss_db_FC2\": _C_db_FC2,\n", " \"dC_loss_dW_FC3\": _C_dW_FC3,\n", " \"dC_loss_db_FC3\": _C_db_FC3,\n", " \"dC_loss_dW_FC4\": _C_dW_FC4,\n", " \"dC_loss_db_FC4\": _C_db_FC4,\n", " \"dC_Y_fake\": _dC_Y_fake,\n", " \"dC_Y_real\": _dC_Y_real,\n", " \"dC_gradInterp\": _dC_gradInterp,\n", " \"dG_Y_fake\": _dG_Y_fake,\n", " }\n", " return grads\n", "\n", "\n", "def TFNCELoss(X, target_word, L):\n", " from tensorflow.python.ops.nn_impl import _compute_sampled_logits\n", " from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits\n", "\n", " tf.compat.v1.disable_eager_execution()\n", "\n", " in_embed = tf.compat.v1.placeholder(tf.float32, shape=X.shape)\n", " in_bias = tf.compat.v1.placeholder(\n", " tf.float32, shape=L.parameters[\"b\"].flatten().shape\n", " )\n", " in_weights = tf.compat.v1.placeholder(tf.float32, shape=L.parameters[\"W\"].shape)\n", " in_target_word = tf.compat.v1.placeholder(tf.int64)\n", " in_neg_samples = tf.compat.v1.placeholder(tf.int32)\n", " in_target_prob = tf.compat.v1.placeholder(tf.float32)\n", " in_neg_samp_prob = tf.compat.v1.placeholder(tf.float32)\n", "\n", " # in_embed = tf.keras.Input(dtype=tf.float32, shape=X.shape)\n", " # in_bias = tf.keras.Input(dtype=tf.float32, shape=L.parameters[\"b\"].flatten().shape)\n", " # in_weights = tf.keras.Input(dtype=tf.float32, shape=L.parameters[\"W\"].shape)\n", " # in_target_word = tf.keras.Input(dtype=tf.int64, shape=())\n", " # in_neg_samples = tf.keras.Input(dtype=tf.int32, shape=())\n", " # in_target_prob = tf.keras.Input(dtype=tf.float32, shape=())\n", " # in_neg_samp_prob = tf.keras.Input(dtype=tf.float32, shape=())\n", "\n", " feed = {\n", " in_embed: X,\n", " in_weights: L.parameters[\"W\"],\n", " in_target_word: target_word,\n", " in_bias: L.parameters[\"b\"].flatten(),\n", " in_neg_samples: L.derived_variables[\"noise_samples\"][0],\n", " in_target_prob: L.derived_variables[\"noise_samples\"][1],\n", " in_neg_samp_prob: L.derived_variables[\"noise_samples\"][2],\n", " }\n", "\n", " # Compute the NCE loss, using a sample of the negative labels each time.\n", " nce_unreduced = tf.nn.nce_loss(\n", " weights=in_weights,\n", " biases=in_bias,\n", " labels=in_target_word,\n", " inputs=in_embed,\n", " sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),\n", " num_sampled=L.num_negative_samples,\n", " num_classes=L.n_classes,\n", " )\n", "\n", " loss = tf.reduce_sum(nce_unreduced)\n", " dLdW = tf.gradients(loss, [in_weights])[0]\n", " dLdb = tf.gradients(loss, [in_bias])[0]\n", " dLdX = tf.gradients(loss, [in_embed])[0]\n", "\n", " sampled_logits, sampled_labels = _compute_sampled_logits(\n", " weights=in_weights,\n", " biases=in_bias,\n", " labels=in_target_word,\n", " inputs=in_embed,\n", " sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob),\n", " num_sampled=L.num_negative_samples,\n", " num_classes=L.n_classes,\n", " num_true=1,\n", " subtract_log_q=True,\n", " )\n", "\n", " sampled_losses = sigmoid_cross_entropy_with_logits(\n", " labels=sampled_labels, logits=sampled_logits\n", " )\n", "\n", " with tf.compat.v1.Session() as 
session:\n", " session.run(tf.compat.v1.global_variables_initializer())\n", " (\n", " _final_loss,\n", " _nce_unreduced,\n", " _dLdW,\n", " _dLdb,\n", " _dLdX,\n", " _sampled_logits,\n", " _sampled_labels,\n", " _sampled_losses,\n", " ) = session.run(\n", " [\n", " loss,\n", " nce_unreduced,\n", " dLdW,\n", " dLdb,\n", " dLdX,\n", " sampled_logits,\n", " sampled_labels,\n", " sampled_losses,\n", " ],\n", " feed_dict=feed,\n", " )\n", " tf.compat.v1.reset_default_graph()\n", " return {\n", " \"final_loss\": _final_loss,\n", " \"nce_unreduced\": _nce_unreduced,\n", " \"dLdW\": _dLdW,\n", " \"dLdb\": _dLdb,\n", " \"dLdX\": _dLdX,\n", " \"out_logits\": _sampled_logits,\n", " \"out_labels\": _sampled_labels,\n", " \"sampled_loss\": _sampled_losses,\n", " }\n"]} {"path": "numpy_ml/o_tests/test_nn_activations.py", "content": ["# flake8: noqa\n", "import time\n", "import numpy as np\n", "\n", "from numpy.testing import assert_almost_equal\n", "from scipy.special import expit\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "from numpy_ml.utils.testing import random_stochastic_matrix, random_tensor\n", "\n", "\n", "def torch_gradient_generator(fn, **kwargs):\n", " def get_grad(z):\n", " z1 = torch.autograd.Variable(torch.from_numpy(z), requires_grad=True)\n", " z2 = fn(z1, **kwargs).sum()\n", " z2.backward()\n", " grad = z1.grad.numpy()\n", " return grad\n", "\n", " return get_grad\n", "\n", "\n", "#######################################################################\n", "# Debug Formatter #\n", "#######################################################################\n", "\n", "\n", "def err_fmt(params, golds, ix, warn_str=\"\"):\n", " mine, label = params[ix]\n", " err_msg = \"-\" * 25 + \" DEBUG \" + \"-\" * 25 + \"\\n\"\n", " prev_mine, prev_label = params[max(ix - 1, 0)]\n", " err_msg += \"Mine (prev) [{}]:\\n{}\\n\\nTheirs (prev) [{}]:\\n{}\".format(\n", " prev_label, prev_mine, prev_label, golds[prev_label]\n", " )\n", " err_msg += \"\\n\\nMine [{}]:\\n{}\\n\\nTheirs [{}]:\\n{}\".format(\n", " label, mine, label, golds[label]\n", " )\n", " err_msg += warn_str\n", " err_msg += \"\\n\" + \"-\" * 23 + \" END DEBUG \" + \"-\" * 23\n", " return err_msg\n", "\n", "\n", "#######################################################################\n", "# Test Suite #\n", "#######################################################################\n", "#\n", "#\n", "# def test_activations(N=50):\n", "# print(\"Testing Sigmoid activation\")\n", "# time.sleep(1)\n", "# test_sigmoid_activation(N)\n", "# test_sigmoid_grad(N)\n", "#\n", "# # print(\"Testing Softmax activation\")\n", "# # time.sleep(1)\n", "# # test_softmax_activation(N)\n", "# # test_softmax_grad(N)\n", "#\n", "# print(\"Testing Tanh activation\")\n", "# time.sleep(1)\n", "# test_tanh_grad(N)\n", "#\n", "# print(\"Testing ReLU activation\")\n", "# time.sleep(1)\n", "# test_relu_activation(N)\n", "# test_relu_grad(N)\n", "#\n", "# print(\"Testing ELU activation\")\n", "# time.sleep(1)\n", "# test_elu_activation(N)\n", "# test_elu_grad(N)\n", "#\n", "# print(\"Testing SELU activation\")\n", "# time.sleep(1)\n", "# test_selu_activation(N)\n", "# test_selu_grad(N)\n", "#\n", "# print(\"Testing LeakyRelu activation\")\n", "# time.sleep(1)\n", "# test_leakyrelu_activation(N)\n", "# test_leakyrelu_grad(N)\n", "#\n", "# print(\"Testing SoftPlus activation\")\n", "# time.sleep(1)\n", "# test_softplus_activation(N)\n", "# test_softplus_grad(N)\n", "#\n", "\n", 
"#######################################################################\n", "# Activations #\n", "#######################################################################\n", "\n", "\n", "def test_sigmoid_activation(N=50):\n", " from numpy_ml.neural_nets.activations import Sigmoid\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Sigmoid()\n", " gold = expit\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((1, n_dims))\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_softplus_activation(N=50):\n", " from numpy_ml.neural_nets.activations import SoftPlus\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SoftPlus()\n", " gold = lambda z: F.softplus(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_elu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import ELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 10)\n", " z = random_tensor((1, n_dims))\n", "\n", " alpha = np.random.uniform(0, 10)\n", "\n", " mine = ELU(alpha)\n", " gold = lambda z, a: F.elu(torch.from_numpy(z), alpha).numpy()\n", "\n", " assert_almost_equal(mine.fn(z), gold(z, alpha))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_relu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import ReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = ReLU()\n", " gold = lambda z: F.relu(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_selu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import SELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SELU()\n", " gold = lambda z: F.selu(torch.FloatTensor(z)).numpy()\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " assert_almost_equal(mine.fn(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_leakyrelu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import LeakyReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " alpha = np.random.uniform(0, 10)\n", "\n", " mine = LeakyReLU(alpha=alpha)\n", " gold = lambda z: F.leaky_relu(torch.FloatTensor(z), alpha).numpy()\n", " assert_almost_equal(mine.fn(z), gold(z))\n", "\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_gelu_activation(N=50):\n", " from numpy_ml.neural_nets.activations import GELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_dims = np.random.randint(1, 100)\n", " z = random_stochastic_matrix(1, n_dims)\n", " approx = np.random.choice([True, False])\n", "\n", " mine = GELU(approximate=False)\n", " mine_approx = GELU(approximate=True)\n", " gold = lambda z: F.gelu(torch.FloatTensor(z)).numpy()\n", " np.testing.assert_allclose(mine.fn(z), gold(z), rtol=1e-3)\n", " assert_almost_equal(mine.fn(z), mine_approx.fn(z))\n", "\n", " 
print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "#######################################################################\n", "# Activation Gradients #\n", "#######################################################################\n", "\n", "\n", "def test_sigmoid_grad(N=50):\n", " from numpy_ml.neural_nets.activations import Sigmoid\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Sigmoid()\n", " gold = torch_gradient_generator(torch.sigmoid)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_elu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import ELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " n_dims = np.random.randint(1, 10)\n", " alpha = np.random.uniform(0, 10)\n", " z = random_tensor((n_ex, n_dims))\n", "\n", " mine = ELU(alpha)\n", " gold = torch_gradient_generator(F.elu, alpha=alpha)\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=6)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_tanh_grad(N=50):\n", " from numpy_ml.neural_nets.activations import Tanh\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = Tanh()\n", " gold = torch_gradient_generator(torch.tanh)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_relu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import ReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = ReLU()\n", " gold = torch_gradient_generator(F.relu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_gelu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import GELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = GELU(approximate=False)\n", " mine_approx = GELU(approximate=True)\n", " gold = torch_gradient_generator(F.gelu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=3)\n", " assert_almost_equal(mine.grad(z), mine_approx.grad(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_selu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import SELU\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SELU()\n", " gold = torch_gradient_generator(F.selu)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims))\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=6)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_leakyrelu_grad(N=50):\n", " from numpy_ml.neural_nets.activations import LeakyReLU\n", "\n", " N = np.inf if N is None else N\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 10)\n", " n_dims = np.random.randint(1, 10)\n", " alpha = np.random.uniform(0, 10)\n", " z = random_tensor((n_ex, n_dims))\n", "\n", " 
mine = LeakyReLU(alpha)\n", " gold = torch_gradient_generator(F.leaky_relu, negative_slope=alpha)\n", " assert_almost_equal(mine.grad(z), gold(z), decimal=6)\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "def test_softplus_grad(N=50):\n", " from numpy_ml.neural_nets.activations import SoftPlus\n", "\n", " N = np.inf if N is None else N\n", "\n", " mine = SoftPlus()\n", " gold = torch_gradient_generator(F.softplus)\n", "\n", " i = 0\n", " while i < N:\n", " n_ex = np.random.randint(1, 100)\n", " n_dims = np.random.randint(1, 100)\n", " z = random_tensor((n_ex, n_dims), standardize=True)\n", " assert_almost_equal(mine.grad(z), gold(z))\n", " print(\"PASSED\")\n", " i += 1\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test_activations(N=50)\n"]} {"path": "numpy_ml/utils/misc.py", "content": ["\"\"\"Miscellaneous utility functions\"\"\"\n", "import numpy as np\n", "\n", "\n", "def logsumexp(log_probs, axis=None):\n", " \"\"\"\n", " Redefine scipy.special.logsumexp\n", " see: http://bayesjumping.net/log-sum-exp-trick/\n", " \"\"\"\n", " _max = np.max(log_probs)\n", " ds = log_probs - _max\n", " exp_sum = np.exp(ds).sum(axis=axis)\n", " return _max + np.log(exp_sum)\n", "\n", "\n", "def log_gaussian_pdf(x_i, mu, sigma):\n", " \"\"\"Compute log N(x_i | mu, sigma)\"\"\"\n", " n = len(mu)\n", " a = n * np.log(2 * np.pi)\n", " _, b = np.linalg.slogdet(sigma)\n", "\n", " y = np.linalg.solve(sigma, x_i - mu)\n", " c = np.dot(x_i - mu, y)\n", " return -0.5 * (a + b + c)\n"]} {"path": "numpy_ml/utils/__init__.py", "content": ["\"\"\"Utilities module\"\"\"\n", "\n", "from . import testing\n", "from . import data_structures\n", "from . import distance_metrics\n", "from . import kernels\n", "from . import windows\n", "from . import graphs\n", "from . import misc\n"]} {"path": "numpy_ml/utils/distance_metrics.py", "content": ["import numpy as np\n", "\n", "\n", "def euclidean(x, y):\n", " \"\"\"\n", " Compute the Euclidean (`L2`) distance between two real vectors\n", "\n", " Notes\n", " -----\n", " The Euclidean distance between two vectors **x** and **y** is\n", "\n", " .. math::\n", "\n", " d(\\mathbf{x}, \\mathbf{y}) = \\sqrt{ \\sum_i (x_i - y_i)^2 }\n", "\n", " Parameters\n", " ----------\n", " x,y : :py:class:`ndarray ` s of shape `(N,)`\n", " The two vectors to compute the distance between\n", "\n", " Returns\n", " -------\n", " d : float\n", " The L2 distance between **x** and **y**.\n", " \"\"\"\n", " return np.sqrt(np.sum((x - y) ** 2))\n", "\n", "\n", "def manhattan(x, y):\n", " \"\"\"\n", " Compute the Manhattan (`L1`) distance between two real vectors\n", "\n", " Notes\n", " -----\n", " The Manhattan distance between two vectors **x** and **y** is\n", "\n", " .. math::\n", "\n", " d(\\mathbf{x}, \\mathbf{y}) = \\sum_i |x_i - y_i|\n", "\n", " Parameters\n", " ----------\n", " x,y : :py:class:`ndarray ` s of shape `(N,)`\n", " The two vectors to compute the distance between\n", "\n", " Returns\n", " -------\n", " d : float\n", " The L1 distance between **x** and **y**.\n", " \"\"\"\n", " return np.sum(np.abs(x - y))\n", "\n", "\n", "def chebyshev(x, y):\n", " \"\"\"\n", " Compute the Chebyshev (:math:`L_\\infty`) distance between two real vectors\n", "\n", " Notes\n", " -----\n", " The Chebyshev distance between two vectors **x** and **y** is\n", "\n", " .. 
math::\n", "\n", " d(\\mathbf{x}, \\mathbf{y}) = \\max_i |x_i - y_i|\n", "\n", " Parameters\n", " ----------\n", " x,y : :py:class:`ndarray ` s of shape `(N,)`\n", " The two vectors to compute the distance between\n", "\n", " Returns\n", " -------\n", " d : float\n", " The Chebyshev distance between **x** and **y**.\n", " \"\"\"\n", " return np.max(np.abs(x - y))\n", "\n", "\n", "def minkowski(x, y, p):\n", " \"\"\"\n", " Compute the Minkowski-`p` distance between two real vectors.\n", "\n", " Notes\n", " -----\n", " The Minkowski-`p` distance between two vectors **x** and **y** is\n", "\n", " .. math::\n", "\n", " d(\\mathbf{x}, \\mathbf{y}) = \\left( \\sum_i |x_i - y_i|^p \\\\right)^{1/p}\n", "\n", " Parameters\n", " ----------\n", " x,y : :py:class:`ndarray ` s of shape `(N,)`\n", " The two vectors to compute the distance between\n", " p : float > 1\n", " The parameter of the distance function. When `p = 1`, this is the `L1`\n", " distance, and when `p=2`, this is the `L2` distance. For `p < 1`,\n", " Minkowski-`p` does not satisfy the triangle inequality and hence is not\n", " a valid distance metric.\n", "\n", " Returns\n", " -------\n", " d : float\n", " The Minkowski-`p` distance between **x** and **y**.\n", " \"\"\"\n", " return np.sum(np.abs(x - y) ** p) ** (1 / p)\n", "\n", "\n", "def hamming(x, y):\n", " \"\"\"\n", " Compute the Hamming distance between two integer-valued vectors.\n", "\n", " Notes\n", " -----\n", " The Hamming distance between two vectors **x** and **y** is\n", "\n", " .. math::\n", "\n", " d(\\mathbf{x}, \\mathbf{y}) = \\\\frac{1}{N} \\sum_i \\mathbb{1}_{x_i \\\\neq y_i}\n", "\n", " Parameters\n", " ----------\n", " x,y : :py:class:`ndarray ` s of shape `(N,)`\n", " The two vectors to compute the distance between. Both vectors should be\n", " integer-valued.\n", "\n", " Returns\n", " -------\n", " d : float\n", " The Hamming distance between **x** and **y**.\n", " \"\"\"\n", " return np.sum(x != y) / len(x)\n"]} {"path": "numpy_ml/utils/kernels.py", "content": ["import re\n", "from abc import ABC, abstractmethod\n", "\n", "import numpy as np\n", "\n", "\n", "class KernelBase(ABC):\n", " def __init__(self):\n", " super().__init__()\n", " self.parameters = {}\n", " self.hyperparameters = {}\n", "\n", " @abstractmethod\n", " def _kernel(self, X, Y):\n", " raise NotImplementedError\n", "\n", " def __call__(self, X, Y=None):\n", " \"\"\"Refer to documentation for the `_kernel` method\"\"\"\n", " return self._kernel(X, Y)\n", "\n", " def __str__(self):\n", " P, H = self.parameters, self.hyperparameters\n", " p_str = \", \".join([\"{}={}\".format(k, v) for k, v in P.items()])\n", " return \"{}({})\".format(H[\"id\"], p_str)\n", "\n", " def summary(self):\n", " \"\"\"Return the dictionary of model parameters, hyperparameters, and ID\"\"\"\n", " return {\n", " \"id\": self.hyperparameters[\"id\"],\n", " \"parameters\": self.parameters,\n", " \"hyperparameters\": self.hyperparameters,\n", " }\n", "\n", " def set_params(self, summary_dict):\n", " \"\"\"\n", " Set the model parameters and hyperparameters using the settings in\n", " `summary_dict`.\n", "\n", " Parameters\n", " ----------\n", " summary_dict : dict\n", " A dictionary with keys 'parameters' and 'hyperparameters',\n", " structured as would be returned by the :meth:`summary` method. 
If\n", " a particular (hyper)parameter is not included in this dict, the\n", " current value will be used.\n", "\n", " Returns\n", " -------\n", " new_kernel : :doc:`Kernel ` instance\n", " A kernel with parameters and hyperparameters adjusted to those\n", " specified in `summary_dict`.\n", " \"\"\"\n", " kr, sd = self, summary_dict\n", "\n", " # collapse `parameters` and `hyperparameters` nested dicts into a single\n", " # merged dictionary\n", " flatten_keys = [\"parameters\", \"hyperparameters\"]\n", " for k in flatten_keys:\n", " if k in sd:\n", " entry = sd[k]\n", " sd.update(entry)\n", " del sd[k]\n", "\n", " for k, v in sd.items():\n", " if k in self.parameters:\n", " kr.parameters[k] = v\n", " if k in self.hyperparameters:\n", " kr.hyperparameters[k] = v\n", " return kr\n", "\n", "\n", "class LinearKernel(KernelBase):\n", " def __init__(self, c0=0):\n", " \"\"\"\n", " The linear (i.e., dot-product) kernel.\n", "\n", " Notes\n", " -----\n", " For input vectors :math:`\\mathbf{x}` and :math:`\\mathbf{y}`, the linear\n", " kernel is:\n", "\n", " .. math::\n", "\n", " k(\\mathbf{x}, \\mathbf{y}) = \\mathbf{x}^\\\\top \\mathbf{y} + c_0\n", "\n", " Parameters\n", " ----------\n", " c0 : float\n", " An \"inhomogeneity\" parameter. When `c0` = 0, the kernel is said to be\n", " homogenous. Default is 1.\n", " \"\"\"\n", " super().__init__()\n", " self.hyperparameters = {\"id\": \"LinearKernel\"}\n", " self.parameters = {\"c0\": c0}\n", "\n", " def _kernel(self, X, Y=None):\n", " \"\"\"\n", " Compute the linear kernel (i.e., dot-product) between all pairs of rows in\n", " `X` and `Y`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " Collection of `N` input vectors\n", " Y : :py:class:`ndarray ` of shape `(M, C)` or None\n", " Collection of `M` input vectors. If None, assume `Y` = `X`.\n", " Default is None.\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(N, M)`\n", " Similarity between `X` and `Y`, where index (`i`, `j`) gives\n", " :math:`k(x_i, y_j)`.\n", " \"\"\"\n", " X, Y = kernel_checks(X, Y)\n", " return X @ Y.T + self.parameters[\"c0\"]\n", "\n", "\n", "class PolynomialKernel(KernelBase):\n", " def __init__(self, d=3, gamma=None, c0=1):\n", " \"\"\"\n", " The degree-`d` polynomial kernel.\n", "\n", " Notes\n", " -----\n", " For input vectors :math:`\\mathbf{x}` and :math:`\\mathbf{y}`, the polynomial\n", " kernel is:\n", "\n", " .. math::\n", "\n", " k(\\mathbf{x}, \\mathbf{y}) = (\\gamma \\mathbf{x}^\\\\top \\mathbf{y} + c_0)^d\n", "\n", " In contrast to the linear kernel, the polynomial kernel also computes\n", " similarities *across* dimensions of the **x** and **y** vectors,\n", " allowing it to account for interactions between features. As an\n", " instance of the dot product family of kernels, the polynomial kernel is\n", " invariant to a rotation of the coordinates about the origin, but *not*\n", " to translations.\n", "\n", " Parameters\n", " ----------\n", " d : int\n", " Degree of the polynomial kernel. Default is 3.\n", " gamma : float or None\n", " A scaling parameter for the dot product between `x` and `y`,\n", " determining the amount of smoothing/resonlution of the kernel.\n", " Larger values result in greater smoothing. If None, defaults to 1 /\n", " `C`. Sometimes referred to as the kernel bandwidth. Default is\n", " None.\n", " c0 : float\n", " Parameter trading off the influence of higher-order versus lower-order\n", " terms in the polynomial. 
If `c0` = 0, the kernel is said to be\n", " homogenous. Default is 1.\n", " \"\"\"\n", " super().__init__()\n", " self.hyperparameters = {\"id\": \"PolynomialKernel\"}\n", " self.parameters = {\"d\": d, \"c0\": c0, \"gamma\": gamma}\n", "\n", " def _kernel(self, X, Y=None):\n", " \"\"\"\n", " Compute the degree-`d` polynomial kernel between all pairs of rows in `X`\n", " and `Y`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " Collection of `N` input vectors\n", " Y : :py:class:`ndarray ` of shape `(M, C)` or None\n", " Collection of `M` input vectors. If None, assume `Y = X`. Default\n", " is None.\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(N, M)`\n", " Similarity between `X` and `Y` where index (`i`, `j`) gives\n", " :math:`k(x_i, y_j)` (i.e., the kernel's Gram-matrix).\n", " \"\"\"\n", " P = self.parameters\n", " X, Y = kernel_checks(X, Y)\n", " gamma = 1 / X.shape[1] if P[\"gamma\"] is None else P[\"gamma\"]\n", " return (gamma * (X @ Y.T) + P[\"c0\"]) ** P[\"d\"]\n", "\n", "\n", "class RBFKernel(KernelBase):\n", " def __init__(self, sigma=None):\n", " \"\"\"\n", " Radial basis function (RBF) / squared exponential kernel.\n", "\n", " Notes\n", " -----\n", " For input vectors :math:`\\mathbf{x}` and :math:`\\mathbf{y}`, the radial\n", " basis function kernel is:\n", "\n", " .. math::\n", "\n", " k(\\mathbf{x}, \\mathbf{y}) = \\exp \\left\\{ -0.5\n", " \\left\\lVert \\\\frac{\\mathbf{x} -\n", " \\mathbf{y}}{\\sigma} \\\\right\\\\rVert_2^2 \\\\right\\}\n", "\n", " The RBF kernel decreases with distance and ranges between zero (in the\n", " limit) to one (when **x** = **y**). Notably, the implied feature space\n", " of the kernel has an infinite number of dimensions.\n", "\n", " Parameters\n", " ----------\n", " sigma : float or array of shape `(C,)` or None\n", " A scaling parameter for the vectors **x** and **y**, producing an\n", " isotropic kernel if a float, or an anistropic kernel if an array of\n", " length `C`. Larger values result in higher resolution / greater\n", " smoothing. If None, defaults to :math:`\\sqrt(C / 2)`. Sometimes\n", " referred to as the kernel 'bandwidth'. Default is None.\n", " \"\"\"\n", " super().__init__()\n", " self.hyperparameters = {\"id\": \"RBFKernel\"}\n", " self.parameters = {\"sigma\": sigma}\n", "\n", " def _kernel(self, X, Y=None):\n", " \"\"\"\n", " Computes the radial basis function (RBF) kernel between all pairs of\n", " rows in `X` and `Y`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " Collection of `N` input vectors, each with dimension `C`.\n", " Y : :py:class:`ndarray ` of shape `(M, C)`\n", " Collection of `M` input vectors. If None, assume `Y` = `X`. Default\n", " is None.\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(N, M)`\n", " Similarity between `X` and `Y` where index (i, j) gives :math:`k(x_i, y_j)`.\n", " \"\"\"\n", " P = self.parameters\n", " X, Y = kernel_checks(X, Y)\n", " sigma = np.sqrt(X.shape[1] / 2) if P[\"sigma\"] is None else P[\"sigma\"]\n", " return np.exp(-0.5 * pairwise_l2_distances(X / sigma, Y / sigma) ** 2)\n", "\n", "\n", "class KernelInitializer(object):\n", " def __init__(self, param=None):\n", " \"\"\"\n", " A class for initializing learning rate schedulers. 
Valid inputs are:\n", " (a) __str__ representations of `KernelBase` instances\n", " (b) `KernelBase` instances\n", " (c) Parameter dicts (e.g., as produced via the :meth:`summary` method in\n", " `KernelBase` instances)\n", "\n", " If `param` is None, return `LinearKernel`.\n", " \"\"\"\n", " self.param = param\n", "\n", " def __call__(self):\n", " param = self.param\n", " if param is None:\n", " kernel = LinearKernel()\n", " elif isinstance(param, KernelBase):\n", " kernel = param\n", " elif isinstance(param, str):\n", " kernel = self.init_from_str()\n", " elif isinstance(param, dict):\n", " kernel = self.init_from_dict()\n", " return kernel\n", "\n", " def init_from_str(self):\n", " r = r\"([a-zA-Z0-9]*)=([^,)]*)\"\n", " kr_str = self.param.lower()\n", " kwargs = dict([(i, eval(j)) for (i, j) in re.findall(r, self.param)])\n", "\n", " if \"linear\" in kr_str:\n", " kernel = LinearKernel(**kwargs)\n", " elif \"polynomial\" in kr_str:\n", " kernel = PolynomialKernel(**kwargs)\n", " elif \"rbf\" in kr_str:\n", " kernel = RBFKernel(**kwargs)\n", " else:\n", " raise NotImplementedError(\"{}\".format(kr_str))\n", " return kernel\n", "\n", " def init_from_dict(self):\n", " S = self.param\n", " sc = S[\"hyperparameters\"] if \"hyperparameters\" in S else None\n", "\n", " if sc is None:\n", " raise ValueError(\"Must have `hyperparameters` key: {}\".format(S))\n", "\n", " if sc and sc[\"id\"] == \"LinearKernel\":\n", " scheduler = LinearKernel().set_params(S)\n", " elif sc and sc[\"id\"] == \"PolynomialKernel\":\n", " scheduler = PolynomialKernel().set_params(S)\n", " elif sc and sc[\"id\"] == \"RBFKernel\":\n", " scheduler = RBFKernel().set_params(S)\n", " elif sc:\n", " raise NotImplementedError(\"{}\".format(sc[\"id\"]))\n", " return scheduler\n", "\n", "\n", "def kernel_checks(X, Y):\n", " X = X.reshape(-1, 1) if X.ndim == 1 else X\n", " Y = X if Y is None else Y\n", " Y = Y.reshape(-1, 1) if Y.ndim == 1 else Y\n", "\n", " assert X.ndim == 2, \"X must have 2 dimensions, but got {}\".format(X.ndim)\n", " assert Y.ndim == 2, \"Y must have 2 dimensions, but got {}\".format(Y.ndim)\n", " assert X.shape[1] == Y.shape[1], \"X and Y must have the same number of columns\"\n", " return X, Y\n", "\n", "\n", "def pairwise_l2_distances(X, Y):\n", " \"\"\"\n", " A fast, vectorized way to compute pairwise l2 distances between rows in `X`\n", " and `Y`.\n", "\n", " Notes\n", " -----\n", " An entry of the pairwise Euclidean distance matrix for two vectors is\n", "\n", " .. math::\n", "\n", " d[i, j] &= \\sqrt{(x_i - y_i) @ (x_i - y_i)} \\\\\\\\\n", " &= \\sqrt{sum (x_i - y_j)^2} \\\\\\\\\n", " &= \\sqrt{sum (x_i)^2 - 2 x_i y_j + (y_j)^2}\n", "\n", " The code below computes the the third line using numpy broadcasting\n", " fanciness to avoid any for loops.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " Collection of `N` input vectors\n", " Y : :py:class:`ndarray ` of shape `(M, C)`\n", " Collection of `M` input vectors. If None, assume `Y` = `X`. Default is\n", " None.\n", "\n", " Returns\n", " -------\n", " dists : :py:class:`ndarray ` of shape `(N, M)`\n", " Pairwise distance matrix. 
Entry (i, j) contains the `L2` distance between\n", " :math:`x_i` and :math:`y_j`.\n", " \"\"\"\n", " D = -2 * X @ Y.T + np.sum(Y ** 2, axis=1) + np.sum(X ** 2, axis=1)[:, np.newaxis]\n", " D[D < 0] = 0 # clip any value less than 0 (a result of numerical imprecision)\n", " return np.sqrt(D)\n"]} {"path": "numpy_ml/utils/data_structures.py", "content": ["import heapq\n", "from copy import copy\n", "from collections import Hashable\n", "\n", "import numpy as np\n", "\n", "from .distance_metrics import euclidean\n", "\n", "#######################################################################\n", "# Priority Queue #\n", "#######################################################################\n", "\n", "\n", "class PQNode(object):\n", " def __init__(self, key, val, priority, entry_id, **kwargs):\n", " \"\"\"A generic node object for holding entries in :class:`PriorityQueue`\"\"\"\n", " self.key = key\n", " self.val = val\n", " self.entry_id = entry_id\n", " self.priority = priority\n", "\n", " def __repr__(self):\n", " fstr = \"PQNode(key={}, val={}, priority={}, entry_id={})\"\n", " return fstr.format(self.key, self.val, self.priority, self.entry_id)\n", "\n", " def to_dict(self):\n", " \"\"\"Return a dictionary representation of the node's contents\"\"\"\n", " d = self.__dict__\n", " d[\"id\"] = \"PQNode\"\n", " return d\n", "\n", " def __gt__(self, other):\n", " if not isinstance(other, PQNode):\n", " return -1\n", " if self.priority == other.priority:\n", " return self.entry_id > other.entry_id\n", " return self.priority > other.priority\n", "\n", " def __ge__(self, other):\n", " if not isinstance(other, PQNode):\n", " return -1\n", " return self.priority >= other.priority\n", "\n", " def __lt__(self, other):\n", " if not isinstance(other, PQNode):\n", " return -1\n", " if self.priority == other.priority:\n", " return self.entry_id < other.entry_id\n", " return self.priority < other.priority\n", "\n", " def __le__(self, other):\n", " if not isinstance(other, PQNode):\n", " return -1\n", " return self.priority <= other.priority\n", "\n", "\n", "class PriorityQueue:\n", " def __init__(self, capacity, heap_order=\"max\"):\n", " \"\"\"\n", " A priority queue implementation using a binary heap.\n", "\n", " Notes\n", " -----\n", " A priority queue is a data structure useful for storing the top\n", " `capacity` largest or smallest elements in a collection of values. 
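Pushing a\n", "        fourth item into a ``PriorityQueue(capacity=3)``, for instance, triggers\n", "        an automatic :meth:`pop` so that only three items remain. 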
As a\n", " result of using a binary heap, ``PriorityQueue`` offers `O(log N)`\n", " :meth:`push` and :meth:`pop` operations.\n", "\n", " Parameters\n", " ----------\n", " capacity: int\n", " The maximum number of items that can be held in the queue.\n", " heap_order: {\"max\", \"min\"}\n", " Whether the priority queue should retain the items with the\n", " `capacity` smallest (`heap_order` = 'min') or `capacity` largest\n", " (`heap_order` = 'max') priorities.\n", " \"\"\"\n", " assert heap_order in [\"max\", \"min\"], \"heap_order must be either 'max' or 'min'\"\n", " self.capacity = capacity\n", " self.heap_order = heap_order\n", "\n", " self._pq = []\n", " self._count = 0\n", " self._entry_counter = 0\n", "\n", " def __repr__(self):\n", " fstr = \"PriorityQueue(capacity={}, heap_order={}) with {} items\"\n", " return fstr.format(self.capacity, self.heap_order, self._count)\n", "\n", " def __len__(self):\n", " return self._count\n", "\n", " def __iter__(self):\n", " return iter(self._pq)\n", "\n", " def push(self, key, priority, val=None):\n", " \"\"\"\n", " Add a new (key, value) pair with priority `priority` to the queue.\n", "\n", " Notes\n", " -----\n", " If the queue is at capacity and `priority` exceeds the priority of the\n", " item with the largest/smallest priority currently in the queue, replace\n", " the current queue item with (`key`, `val`).\n", "\n", " Parameters\n", " ----------\n", " key : hashable object\n", " The key to insert into the queue.\n", " priority : comparable\n", " The priority for the `key`, `val` pair.\n", " val : object\n", " The value associated with `key`. Default is None.\n", " \"\"\"\n", " if self.heap_order == \"max\":\n", " priority = -1 * priority\n", "\n", " item = PQNode(key=key, val=val, priority=priority, entry_id=self._entry_counter)\n", " heapq.heappush(self._pq, item)\n", "\n", " self._count += 1\n", " self._entry_counter += 1\n", "\n", " while self._count > self.capacity:\n", " self.pop()\n", "\n", " def pop(self):\n", " \"\"\"\n", " Remove the item with the largest/smallest (depending on\n", " ``self.heap_order``) priority from the queue and return it.\n", "\n", " Notes\n", " -----\n", " In contrast to :meth:`peek`, this operation is `O(log N)`.\n", "\n", " Returns\n", " -------\n", " item : :class:`PQNode` instance or None\n", " Item with the largest/smallest priority, depending on\n", " ``self.heap_order``.\n", " \"\"\"\n", " item = heapq.heappop(self._pq).to_dict()\n", " if self.heap_order == \"max\":\n", " item[\"priority\"] = -1 * item[\"priority\"]\n", " self._count -= 1\n", " return item\n", "\n", " def peek(self):\n", " \"\"\"\n", " Return the item with the largest/smallest (depending on\n", " ``self.heap_order``) priority *without* removing it from the queue.\n", "\n", " Notes\n", " -----\n", " In contrast to :meth:`pop`, this operation is O(1).\n", "\n", " Returns\n", " -------\n", " item : :class:`PQNode` instance or None\n", " Item with the largest/smallest priority, depending on\n", " ``self.heap_order``.\n", " \"\"\"\n", " item = None\n", " if self._count > 0:\n", " item = copy(self._pq[0].to_dict())\n", " if self.heap_order == \"max\":\n", " item[\"priority\"] = -1 * item[\"priority\"]\n", " return item\n", "\n", "\n", "#######################################################################\n", "# Ball Tree #\n", "#######################################################################\n", "\n", "\n", "class BallTreeNode:\n", " def __init__(self, centroid=None, X=None, y=None):\n", " self.left = None\n", " self.right = 
None\n", " self.radius = None\n", " self.is_leaf = False\n", "\n", " self.data = X\n", " self.targets = y\n", " self.centroid = centroid\n", "\n", " def __repr__(self):\n", " fstr = \"BallTreeNode(centroid={}, is_leaf={})\"\n", " return fstr.format(self.centroid, self.is_leaf)\n", "\n", " def to_dict(self):\n", " d = self.__dict__\n", " d[\"id\"] = \"BallTreeNode\"\n", " return d\n", "\n", "\n", "class BallTree:\n", " def __init__(self, leaf_size=40, metric=None):\n", " \"\"\"\n", " A ball tree data structure.\n", "\n", " Notes\n", " -----\n", " A ball tree is a binary tree in which every node defines a\n", " `D`-dimensional hypersphere (\"ball\") containing a subset of the points\n", " to be searched. Each internal node of the tree partitions the data\n", " points into two disjoint sets which are associated with different\n", " balls. While the balls themselves may intersect, each point is assigned\n", " to one or the other ball in the partition according to its distance\n", " from the ball's center. Each leaf node in the tree defines a ball and\n", " enumerates all data points inside that ball.\n", "\n", " Parameters\n", " ----------\n", " leaf_size : int\n", " The maximum number of datapoints at each leaf. Default is 40.\n", " metric : :doc:`Distance metric ` or None\n", " The distance metric to use for computing nearest neighbors. If\n", " None, use the :func:`~numpy_ml.utils.distance_metrics.euclidean`\n", " metric. Default is None.\n", "\n", " References\n", " ----------\n", " .. [1] Omohundro, S. M. (1989). \"Five balltree construction algorithms\". *ICSI\n", " Technical Report TR-89-063*.\n", " .. [2] Liu, T., Moore, A., & Gray A. (2006). \"New algorithms for efficient\n", " high-dimensional nonparametric classification\". *J. Mach. Learn. Res.,\n", " 7*, 1135-1158.\n", " \"\"\"\n", " self.root = None\n", " self.leaf_size = leaf_size\n", " self.metric = metric if metric is not None else euclidean\n", "\n", " def fit(self, X, y=None):\n", " \"\"\"\n", " Build a ball tree recursively using the O(M log N) `k`-d construction\n", " algorithm.\n", "\n", " Notes\n", " -----\n", " Recursively divides data into nodes defined by a centroid `C` and radius\n", " `r` such that each point below the node lies within the hyper-sphere\n", " defined by `C` and `r`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " An array of `N` examples each with `M` features.\n", " y : :py:class:`ndarray ` of shape `(N, \\*)` or None\n", " An array of target values / labels associated with the entries in\n", " `X`. 
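# ---------------------------------------------------------------------
# Standalone sketch (illustration only; names are hypothetical): one
# split step of the ball-tree construction described above -- pick the
# dimension with the largest variance, split the points at its median,
# and summarize the node by a centroid and a covering radius.
import numpy as np

def split_once(X):
    split_dim = np.argmax(np.var(X, axis=0))         # most-spread feature
    X = X[np.argsort(X[:, split_dim])]               # sort along that axis
    med = X.shape[0] // 2
    centroid = X[med]                                # median point
    radius = np.max(np.linalg.norm(X - centroid, axis=1))
    return centroid, radius, X[:med], X[med:]        # left / right halves

rng = np.random.RandomState(0)
pts = rng.randn(20, 3)
c, r, left, right = split_once(pts)
assert np.all(np.linalg.norm(pts - c, axis=1) <= r + 1e-12)  # ball covers all points
# ---------------------------------------------------------------------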
Default is None.\n", " \"\"\"\n", " centroid, left_X, left_y, right_X, right_y = self._split(X, y)\n", " self.root = BallTreeNode(centroid=centroid)\n", " self.root.radius = np.max([self.metric(centroid, x) for x in X])\n", " self.root.left = self._build_tree(left_X, left_y)\n", " self.root.right = self._build_tree(right_X, right_y)\n", "\n", " def _build_tree(self, X, y):\n", " centroid, left_X, left_y, right_X, right_y = self._split(X, y)\n", "\n", " if X.shape[0] <= self.leaf_size:\n", " leaf = BallTreeNode(centroid=centroid, X=X, y=y)\n", " leaf.radius = np.max([self.metric(centroid, x) for x in X])\n", " leaf.is_leaf = True\n", " return leaf\n", "\n", " node = BallTreeNode(centroid=centroid)\n", " node.radius = np.max([self.metric(centroid, x) for x in X])\n", " node.left = self._build_tree(left_X, left_y)\n", " node.right = self._build_tree(right_X, right_y)\n", " return node\n", "\n", " def _split(self, X, y=None):\n", " # find the dimension with greatest variance\n", " split_dim = np.argmax(np.var(X, axis=0))\n", "\n", " # sort X and y along split_dim\n", " sort_ixs = np.argsort(X[:, split_dim])\n", " X, y = X[sort_ixs], y[sort_ixs] if y is not None else None\n", "\n", " # divide at median value of split_dim\n", " med_ix = X.shape[0] // 2\n", " centroid = X[med_ix] # , split_dim\n", "\n", " # split data into two halves at the centroid (median always appears on\n", " # the right split)\n", " left_X, left_y = X[:med_ix], y[:med_ix] if y is not None else None\n", " right_X, right_y = X[med_ix:], y[med_ix:] if y is not None else None\n", " return centroid, left_X, left_y, right_X, right_y\n", "\n", " def nearest_neighbors(self, k, x):\n", " \"\"\"\n", " Find the `k` nearest neighbors in the ball tree to a query vector `x`\n", " using the KNS1 algorithm.\n", "\n", " Parameters\n", " ----------\n", " k : int\n", " The number of closest points in `X` to return\n", " x : :py:class:`ndarray ` of shape `(1, M)`\n", " The query vector.\n", "\n", " Returns\n", " -------\n", " nearest : list of :class:`PQNode` s of length `k`\n", " List of the `k` points in `X` to closest to the query vector. 
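# ---------------------------------------------------------------------
# Standalone sketch: the pruning rule used by the ball-tree search in
# `_knn` below. By the triangle inequality, any point p inside a ball
# with centroid c and radius r satisfies
#     dist(x, p) >= dist(x, c) - r,
# so a whole ball can be skipped whenever this lower bound already
# exceeds the k-th best distance found so far. A quick numerical check:
import numpy as np

rng = np.random.RandomState(1)
c, r = rng.randn(3), 2.0
directions = rng.randn(100, 3)
directions /= np.linalg.norm(directions, axis=1, keepdims=True)
p = c + directions * (r * rng.rand(100, 1))       # 100 points inside the ball
x = rng.randn(3) * 5                              # a query point
lower_bound = np.linalg.norm(x - c) - r
assert np.all(np.linalg.norm(p - x, axis=1) >= lower_bound - 1e-12)
# ---------------------------------------------------------------------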
The\n", " ``key`` attribute of each :class:`PQNode` contains the point itself, the\n", " ``val`` attribute contains its target, and the ``distance``\n", " attribute contains its distance to the query vector.\n", " \"\"\"\n", " # maintain a max-first priority queue with priority = distance to x\n", " PQ = PriorityQueue(capacity=k, heap_order=\"max\")\n", " nearest = self._knn(k, x, PQ, self.root)\n", " for n in nearest:\n", " n.distance = self.metric(x, n.key)\n", " return nearest\n", "\n", " def _knn(self, k, x, PQ, root):\n", " dist = self.metric\n", " dist_to_ball = dist(x, root.centroid) - root.radius\n", " dist_to_farthest_neighbor = dist(x, PQ.peek()[\"key\"]) if len(PQ) > 0 else np.inf\n", "\n", " if dist_to_ball >= dist_to_farthest_neighbor and len(PQ) == k:\n", " return PQ\n", " if root.is_leaf:\n", " targets = [None] * len(root.data) if root.targets is None else root.targets\n", " for point, target in zip(root.data, targets):\n", " dist_to_x = dist(x, point)\n", " if len(PQ) == k and dist_to_x < dist_to_farthest_neighbor:\n", " PQ.push(key=point, val=target, priority=dist_to_x)\n", " else:\n", " PQ.push(key=point, val=target, priority=dist_to_x)\n", " else:\n", " l_closest = dist(x, root.left.centroid) < dist(x, root.right.centroid)\n", " PQ = self._knn(k, x, PQ, root.left if l_closest else root.right)\n", " PQ = self._knn(k, x, PQ, root.right if l_closest else root.left)\n", " return PQ\n", "\n", "\n", "#######################################################################\n", "# Multinomial Sampler #\n", "#######################################################################\n", "\n", "\n", "class DiscreteSampler:\n", " def __init__(self, probs, log=False, with_replacement=True):\n", " \"\"\"\n", " Sample from an arbitrary multinomial PMF over the first `N` nonnegative\n", " integers using Vose's algorithm for the alias method.\n", "\n", " Notes\n", " -----\n", " Vose's algorithm takes `O(n)` time to initialize, requires `O(n)` memory,\n", " and generates samples in constant time.\n", "\n", " References\n", " ----------\n", " .. [1] Walker, A. J. (1977) \"An efficient method for generating discrete\n", " random variables with general distributions\". *ACM Transactions on\n", " Mathematical Software, 3(3)*, 253-256.\n", "\n", " .. [2] Vose, M. D. (1991) \"A linear algorithm for generating random numbers\n", " with a given distribution\". *IEEE Trans. Softw. Eng., 9*, 972-974.\n", "\n", " .. [3] Schwarz, K (2011) \"Darts, dice, and coins: sampling from a discrete\n", " distribution\". http://www.keithschwarz.com/darts-dice-coins/\n", "\n", " Parameters\n", " ----------\n", " probs : :py:class:`ndarray ` of length `(N,)`\n", " A list of probabilities of the `N` outcomes in the sample space.\n", " `probs[i]` returns the probability of outcome `i`.\n", " log : bool\n", " Whether the probabilities in `probs` are in logspace. Default is\n", " False.\n", " with_replacement : bool\n", " Whether to generate samples with or without replacement. 
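# ---------------------------------------------------------------------
# Standalone sketch of the alias-method *sampling* step (table
# construction happens in `__init__` below). Given the two length-N
# tables, each draw costs O(1): pick a column uniformly, then flip a
# biased coin to either keep that column or jump to its alias. The
# tables here were hand-built for the PMF [0.5, 0.25, 0.25] and are for
# illustration only.
import numpy as np

prob_table = np.array([1.0, 0.75, 0.75])   # P(keep column i)
alias_table = np.array([0, 0, 0])          # fallback outcome per column

def alias_draw(n, rng=np.random):
    cols = rng.randint(0, len(prob_table), size=n)   # uniform column choice
    keep = rng.rand(n) < prob_table[cols]            # biased coin per draw
    return np.where(keep, cols, alias_table[cols])

draws = alias_draw(100000)
print(np.bincount(draws, minlength=3) / len(draws))  # ~ [0.5, 0.25, 0.25]
# ---------------------------------------------------------------------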
Default is\n", " True.\n", " \"\"\"\n", " if not isinstance(probs, np.ndarray):\n", " probs = np.array(probs)\n", "\n", " self.log = log\n", " self.N = len(probs)\n", " self.probs = probs\n", " self.with_replacement = with_replacement\n", "\n", " alias = np.zeros(self.N)\n", " prob = np.zeros(self.N)\n", " scaled_probs = self.probs + np.log(self.N) if log else self.probs * self.N\n", "\n", " selector = scaled_probs < 0 if log else scaled_probs < 1\n", " small, large = np.where(selector)[0].tolist(), np.where(~selector)[0].tolist()\n", "\n", " while len(small) and len(large):\n", " l, g = small.pop(), large.pop()\n", "\n", " alias[l] = g\n", " prob[l] = scaled_probs[l]\n", "\n", " if log:\n", " pg = np.log(np.exp(scaled_probs[g]) + np.exp(scaled_probs[l]) - 1)\n", " else:\n", " pg = scaled_probs[g] + scaled_probs[l] - 1\n", "\n", " scaled_probs[g] = pg\n", " to_small = pg < 0 if log else pg < 1\n", " if to_small:\n", " small.append(g)\n", " else:\n", " large.append(g)\n", "\n", " while len(large):\n", " prob[large.pop()] = 0 if log else 1\n", "\n", " while len(small):\n", " prob[small.pop()] = 0 if log else 1\n", "\n", " self.prob_table = prob\n", " self.alias_table = alias\n", "\n", " def __call__(self, n_samples=1):\n", " \"\"\"\n", " Generate random draws from the `probs` distribution over integers in\n", " [0, N).\n", "\n", " Parameters\n", " ----------\n", " n_samples: int\n", " The number of samples to generate. Default is 1.\n", "\n", " Returns\n", " -------\n", " sample : :py:class:`ndarray ` of shape `(n_samples,)`\n", " A collection of draws from the distribution defined by `probs`.\n", " Each sample is an int in the range `[0, N)`.\n", " \"\"\"\n", " return self.sample(n_samples)\n", "\n", " def sample(self, n_samples=1):\n", " \"\"\"\n", " Generate random draws from the `probs` distribution over integers in\n", " [0, N).\n", "\n", " Parameters\n", " ----------\n", " n_samples: int\n", " The number of samples to generate. Default is 1.\n", "\n", " Returns\n", " -------\n", " sample : :py:class:`ndarray ` of shape `(n_samples,)`\n", " A collection of draws from the distribution defined by `probs`.\n", " Each sample is an int in the range `[0, N)`.\n", " \"\"\"\n", " ixs = np.random.randint(0, self.N, n_samples)\n", " p = np.exp(self.prob_table[ixs]) if self.log else self.prob_table[ixs]\n", " flips = np.random.binomial(1, p)\n", " samples = [ix if f else self.alias_table[ix] for ix, f in zip(ixs, flips)]\n", "\n", " # do recursive rejection sampling to sample without replacement\n", " if not self.with_replacement:\n", " unique = list(set(samples))\n", " while len(samples) != len(unique):\n", " n_new = len(samples) - len(unique)\n", " samples = unique + self.sample(n_new).tolist()\n", " unique = list(set(samples))\n", "\n", " return np.array(samples, dtype=int)\n", "\n", "\n", "#######################################################################\n", "# Dict #\n", "#######################################################################\n", "\n", "\n", "class Dict(dict):\n", " def __init__(self, encoder=None):\n", " \"\"\"\n", " A dictionary subclass which returns the key value if it is not in the\n", " dict.\n", "\n", " Parameters\n", " ----------\n", " encoder : function or None\n", " A function which is applied to a key before adding / retrieving it\n", " from the dictionary. If None, the function defaults to the\n", " identity. 
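# ---------------------------------------------------------------------
# Usage sketch for `DiscreteSampler` above (a rough sanity check, not a
# test from the library): empirical frequencies of many draws should
# track the supplied PMF, and `with_replacement=False` yields unique
# outcomes via the rejection loop in `sample`.
import numpy as np
# from numpy_ml.utils.data_structures import DiscreteSampler  # when used outside this module

probs = np.array([0.1, 0.2, 0.3, 0.4])
sampler = DiscreteSampler(probs, log=False, with_replacement=True)
draws = sampler(100000)
print(np.bincount(draws, minlength=4) / len(draws))   # ~ probs

unique_sampler = DiscreteSampler(probs, with_replacement=False)
assert len(set(unique_sampler(4))) == 4               # each outcome appears once
# ---------------------------------------------------------------------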
Default is None.\n", " \"\"\"\n", " super(Dict, self).__init__()\n", " self._encoder = encoder\n", " self._id_max = 0\n", "\n", " def __setitem__(self, key, value):\n", " if self._encoder is not None:\n", " key = self._encoder(key)\n", " elif not isinstance(key, Hashable):\n", " key = tuple(key)\n", " super(Dict, self).__setitem__(key, value)\n", "\n", " def _encode_key(self, key):\n", " D = super(Dict, self)\n", " enc_key = self._encoder(key)\n", " if D.__contains__(enc_key):\n", " val = D.__getitem__(enc_key)\n", " else:\n", " val = self._id_max\n", " D.__setitem__(enc_key, val)\n", " self._id_max += 1\n", " return val\n", "\n", " def __getitem__(self, key):\n", " self._key = copy.deepcopy(key)\n", " if self._encoder is not None:\n", " return self._encode_key(key)\n", " elif not isinstance(key, Hashable):\n", " key = tuple(key)\n", " return super(Dict, self).__getitem__(key)\n", "\n", " def __missing__(self, key):\n", " return self._key\n"]} {"path": "numpy_ml/utils/windows.py", "content": ["import numpy as np\n", "\n", "\n", "def blackman_harris(window_len, symmetric=False):\n", " \"\"\"\n", " The Blackman-Harris window.\n", "\n", " Notes\n", " -----\n", " The Blackman-Harris window is an instance of the more general class of\n", " cosine-sum windows where `K=3`. Additional coefficients extend the Hamming\n", " window to further minimize the magnitude of the nearest side-lobe in the\n", " frequency response.\n", "\n", " .. math::\n", " \\\\text{bh}(n) = a_0 - a_1 \\cos\\left(\\\\frac{2 \\pi n}{N}\\\\right) +\n", " a_2 \\cos\\left(\\\\frac{4 \\pi n }{N}\\\\right) -\n", " a_3 \\cos\\left(\\\\frac{6 \\pi n}{N}\\\\right)\n", "\n", " where `N` = `window_len` - 1, :math:`a_0` = 0.35875, :math:`a_1` = 0.48829,\n", " :math:`a_2` = 0.14128, and :math:`a_3` = 0.01168.\n", "\n", " Parameters\n", " ----------\n", " window_len : int\n", " The length of the window in samples. Should be equal to the\n", " `frame_width` if applying to a windowed signal.\n", " symmetric : bool\n", " If False, create a 'periodic' window that can be used in with an FFT /\n", " in spectral analysis. If True, generate a symmetric window that can be\n", " used in, e.g., filter design. Default is False.\n", "\n", " Returns\n", " -------\n", " window : :py:class:`ndarray ` of shape `(window_len,)`\n", " The window\n", " \"\"\"\n", " return generalized_cosine(\n", " window_len, [0.35875, 0.48829, 0.14128, 0.01168], symmetric\n", " )\n", "\n", "\n", "def hamming(window_len, symmetric=False):\n", " \"\"\"\n", " The Hamming window.\n", "\n", " Notes\n", " -----\n", " The Hamming window is an instance of the more general class of cosine-sum\n", " windows where `K=1` and :math:`a_0 = 0.54`. Coefficients selected to\n", " minimize the magnitude of the nearest side-lobe in the frequency response.\n", "\n", " .. math::\n", "\n", " \\\\text{hamming}(n) = 0.54 -\n", " 0.46 \\cos\\left(\\\\frac{2 \\pi n}{\\\\text{window_len} - 1}\\\\right)\n", "\n", " Parameters\n", " ----------\n", " window_len : int\n", " The length of the window in samples. Should be equal to the\n", " `frame_width` if applying to a windowed signal.\n", " symmetric : bool\n", " If False, create a 'periodic' window that can be used in with an FFT /\n", " in spectral analysis. If True, generate a symmetric window that can be\n", " used in, e.g., filter design. 
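# ---------------------------------------------------------------------
# Standalone sketch: the symmetric Hamming window written out directly
# from the cosine-sum formula quoted above. NumPy ships the same window
# as `np.hamming`, so the two agree to numerical precision; comparing
# against this module's `hamming(window_len, symmetric=True)` (built
# from `generalized_cosine` below) would be analogous.
import numpy as np

N = 64
n = np.arange(N)
w = 0.54 - 0.46 * np.cos(2 * np.pi * n / (N - 1))
assert np.allclose(w, np.hamming(N))
# ---------------------------------------------------------------------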
Default is False.\n", "\n", " Returns\n", " -------\n", " window : :py:class:`ndarray ` of shape `(window_len,)`\n", " The window\n", " \"\"\"\n", " return generalized_cosine(window_len, [0.54, 1 - 0.54], symmetric)\n", "\n", "\n", "def hann(window_len, symmetric=False):\n", " \"\"\"\n", " The Hann window.\n", "\n", " Notes\n", " -----\n", " The Hann window is an instance of the more general class of cosine-sum\n", " windows where `K=1` and :math:`a_0` = 0.5. Unlike the Hamming window, the\n", " end points of the Hann window touch zero.\n", "\n", " .. math::\n", "\n", " \\\\text{hann}(n) = 0.5 - 0.5 \\cos\\left(\\\\frac{2 \\pi n}{\\\\text{window_len} - 1}\\\\right)\n", "\n", " Parameters\n", " ----------\n", " window_len : int\n", " The length of the window in samples. Should be equal to the\n", " `frame_width` if applying to a windowed signal.\n", " symmetric : bool\n", " If False, create a 'periodic' window that can be used in with an FFT /\n", " in spectral analysis. If True, generate a symmetric window that can be\n", " used in, e.g., filter design. Default is False.\n", "\n", " Returns\n", " -------\n", " window : :py:class:`ndarray ` of shape `(window_len,)`\n", " The window\n", " \"\"\"\n", " return generalized_cosine(window_len, [0.5, 0.5], symmetric)\n", "\n", "\n", "def generalized_cosine(window_len, coefs, symmetric=False):\n", " \"\"\"\n", " The generalized cosine family of window functions.\n", "\n", " Notes\n", " -----\n", " The generalized cosine window is a simple weighted sum of cosine terms.\n", "\n", " For :math:`n \\in \\{0, \\ldots, \\\\text{window_len} \\}`:\n", "\n", " .. math::\n", "\n", " \\\\text{GCW}(n) = \\sum_{k=0}^K (-1)^k a_k \\cos\\left(\\\\frac{2 \\pi k n}{\\\\text{window_len}}\\\\right)\n", "\n", " Parameters\n", " ----------\n", " window_len : int\n", " The length of the window in samples. Should be equal to the\n", " `frame_width` if applying to a windowed signal.\n", " coefs: list of floats\n", " The :math:`a_k` coefficient values\n", " symmetric : bool\n", " If False, create a 'periodic' window that can be used in with an FFT /\n", " in spectral analysis. If True, generate a symmetric window that can be\n", " used in, e.g., filter design. 
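# ---------------------------------------------------------------------
# Standalone sketch of the `symmetric` flag discussed above: a
# "periodic" window of length N (for FFT / spectral analysis) is a
# symmetric window of length N + 1 with its final sample dropped, which
# is how `generalized_cosine` below implements it. Illustrated with
# NumPy's Hann window:
import numpy as np

N = 32
periodic_hann = np.hanning(N + 1)[:-1]    # periodic variant, length N
symmetric_hann = np.hanning(N)            # symmetric variant, length N
assert len(periodic_hann) == N
assert not np.allclose(periodic_hann, symmetric_hann)        # the two differ
assert np.isclose(periodic_hann[0], 0.0) and periodic_hann[-1] > 0.0
# ---------------------------------------------------------------------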
Default is False.\n", "\n", " Returns\n", " -------\n", " window : :py:class:`ndarray ` of shape `(window_len,)`\n", " The window\n", " \"\"\"\n", " window_len += 1 if not symmetric else 0\n", " entries = np.linspace(-np.pi, np.pi, window_len) # (-1)^k * 2pi*n / window_len\n", " window = np.sum([ak * np.cos(k * entries) for k, ak in enumerate(coefs)], axis=0)\n", " return window[:-1] if not symmetric else window\n", "\n", "\n", "class WindowInitializer:\n", " def __call__(self, window):\n", " if window == \"hamming\":\n", " return hamming\n", " elif window == \"blackman_harris\":\n", " return blackman_harris\n", " elif window == \"hann\":\n", " return hann\n", " elif window == \"generalized_cosine\":\n", " return generalized_cosine\n", " else:\n", " raise NotImplementedError(\"{}\".format(window))\n"]} {"path": "numpy_ml/utils/graphs.py", "content": ["from abc import ABC, abstractmethod\n", "from collections import defaultdict\n", "from itertools import combinations, permutations\n", "\n", "import numpy as np\n", "\n", "#######################################################################\n", "# Graph Components #\n", "#######################################################################\n", "\n", "\n", "class Edge(object):\n", " def __init__(self, fr, to, w=None):\n", " \"\"\"\n", " A generic directed edge object.\n", "\n", " Parameters\n", " ----------\n", " fr: int\n", " The id of the vertex the edge goes from\n", " to: int\n", " The id of the vertex the edge goes to\n", " w: float, :class:`Object` instance, or None\n", " The edge weight, if applicable. If weight is an arbitrary Object it\n", " must have a method called 'sample' which takes no arguments and\n", " returns a random sample from the weight distribution. If `w` is\n", " None, no weight is assumed. 
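# ---------------------------------------------------------------------
# Standalone sketch: the "weight may be any object with a .sample()
# method" convention described above lets an edge carry a *stochastic*
# weight. `GaussianWeight` is hypothetical and for illustration only;
# the `Edge.weight` property below calls `.sample()` when the stored
# weight exposes it and returns the raw value otherwise.
import numpy as np

class GaussianWeight:
    def __init__(self, mean, std):
        self.mean, self.std = mean, std

    def sample(self):
        return np.random.normal(self.mean, self.std)

fixed_w = 2.0                         # deterministic edge weight
random_w = GaussianWeight(2.0, 0.1)   # stochastic weight, redrawn on each access
# e.g. Edge(fr=0, to=1, w=random_w).weight -> a fresh draw near 2.0
# ---------------------------------------------------------------------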
Default is None.\n", " \"\"\"\n", " self.fr = fr\n", " self.to = to\n", " self._w = w\n", "\n", " def __repr__(self):\n", " return \"{} -> {}, weight: {}\".format(self.fr, self.to, self._w)\n", "\n", " @property\n", " def weight(self):\n", " return self._w.sample() if hasattr(self._w, \"sample\") else self._w\n", "\n", " def reverse(self):\n", " \"\"\"Reverse the edge direction\"\"\"\n", " return Edge(self.t, self.f, self.w)\n", "\n", "\n", "#######################################################################\n", "# Graph Types #\n", "#######################################################################\n", "\n", "\n", "class Graph(ABC):\n", " def __init__(self, V, E):\n", " self._I2V = {i: v for i, v in zip(range(len(V)), V)}\n", " self._V2I = {v: i for i, v in zip(range(len(V)), V)}\n", " self._G = {i: set() for i in range(len(V))}\n", " self._V = V\n", " self._E = E\n", "\n", " self._build_adjacency_list()\n", "\n", " def __getitem__(self, v_i):\n", " return self.get_neighbors(v_i)\n", "\n", " def get_index(self, v):\n", " \"\"\"Get the internal index for a given vetex\"\"\"\n", " return self._V2I[v]\n", "\n", " def get_vertex(self, v_i):\n", " \"\"\"Get the original vertex from a given internal index\"\"\"\n", " return self._I2V[v_i]\n", "\n", " @property\n", " def vertices(self):\n", " return self._V\n", "\n", " @property\n", " def indices(self):\n", " return list(range(len(self.vertices)))\n", "\n", " @property\n", " def edges(self):\n", " return self._E\n", "\n", " def get_neighbors(self, v_i):\n", " \"\"\"\n", " Return the internal indices of the vertices reachable from the vertex\n", " with index `v_i`.\n", " \"\"\"\n", " return [self._V2I[e.to] for e in self._G[v_i]]\n", "\n", " def to_matrix(self):\n", " \"\"\"Return an adjacency matrix representation of the graph\"\"\"\n", " adj_mat = np.zeros((len(self._V), len(self._V)))\n", " for e in self.edges:\n", " fr, to = self._V2I[e.fr], self._V2I[e.to]\n", " adj_mat[fr, to] = 1 if e.weight is None else e.weight\n", " return adj_mat\n", "\n", " def to_adj_dict(self):\n", " \"\"\"Return an adjacency dictionary representation of the graph\"\"\"\n", " adj_dict = defaultdict(lambda: list())\n", " for e in self.edges:\n", " adj_dict[e.fr].append(e)\n", " return adj_dict\n", "\n", " def path_exists(self, s_i, e_i):\n", " \"\"\"\n", " Check whether a path exists from vertex index `s_i` to `e_i`.\n", "\n", " Parameters\n", " ----------\n", " s_i: Int\n", " The interal index of the start vertex\n", " e_i: Int\n", " The internal index of the end vertex\n", "\n", " Returns\n", " -------\n", " path_exists : Boolean\n", " Whether or not a valid path exists between `s_i` and `e_i`.\n", " \"\"\"\n", " queue = [(s_i, [s_i])]\n", " while len(queue):\n", " c_i, path = queue.pop(0)\n", " nbrs_not_on_path = set(self.get_neighbors(c_i)) - set(path)\n", "\n", " for n_i in nbrs_not_on_path:\n", " queue.append((n_i, path + [n_i]))\n", " if n_i == e_i:\n", " return True\n", " return False\n", "\n", " def all_paths(self, s_i, e_i):\n", " \"\"\"\n", " Find all simple paths between `s_i` and `e_i` in the graph.\n", "\n", " Notes\n", " -----\n", " Uses breadth-first search. Ignores all paths with repeated vertices.\n", "\n", " Parameters\n", " ----------\n", " s_i: Int\n", " The interal index of the start vertex\n", " e_i: Int\n", " The internal index of the end vertex\n", "\n", " Returns\n", " -------\n", " complete_paths : list of lists\n", " A list of all paths from `s_i` to `e_i`. 
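# ---------------------------------------------------------------------
# Standalone sketch (not the class method): the same breadth-first
# enumeration of simple paths that `all_paths` implements below, shown
# on a plain adjacency dict so the queue-of-partial-paths idea is easy
# to trace.
def all_simple_paths(adj, start, end):
    paths, queue = [], [(start, [start])]
    while queue:
        node, path = queue.pop(0)
        for nbr in set(adj.get(node, [])) - set(path):  # skip repeated vertices
            if nbr == end:
                paths.append(path + [nbr])
            else:
                queue.append((nbr, path + [nbr]))
    return paths

adj = {0: [1, 2], 1: [2, 3], 2: [3], 3: []}
print(all_simple_paths(adj, 0, 3))
# three simple paths: [0, 1, 3], [0, 2, 3], [0, 1, 2, 3] (order may vary)
# ---------------------------------------------------------------------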
Each path is represented\n", " as a list of interal vertex indices.\n", " \"\"\"\n", " complete_paths = []\n", " queue = [(s_i, [s_i])]\n", "\n", " while len(queue):\n", " c_i, path = queue.pop(0)\n", " nbrs_not_on_path = set(self.get_neighbors(c_i)) - set(path)\n", "\n", " for n_i in nbrs_not_on_path:\n", " if n_i == e_i:\n", " complete_paths.append(path + [n_i])\n", " else:\n", " queue.append((n_i, path + [n_i]))\n", "\n", " return complete_paths\n", "\n", " @abstractmethod\n", " def _build_adjacency_list(self):\n", " pass\n", "\n", "\n", "class DiGraph(Graph):\n", " def __init__(self, V, E):\n", " \"\"\"\n", " A generic directed graph object.\n", "\n", " Parameters\n", " ----------\n", " V : list\n", " A list of vertex IDs.\n", " E : list of :class:`Edge ` objects\n", " A list of directed edges connecting pairs of vertices in ``V``.\n", " \"\"\"\n", " super().__init__(V, E)\n", " self.is_directed = True\n", " self._topological_ordering = []\n", "\n", " def _build_adjacency_list(self):\n", " \"\"\"Encode directed graph as an adjancency list\"\"\"\n", " # assumes no parallel edges\n", " for e in self.edges:\n", " fr_i = self._V2I[e.fr]\n", " self._G[fr_i].add(e)\n", "\n", " def reverse(self):\n", " \"\"\"Reverse the direction of all edges in the graph\"\"\"\n", " return DiGraph(self.vertices, [e.reverse() for e in self.edges])\n", "\n", " def topological_ordering(self):\n", " \"\"\"\n", " Returns a (non-unique) topological sort / linearization of the nodes\n", " IFF the graph is acyclic, otherwise returns None.\n", "\n", " Notes\n", " -----\n", " A topological sort is an ordering on the nodes in `G` such that for every\n", " directed edge :math:`u \\\\rightarrow v` in the graph, `u` appears before\n", " `v` in the ordering. The topological ordering is produced by ordering\n", " the nodes in `G` by their DFS \"last visit time,\" from greatest to\n", " smallest.\n", "\n", " This implementation follows a recursive, DFS-based approach [1]_ which\n", " may break if the graph is very large. For an iterative version, see\n", " Khan's algorithm [2]_ .\n", "\n", " References\n", " ----------\n", " .. [1] Tarjan, R. (1976), Edge-disjoint spanning trees and depth-first\n", " search, *Acta Informatica, 6 (2)*: 171\u2013185.\n", " .. [2] Kahn, A. 
(1962), Topological sorting of large networks,\n", " *Communications of the ACM, 5 (11)*: 558\u2013562.\n", "\n", " Returns\n", " -------\n", " ordering : list or None\n", " A topoligical ordering of the vertex indices if the graph is a DAG,\n", " otherwise None.\n", " \"\"\"\n", " ordering = []\n", " visited = set()\n", "\n", " def dfs(v_i, path=None):\n", " \"\"\"A simple DFS helper routine\"\"\"\n", " path = set([v_i]) if path is None else path\n", " for nbr_i in self.get_neighbors(v_i):\n", " if nbr_i in path:\n", " return True # cycle detected!\n", " elif nbr_i not in visited:\n", " visited.add(nbr_i)\n", " path.add(nbr_i)\n", " is_cyclic = dfs(nbr_i, path)\n", " if is_cyclic:\n", " return True\n", "\n", " # insert to the beginning of the ordering\n", " ordering.insert(0, v_i)\n", " path -= set([v_i])\n", " return False\n", "\n", " for s_i in self.indices:\n", " if s_i not in visited:\n", " visited.add(s_i)\n", " is_cyclic = dfs(s_i)\n", "\n", " if is_cyclic:\n", " return None\n", "\n", " return ordering\n", "\n", " def is_acyclic(self):\n", " \"\"\"Check whether the graph contains cycles\"\"\"\n", " return self.topological_ordering() is not None\n", "\n", "\n", "class UndirectedGraph(Graph):\n", " def __init__(self, V, E):\n", " \"\"\"\n", " A generic undirected graph object.\n", "\n", " Parameters\n", " ----------\n", " V : list\n", " A list of vertex IDs.\n", " E : list of :class:`Edge ` objects\n", " A list of edges connecting pairs of vertices in ``V``. For any edge\n", " connecting vertex `u` to vertex `v`, :class:`UndirectedGraph\n", " ` will assume that there\n", " exists a corresponding edge connecting `v` to `u`, even if this is\n", " not present in `E`.\n", " \"\"\"\n", " super().__init__(V, E)\n", " self.is_directed = False\n", "\n", " def _build_adjacency_list(self):\n", " \"\"\"Encode undirected, unweighted graph as an adjancency list\"\"\"\n", " # assumes no parallel edges\n", " # each edge appears twice as (u,v) and (v,u)\n", " for e in self.edges:\n", " fr_i = self._V2I[e.fr]\n", " to_i = self._V2I[e.to]\n", "\n", " self._G[fr_i].add(e)\n", " self._G[to_i].add(e.reverse())\n", "\n", "\n", "#######################################################################\n", "# Graph Generators #\n", "#######################################################################\n", "\n", "\n", "def random_unweighted_graph(n_vertices, edge_prob=0.5, directed=False):\n", " \"\"\"\n", " Generate an unweighted Erd\u0151s-R\u00e9nyi random graph [*]_.\n", "\n", " References\n", " ----------\n", " .. [*] Erd\u0151s, P. and R\u00e9nyi, A. (1959). On Random Graphs, *Publ. Math. 6*, 290.\n", "\n", " Parameters\n", " ----------\n", " n_vertices : int\n", " The number of vertices in the graph.\n", " edge_prob : float in [0, 1]\n", " The probability of forming an edge between two vertices. Default is\n", " 0.5.\n", " directed : bool\n", " Whether the edges in the graph should be directed. 
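# ---------------------------------------------------------------------
# Standalone sketch: the iterative (Kahn) topological sort mentioned in
# the `topological_ordering` notes above, written against a plain
# adjacency dict. Repeatedly remove a vertex with in-degree 0; if some
# vertices are never removed, the graph contains a cycle.
from collections import deque

def kahn_toposort(adj):
    indeg = {v: 0 for v in adj}
    for v in adj:
        for u in adj[v]:
            indeg[u] += 1
    queue = deque(v for v, d in indeg.items() if d == 0)
    order = []
    while queue:
        v = queue.popleft()
        order.append(v)
        for u in adj[v]:
            indeg[u] -= 1
            if indeg[u] == 0:
                queue.append(u)
    return order if len(order) == len(adj) else None  # None => cycle

print(kahn_toposort({0: [1, 2], 1: [3], 2: [3], 3: []}))  # e.g. [0, 1, 2, 3]
print(kahn_toposort({0: [1], 1: [0]}))                    # None (cycle)
# ---------------------------------------------------------------------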
Default is False.\n", "\n", " Returns\n", " -------\n", " G : :class:`Graph` instance\n", " The resulting random graph.\n", " \"\"\"\n", " vertices = list(range(n_vertices))\n", " candidates = permutations(vertices, 2) if directed else combinations(vertices, 2)\n", "\n", " edges = []\n", " for (fr, to) in candidates:\n", " if np.random.rand() <= edge_prob:\n", " edges.append(Edge(fr, to))\n", "\n", " return DiGraph(vertices, edges) if directed else UndirectedGraph(vertices, edges)\n", "\n", "\n", "def random_DAG(n_vertices, edge_prob=0.5):\n", " \"\"\"\n", " Create a 'random' unweighted directed acyclic graph by pruning all the\n", " backward connections from a random graph.\n", "\n", " Parameters\n", " ----------\n", " n_vertices : int\n", " The number of vertices in the graph.\n", " edge_prob : float in [0, 1]\n", " The probability of forming an edge between two vertices in the\n", " underlying random graph, before edge pruning. Default is 0.5.\n", "\n", " Returns\n", " -------\n", " G : :class:`Graph` instance\n", " The resulting DAG.\n", " \"\"\"\n", " G = random_unweighted_graph(n_vertices, edge_prob, directed=True)\n", "\n", " # prune edges to remove backwards connections between vertices\n", " G = DiGraph(G.vertices, [e for e in G.edges if e.fr < e.to])\n", "\n", " # if we pruned away all the edges, generate a new graph\n", " while not len(G.edges):\n", " G = random_unweighted_graph(n_vertices, edge_prob, directed=True)\n", " G = DiGraph(G.vertices, [e for e in G.edges if e.fr < e.to])\n", " return G\n"]} {"path": "numpy_ml/utils/testing.py", "content": ["\"\"\"Utilities for writing unit tests\"\"\"\n", "import numbers\n", "import numpy as np\n", "\n", "\n", "#######################################################################\n", "# Assertions #\n", "#######################################################################\n", "\n", "\n", "def is_symmetric(X):\n", " \"\"\"Check that an array `X` is symmetric along its main diagonal\"\"\"\n", " return np.allclose(X, X.T)\n", "\n", "\n", "def is_symmetric_positive_definite(X):\n", " \"\"\"Check that a matrix `X` is a symmetric and positive-definite.\"\"\"\n", " if is_symmetric(X):\n", " try:\n", " # if matrix is symmetric, check whether the Cholesky decomposition\n", " # (defined only for symmetric/Hermitian positive definite matrices)\n", " # exists\n", " np.linalg.cholesky(X)\n", " return True\n", " except np.linalg.LinAlgError:\n", " return False\n", " return False\n", "\n", "\n", "def is_stochastic(X):\n", " \"\"\"True if `X` contains probabilities that sum to 1 along the columns\"\"\"\n", " msg = \"Array should be stochastic along the columns\"\n", " assert len(X[X < 0]) == len(X[X > 1]) == 0, msg\n", " assert np.allclose(np.sum(X, axis=1), np.ones(X.shape[0])), msg\n", " return True\n", "\n", "\n", "def is_number(a):\n", " \"\"\"Check that a value `a` is numeric\"\"\"\n", " return isinstance(a, numbers.Number)\n", "\n", "\n", "def is_one_hot(x):\n", " \"\"\"Return True if array `x` is a binary array with a single 1\"\"\"\n", " msg = \"Matrix should be one-hot binary\"\n", " assert np.array_equal(x, x.astype(bool)), msg\n", " assert np.allclose(np.sum(x, axis=1), np.ones(x.shape[0])), msg\n", " return True\n", "\n", "\n", "def is_binary(x):\n", " \"\"\"Return True if array `x` consists only of binary values\"\"\"\n", " msg = \"Matrix must be binary\"\n", " assert np.array_equal(x, x.astype(bool)), msg\n", " return True\n", "\n", "\n", "#######################################################################\n", "# Data 
Generators #\n", "#######################################################################\n", "\n", "\n", "def random_one_hot_matrix(n_examples, n_classes):\n", " \"\"\"Create a random one-hot matrix of shape (`n_examples`, `n_classes`)\"\"\"\n", " X = np.eye(n_classes)\n", " X = X[np.random.choice(n_classes, n_examples)]\n", " return X\n", "\n", "\n", "def random_stochastic_matrix(n_examples, n_classes):\n", " \"\"\"Create a random stochastic matrix of shape (`n_examples`, `n_classes`)\"\"\"\n", " X = np.random.rand(n_examples, n_classes)\n", " X /= X.sum(axis=1, keepdims=True)\n", " return X\n", "\n", "\n", "def random_tensor(shape, standardize=False):\n", " \"\"\"\n", " Create a random real-valued tensor of shape `shape`. If `standardize` is\n", " True, ensure each column has mean 0 and std 1.\n", " \"\"\"\n", " offset = np.random.randint(-300, 300, shape)\n", " X = np.random.rand(*shape) + offset\n", "\n", " if standardize:\n", " eps = np.finfo(float).eps\n", " X = (X - X.mean(axis=0)) / (X.std(axis=0) + eps)\n", " return X\n", "\n", "\n", "def random_binary_tensor(shape, sparsity=0.5):\n", " \"\"\"\n", " Create a random binary tensor of shape `shape`. `sparsity` is a value\n", " between 0 and 1 controlling the ratio of 0s to 1s in the output tensor.\n", " \"\"\"\n", " return (np.random.rand(*shape) >= (1 - sparsity)).astype(float)\n", "\n", "\n", "def random_paragraph(n_words, vocab=None):\n", " \"\"\"\n", " Generate a random paragraph consisting of `n_words` words. If `vocab` is\n", " not None, words will be drawn at random from this list. Otherwise, words\n", " will be sampled uniformly from a collection of 26 Latin words.\n", " \"\"\"\n", " if vocab is None:\n", " vocab = [\n", " \"at\",\n", " \"stet\",\n", " \"accusam\",\n", " \"aliquyam\",\n", " \"clita\",\n", " \"lorem\",\n", " \"ipsum\",\n", " \"dolor\",\n", " \"dolore\",\n", " \"dolores\",\n", " \"sit\",\n", " \"amet\",\n", " \"consetetur\",\n", " \"sadipscing\",\n", " \"elitr\",\n", " \"sed\",\n", " \"diam\",\n", " \"nonumy\",\n", " \"eirmod\",\n", " \"duo\",\n", " \"ea\",\n", " \"eos\",\n", " \"erat\",\n", " \"est\",\n", " \"et\",\n", " \"gubergren\",\n", " ]\n", " return [np.random.choice(vocab) for _ in range(n_words)]\n", "\n", "\n", "#######################################################################\n", "# Custom Warnings #\n", "#######################################################################\n", "\n", "\n", "class DependencyWarning(RuntimeWarning):\n", " pass\n"]} {"path": "numpy_ml/ngram/ngram.py", "content": ["\"\"\"A module for different N-gram smoothing models\"\"\"\n", "import textwrap\n", "from abc import ABC, abstractmethod\n", "from collections import Counter\n", "\n", "import numpy as np\n", "\n", "from numpy_ml.linear_models import LinearRegression\n", "from numpy_ml.preprocessing.nlp import tokenize_words, ngrams, strip_punctuation\n", "\n", "\n", "class NGramBase(ABC):\n", " def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True):\n", " \"\"\"\n", " A simple word-level N-gram language model.\n", "\n", " Notes\n", " -----\n", " This is not optimized code and will be slow for large corpora. 
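# ---------------------------------------------------------------------
# Standalone sketch: the core bookkeeping that `_train` performs below,
# shown on one toy sentence -- pad the token list with begin/end
# markers, slide a window of size N over it, and tally the resulting
# n-grams with a Counter. The marker names here are illustrative only.
from collections import Counter

def count_ngrams(tokens, N, bol="<bol>", eol="<eol>"):
    padded = [bol] * max(1, N - 1) + tokens + [eol] * max(1, N - 1)
    return Counter(zip(*(padded[i:] for i in range(N))))

print(count_ngrams(["the", "cat", "sat"], N=2))
# -> counts of ('<bol>', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', '<eol>'), each 1
# ---------------------------------------------------------------------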
To see\n", " how industry-scale NGram models are handled, see the SRLIM-format:\n", "\n", " http://www.speech.sri.com/projects/srilm/\n", " \"\"\"\n", " self.N = N\n", " self.unk = unk\n", " self.filter_stopwords = filter_stopwords\n", " self.filter_punctuation = filter_punctuation\n", "\n", " self.hyperparameters = {\n", " \"N\": N,\n", " \"unk\": unk,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " }\n", "\n", " super().__init__()\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " \"\"\"\n", " Compile the n-gram counts for the text(s) in `corpus_fp`.\n", "\n", " Notes\n", " -----\n", " After running `train`, the ``self.counts`` attribute will store\n", " dictionaries of the `N`, `N-1`, ..., 1-gram counts.\n", "\n", " Parameters\n", " ----------\n", " corpus_fp : str\n", " The path to a newline-separated text corpus file.\n", " vocab : :class:`~numpy_ml.preprocessing.nlp.Vocabulary` instance or None\n", " If not None, only the words in `vocab` will be used to construct\n", " the language model; all out-of-vocabulary words will either be\n", " mappend to ```` (if ``self.unk = True``) or removed (if\n", " ``self.unk = False``). Default is None.\n", " encoding : str or None\n", " Specifies the text encoding for corpus. Common entries are 'utf-8',\n", " 'utf-8-sig', 'utf-16'. Default is None.\n", " \"\"\"\n", " return self._train(corpus_fp, vocab=vocab, encoding=encoding)\n", "\n", " def _train(self, corpus_fp, vocab=None, encoding=None):\n", " \"\"\"Actual N-gram training logic\"\"\"\n", " H = self.hyperparameters\n", " grams = {N: [] for N in range(1, self.N + 1)}\n", " counts = {N: Counter() for N in range(1, self.N + 1)}\n", " filter_stop, filter_punc = H[\"filter_stopwords\"], H[\"filter_punctuation\"]\n", "\n", " _n_words = 0\n", " tokens = {\"\"}\n", " bol, eol = [\"\"], [\"\"]\n", "\n", " with open(corpus_fp, \"r\", encoding=encoding) as text:\n", " for line in text:\n", " line = strip_punctuation(line) if filter_punc else line\n", " words = tokenize_words(line, filter_stopwords=filter_stop)\n", "\n", " if vocab is not None:\n", " words = vocab.filter(words, H[\"unk\"])\n", "\n", " if len(words) == 0:\n", " continue\n", "\n", " _n_words += len(words)\n", " tokens.update(words)\n", "\n", " # calculate n, n-1, ... 
1-grams\n", " for N in range(1, self.N + 1):\n", " words_padded = bol * max(1, N - 1) + words + eol * max(1, N - 1)\n", " grams[N].extend(ngrams(words_padded, N))\n", "\n", " for N in counts.keys():\n", " counts[N].update(grams[N])\n", "\n", " n_words = {N: np.sum(list(counts[N].values())) for N in range(1, self.N + 1)}\n", " n_words[1] = _n_words\n", "\n", " n_tokens = {N: len(counts[N]) for N in range(2, self.N + 1)}\n", " n_tokens[1] = len(vocab) if vocab is not None else len(tokens)\n", "\n", " self.counts = counts\n", " self.n_words = n_words\n", " self.n_tokens = n_tokens\n", "\n", " def completions(self, words, N):\n", " \"\"\"\n", " Return the distribution over proposed next words under the `N`-gram\n", " language model.\n", "\n", " Parameters\n", " ----------\n", " words : list or tuple of strings\n", " The initial sequence of words\n", " N : int\n", " The gram-size of the language model to use to generate completions\n", "\n", " Returns\n", " -------\n", " probs : list of (word, log_prob) tuples\n", " The list of possible next words and their log probabilities under\n", " the `N`-gram language model (unsorted)\n", " \"\"\"\n", " N = min(N, len(words) + 1)\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", " assert len(words) >= N - 1, \"`words` must have at least {} words\".format(N - 1)\n", "\n", " probs = []\n", " base = tuple(w.lower() for w in words[-N + 1 :])\n", " for k in self.counts[N].keys():\n", " if k[:-1] == base:\n", " c_prob = self._log_ngram_prob(base + k[-1:])\n", " probs.append((k[-1], c_prob))\n", " return probs\n", "\n", " def generate(self, N, seed_words=[\"\"], n_sentences=5):\n", " \"\"\"\n", " Use the `N`-gram language model to generate sentences.\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The gram-size of the model to generate from\n", " seed_words : list of strs\n", " A list of seed words to use to condition the initial sentence\n", " generation. Default is ``[\"\"]``.\n", " sentences : int\n", " The number of sentences to generate from the `N`-gram model.\n", " Default is 50.\n", "\n", " Returns\n", " -------\n", " sentences : str\n", " Samples from the `N`-gram model, joined by white spaces, with\n", " individual sentences separated by newlines.\n", " \"\"\"\n", " counter = 0\n", " sentences = []\n", " words = seed_words.copy()\n", " while counter < n_sentences:\n", " nextw, probs = zip(*self.completions(words, N))\n", " probs = np.exp(probs) / np.exp(probs).sum() # renormalize probs if smoothed\n", " next_word = np.random.choice(nextw, p=probs)\n", "\n", " # if we reach the end of a sentence, save it and start a new one\n", " if next_word == \"\":\n", " S = \" \".join([w for w in words if w != \"\"])\n", " S = textwrap.fill(S, 90, initial_indent=\"\", subsequent_indent=\" \")\n", " print(S)\n", " words.append(next_word)\n", " sentences.append(words)\n", " words = seed_words.copy()\n", " counter += 1\n", " continue\n", "\n", " words.append(next_word)\n", " return sentences\n", "\n", " def perplexity(self, words, N):\n", " r\"\"\"\n", " Calculate the model perplexity on a sequence of words.\n", "\n", " Notes\n", " -----\n", " Perplexity, `PP`, is defined as\n", "\n", " .. math::\n", "\n", " PP(W) = \\left( \\frac{1}{p(W)} \\right)^{1 / n}\n", "\n", " or simply\n", "\n", " .. 
math::\n", "\n", " PP(W) &= \\exp(-\\log p(W) / n) \\\\\n", " &= \\exp(H(W))\n", "\n", " where :math:`W = [w_1, \\ldots, w_k]` is a sequence of words, `H(w)` is\n", " the cross-entropy of `W` under the current model, and `n` is the number\n", " of `N`-grams in `W`.\n", "\n", " Minimizing perplexity is equivalent to maximizing the probability of\n", " `words` under the `N`-gram model. It may also be interpreted as the\n", " average branching factor when predicting the next word under the\n", " language model.\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The gram-size of the model to calculate perplexity with.\n", " words : list or tuple of strings\n", " The sequence of words to compute perplexity on.\n", "\n", " Returns\n", " -------\n", " perplexity : float\n", " The model perlexity for the words in `words`.\n", " \"\"\"\n", " return np.exp(self.cross_entropy(words, N))\n", "\n", " def cross_entropy(self, words, N):\n", " r\"\"\"\n", " Calculate the model cross-entropy on a sequence of words against the\n", " empirical distribution of words in a sample.\n", "\n", " Notes\n", " -----\n", " Model cross-entropy, `H`, is defined as\n", "\n", " .. math::\n", "\n", " H(W) = -\\frac{\\log p(W)}{n}\n", "\n", " where :math:`W = [w_1, \\ldots, w_k]` is a sequence of words, and `n` is\n", " the number of `N`-grams in `W`.\n", "\n", " The model cross-entropy is proportional (not equal, since we use base\n", " `e`) to the average number of bits necessary to encode `W` under the\n", " model distribution.\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The gram-size of the model to calculate cross-entropy on.\n", " words : list or tuple of strings\n", " The sequence of words to compute cross-entropy on.\n", "\n", " Returns\n", " -------\n", " H : float\n", " The model cross-entropy for the words in `words`.\n", " \"\"\"\n", " n_ngrams = len(ngrams(words, N))\n", " return -(1 / n_ngrams) * self.log_prob(words, N)\n", "\n", " def _log_prob(self, words, N):\n", " \"\"\"\n", " Calculate the log probability of a sequence of words under the\n", " `N`-gram model\n", " \"\"\"\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", "\n", " if N > len(words):\n", " err = \"Not enough words for a gram-size of {}: {}\".format(N, len(words))\n", " raise ValueError(err)\n", "\n", " total_prob = 0\n", " for ngram in ngrams(words, N):\n", " total_prob += self._log_ngram_prob(ngram)\n", " return total_prob\n", "\n", " def _n_completions(self, words, N):\n", " \"\"\"\n", " Return the number of unique word tokens that could follow the sequence\n", " `words` under the *unsmoothed* `N`-gram language model.\n", " \"\"\"\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", " assert len(words) <= N - 1, \"Need > {} words to use {}-grams\".format(N - 2, N)\n", "\n", " if isinstance(words, list):\n", " words = tuple(words)\n", "\n", " base = words[-N + 1 :]\n", " return len([k[-1] for k in self.counts[N].keys() if k[:-1] == base])\n", "\n", " def _num_grams_with_count(self, C, N):\n", " \"\"\"\n", " Return the number of unique `N`-gram tokens that occur exactly `C`\n", " times\n", " \"\"\"\n", " assert C > 0\n", " assert N in self.counts, \"You do not have counts for {}-grams\".format(N)\n", " # cache count values for future calls\n", " if not hasattr(self, \"_NC\"):\n", " self._NC = {N: {} for N in range(1, self.N + 1)}\n", " if C not in self._NC[N]:\n", " self._NC[N][C] = len([k for k, v in self.counts[N].items() if v == C])\n", " return 
self._NC[N][C]\n", "\n", " @abstractmethod\n", " def log_prob(self, words, N):\n", " \"\"\"\n", " Compute the log probability of a sequence of words under the\n", " unsmoothed, maximum-likelihood `N`-gram language model.\n", " \"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def _log_ngram_prob(self, ngram):\n", " \"\"\"Return the unsmoothed log probability of the ngram\"\"\"\n", " raise NotImplementedError\n", "\n", "\n", "class MLENGram(NGramBase):\n", " def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True):\n", " \"\"\"\n", " A simple, unsmoothed N-gram model.\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The maximum length (in words) of the context-window to use in the\n", " langauge model. Model will compute all n-grams from 1, ..., N.\n", " unk : bool\n", " Whether to include the ```` (unknown) token in the LM. Default\n", " is True.\n", " filter_stopwords : bool\n", " Whether to remove stopwords before training. Default is True.\n", " filter_punctuation : bool\n", " Whether to remove punctuation before training. Default is True.\n", " \"\"\"\n", " super().__init__(N, unk, filter_stopwords, filter_punctuation)\n", "\n", " self.hyperparameters[\"id\"] = \"MLENGram\"\n", "\n", " def log_prob(self, words, N):\n", " \"\"\"\n", " Compute the log probability of a sequence of words under the\n", " unsmoothed, maximum-likelihood `N`-gram language model.\n", "\n", " Parameters\n", " ----------\n", " words : list of strings\n", " A sequence of words\n", " N : int\n", " The gram-size of the language model to use when calculating the log\n", " probabilities of the sequence\n", "\n", " Returns\n", " -------\n", " total_prob : float\n", " The total log-probability of the sequence `words` under the\n", " `N`-gram language model\n", " \"\"\"\n", " return self._log_prob(words, N)\n", "\n", " def _log_ngram_prob(self, ngram):\n", " \"\"\"Return the unsmoothed log probability of the ngram\"\"\"\n", " N = len(ngram)\n", " num = self.counts[N][ngram]\n", " den = self.counts[N - 1][ngram[:-1]] if N > 1 else self.n_words[1]\n", " return np.log(num) - np.log(den) if (den > 0 and num > 0) else -np.inf\n", "\n", "\n", "class AdditiveNGram(NGramBase):\n", " def __init__(\n", " self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True,\n", " ):\n", " \"\"\"\n", " An N-Gram model with smoothed probabilities calculated via additive /\n", " Lidstone smoothing.\n", "\n", " Notes\n", " -----\n", " The resulting estimates correspond to the expected value of the\n", " posterior, `p(ngram_prob | counts)`, when using a symmetric Dirichlet\n", " prior on counts with parameter `K`.\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The maximum length (in words) of the context-window to use in the\n", " langauge model. Model will compute all n-grams from 1, ..., N\n", " K : float\n", " The pseudocount to add to each observation. Larger values allocate\n", " more probability toward unseen events. When `K` = 1, the model is\n", " known as Laplace smoothing. When `K` = 0.5, the model is known as\n", " expected likelihood estimation (ELE) or the Jeffreys-Perks law.\n", " Default is 1.\n", " unk : bool\n", " Whether to include the ```` (unknown) token in the LM. Default\n", " is True.\n", " filter_stopwords : bool\n", " Whether to remove stopwords before training. Default is True.\n", " filter_punctuation : bool\n", " Whether to remove punctuation before training. 
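# ---------------------------------------------------------------------
# Standalone numeric sketch (toy counts, illustration only) of the
# Lidstone estimate that `AdditiveNGram` produces: with pseudocount K
# and vocabulary size V, a bigram probability becomes
#     P(w_i | w_{i-1}) = (Count(w_{i-1}, w_i) + K) / (Count(w_{i-1}) + K * V)
import numpy as np

K = 1.0             # Laplace smoothing
V = 1000            # vocabulary size
bigram_count = 0    # an unseen bigram ...
context_count = 50  # ... in a moderately frequent context

p_unsmoothed = bigram_count / context_count              # 0.0
p_smoothed = (bigram_count + K) / (context_count + K * V)
print(p_unsmoothed, p_smoothed)  # 0.0 vs ~0.00095: unseen events get some mass
log_p = np.log(p_smoothed)       # the quantity `_log_ngram_prob` works with
# ---------------------------------------------------------------------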
Default is True.\n", " \"\"\"\n", " super().__init__(N, unk, filter_stopwords, filter_punctuation)\n", "\n", " self.hyperparameters[\"id\"] = \"AdditiveNGram\"\n", " self.hyperparameters[\"K\"] = K\n", "\n", " def log_prob(self, words, N):\n", " r\"\"\"\n", " Compute the smoothed log probability of a sequence of words under the\n", " `N`-gram language model with additive smoothing.\n", "\n", " Notes\n", " -----\n", " For a bigram, additive smoothing amounts to:\n", "\n", " .. math::\n", "\n", " P(w_i \\mid w_{i-1}) = \\frac{A + K}{B + KV}\n", "\n", " where\n", "\n", " .. math::\n", "\n", " A &= \\text{Count}(w_{i-1}, w_i) \\\\\n", " B &= \\sum_j \\text{Count}(w_{i-1}, w_j) \\\\\n", " V &= |\\{ w_j \\ : \\ \\text{Count}(w_{i-1}, w_j) > 0 \\}|\n", "\n", " This is equivalent to pretending we've seen every possible `N`-gram\n", " sequence at least `K` times.\n", "\n", " Additive smoothing can be problematic, as it:\n", " - Treats each predicted word in the same way\n", " - Can assign too much probability mass to unseen `N`-grams\n", "\n", " Parameters\n", " ----------\n", " words : list of strings\n", " A sequence of words.\n", " N : int\n", " The gram-size of the language model to use when calculating the log\n", " probabilities of the sequence.\n", "\n", " Returns\n", " -------\n", " total_prob : float\n", " The total log-probability of the sequence `words` under the\n", " `N`-gram language model.\n", " \"\"\"\n", " return self._log_prob(words, N)\n", "\n", " def _log_ngram_prob(self, ngram):\n", " \"\"\"Return the smoothed log probability of the ngram\"\"\"\n", " N = len(ngram)\n", " K = self.hyperparameters[\"K\"]\n", " counts, n_words, n_tokens = self.counts, self.n_words[1], self.n_tokens[1]\n", "\n", " ctx = ngram[:-1]\n", " num = counts[N][ngram] + K\n", " ctx_count = counts[N - 1][ctx] if N > 1 else n_words\n", " den = ctx_count + K * n_tokens\n", " return np.log(num / den) if den != 0 else -np.inf\n", "\n", "\n", "class GoodTuringNGram(NGramBase):\n", " def __init__(\n", " self, N, conf=1.96, unk=True, filter_stopwords=True, filter_punctuation=True,\n", " ):\n", " \"\"\"\n", " An N-Gram model with smoothed probabilities calculated with the simple\n", " Good-Turing estimator from Gale (2001).\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The maximum length (in words) of the context-window to use in the\n", " langauge model. Model will compute all n-grams from 1, ..., N.\n", " conf: float\n", " The multiplier of the standard deviation of the empirical smoothed\n", " count (the default, 1.96, corresponds to a 95% confidence\n", " interval). Controls how many datapoints are smoothed using the\n", " log-linear model.\n", " unk : bool\n", " Whether to include the ```` (unknown) token in the LM. Default\n", " is True.\n", " filter_stopwords : bool\n", " Whether to remove stopwords before training. Default is True.\n", " filter_punctuation : bool\n", " Whether to remove punctuation before training. Default is True.\n", " \"\"\"\n", " super().__init__(N, unk, filter_stopwords, filter_punctuation)\n", "\n", " self.hyperparameters[\"id\"] = \"GoodTuringNGram\"\n", " self.hyperparameters[\"conf\"] = conf\n", "\n", " def train(self, corpus_fp, vocab=None, encoding=None):\n", " \"\"\"\n", " Compile the n-gram counts for the text(s) in `corpus_fp`. 
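# ---------------------------------------------------------------------
# Standalone numeric sketch (toy count-of-counts, illustration only) of
# the simple Good-Turing idea used by `GoodTuringNGram`: an n-gram seen
# c times is re-estimated as
#     c* = (c + 1) * N_{c+1} / N_c
# where N_r is the number of distinct n-grams seen exactly r times, and
# the mass reserved for unseen n-grams is N_1 / (total n-gram tokens).
NC = {1: 120, 2: 40, 3: 15}                        # toy count-of-counts table
total_ngrams = sum(r * n for r, n in NC.items())   # 120 + 80 + 45 = 245

c_star_1 = 2 * NC[2] / NC[1]      # ~0.67: singletons are discounted
c_star_2 = 3 * NC[3] / NC[2]      # 1.125
p_unseen = NC[1] / total_ngrams   # ~0.49 of the mass goes to unseen n-grams
print(c_star_1, c_star_2, p_unseen)
# ---------------------------------------------------------------------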
Upon\n", " completion the `self.counts` attribute will store dictionaries of the\n", " `N`, `N-1`, ..., 1-gram counts.\n", "\n", " Parameters\n", " ----------\n", " corpus_fp : str\n", " The path to a newline-separated text corpus file\n", " vocab : :class:`~numpy_ml.preprocessing.nlp.Vocabulary` instance or None.\n", " If not None, only the words in `vocab` will be used to construct\n", " the language model; all out-of-vocabulary words will either be\n", " mappend to ```` (if ``self.unk = True``) or removed (if\n", " ``self.unk = False``). Default is None.\n", " encoding : str or None\n", " Specifies the text encoding for corpus. Common entries are 'utf-8',\n", " 'utf-8-sig', 'utf-16'. Default is None.\n", " \"\"\"\n", " self._train(corpus_fp, vocab=vocab, encoding=encoding)\n", " self._calc_smoothed_counts()\n", "\n", " def log_prob(self, words, N):\n", " r\"\"\"\n", " Compute the smoothed log probability of a sequence of words under the\n", " `N`-gram language model with Good-Turing smoothing.\n", "\n", " Notes\n", " -----\n", " For a bigram, Good-Turing smoothing amounts to:\n", "\n", " .. math::\n", "\n", " P(w_i \\mid w_{i-1}) = \\frac{C^*}{\\text{Count}(w_{i-1})}\n", "\n", " where :math:`C^*` is the Good-Turing smoothed estimate of the bigram\n", " count:\n", "\n", " .. math::\n", "\n", " C^* = \\frac{(c + 1) \\text{NumCounts}(c + 1, 2)}{\\text{NumCounts}(c, 2)}\n", "\n", " where\n", "\n", " .. math::\n", "\n", " c &= \\text{Count}(w_{i-1}, w_i) \\\\\n", " \\text{NumCounts}(r, k) &=\n", " |\\{ k\\text{-gram} : \\text{Count}(k\\text{-gram}) = r \\}|\n", "\n", " In words, the probability of an `N`-gram that occurs `r` times in the\n", " corpus is estimated by dividing up the probability mass occupied by\n", " N-grams that occur `r+1` times.\n", "\n", " For large values of `r`, NumCounts becomes unreliable. In this case, we\n", " compute a smoothed version of NumCounts using a power law function:\n", "\n", " .. 
math::\n", "\n", " \\log \\text{NumCounts}(r) = b + a \\log r\n", "\n", " Under the Good-Turing estimator, the total probability assigned to\n", " unseen `N`-grams is equal to the relative occurrence of `N`-grams that\n", " appear only once.\n", "\n", " Parameters\n", " ----------\n", " words : list of strings\n", " A sequence of words.\n", " N : int\n", " The gram-size of the language model to use when calculating the log\n", " probabilities of the sequence.\n", "\n", " Returns\n", " -------\n", " total_prob : float\n", " The total log-probability of the sequence `words` under the\n", " `N`-gram language model.\n", " \"\"\"\n", " return self._log_prob(words, N)\n", "\n", " def _calc_smoothed_counts(self):\n", " use_interp = False\n", " counts = self.counts\n", " NC = self._num_grams_with_count\n", " conf = self.hyperparameters[\"conf\"]\n", "\n", " totals = {N: 0 for N in range(1, self.N + 1)}\n", " smooth_counts = {N: {} for N in range(1, self.N + 1)}\n", "\n", " # calculate the probability of all (i.e., unseen) n-grams\n", " self._p0 = {n: NC(1, n) / sum(counts[n].values()) for n in range(1, self.N + 1)}\n", "\n", " # fit log-linear models for predicting smoothed counts in absence of\n", " # real data\n", " self._fit_count_models()\n", "\n", " LM = self._count_models\n", " for N in range(1, self.N + 1):\n", " for C in sorted(set(counts[N].values())):\n", "\n", " # estimate the interpolated count using the log-linear model\n", " c1_lm = np.exp(LM[N].predict(np.c_[np.log(C + 1)])).item()\n", " c0_lm = np.exp(LM[N].predict(np.c_[np.log(C)])).item()\n", " count_interp = ((C + 1) * c1_lm) / c0_lm\n", "\n", " # if we have previously been using the interpolated count, or\n", " # if the number of ocurrences of C+1 is 0, use the interpolated\n", " # count as the smoothed count value C*\n", " c1, c0 = NC(C + 1, N), NC(C, N)\n", " if use_interp or c1 == 0:\n", " use_interp = True\n", " smooth_counts[N][C] = count_interp\n", " totals[N] += c0 * smooth_counts[N][C]\n", " continue\n", "\n", " # estimate the smoothed count C* empirically if the number of\n", " # terms with count C + 1 > 0\n", " count_emp = ((C + 1) * c1) / c0\n", "\n", " # compute the approximate variance of the empirical smoothed\n", " # count C* given C\n", " t = conf * np.sqrt((C + 1) ** 2 * (c1 / c0 ** 2) * (1 + c1 / c0))\n", "\n", " # if the difference between the empirical and interpolated\n", " # smoothed counts is greater than t, the empirical estimate\n", " # tends to be more accurate. otherwise, use interpolated\n", " if np.abs(count_interp - count_emp) > t:\n", " smooth_counts[N][C] = count_emp\n", " totals[N] += c0 * smooth_counts[N][C]\n", " continue\n", "\n", " use_interp = True\n", " smooth_counts[N][C] = count_interp\n", " totals[N] += c0 * smooth_counts[N][C]\n", "\n", " self._smooth_totals = totals\n", " self._smooth_counts = smooth_counts\n", "\n", " def _log_ngram_prob(self, ngram):\n", " \"\"\"Return the smoothed log probability of the ngram\"\"\"\n", " N = len(ngram)\n", " sc, T = self._smooth_counts[N], self._smooth_totals[N]\n", " n_tokens, n_seen = self.n_tokens[N], len(self.counts[N])\n", "\n", " # approx. 
prob of an out-of-vocab ngram (i.e., a fraction of p0)\n", " n_unseen = max((n_tokens ** N) - n_seen, 1)\n", " prob = np.log(self._p0[N] / n_unseen)\n", "\n", " if ngram in self.counts[N]:\n", " C = self.counts[N][ngram]\n", " prob = np.log(1 - self._p0[N]) + np.log(sc[C]) - np.log(T)\n", " return prob\n", "\n", " def _fit_count_models(self):\n", " \"\"\"\n", " Perform the averaging transform proposed by Church and Gale (1991):\n", " estimate the expected count-of-counts by the *density* of\n", " count-of-count values.\n", " \"\"\"\n", " self._count_models = {}\n", " NC = self._num_grams_with_count\n", " for N in range(1, self.N + 1):\n", " X, Y = [], []\n", " sorted_counts = sorted(set(self.counts[N].values())) # r\n", "\n", " for ix, j in enumerate(sorted_counts):\n", " i = 0 if ix == 0 else sorted_counts[ix - 1]\n", " k = 2 * j - i if ix == len(sorted_counts) - 1 else sorted_counts[ix + 1]\n", " y = 2 * NC(j, N) / (k - i)\n", " X.append(j)\n", " Y.append(y)\n", "\n", " # fit log-linear model: log(counts) ~ log(average_transform(counts))\n", " self._count_models[N] = LinearRegression(fit_intercept=True)\n", " self._count_models[N].fit(np.log(X), np.log(Y))\n", " b, a = self._count_models[N].beta\n", "\n", " if a > -1:\n", " fstr = \"[Warning] Log-log averaging transform has slope > -1 for N={}\"\n", " print(fstr.format(N))\n"]} {"path": "numpy_ml/ngram/__init__.py", "content": ["from .ngram import *\n"]} {"path": "numpy_ml/rl_models/__init__.py", "content": ["from . import rl_utils\n", "from . import agents\n", "from . import trainer\n", "from . import tiles\n"]} {"path": "numpy_ml/rl_models/agents.py", "content": ["\"\"\"Reinforcement learning agents that can be run on OpenAI gym environs\"\"\"\n", "\n", "from abc import ABC, abstractmethod\n", "from collections import defaultdict\n", "\n", "import numpy as np\n", "\n", "from .rl_utils import EnvModel, env_stats, tile_state_space\n", "from ..utils.data_structures import Dict\n", "\n", "\n", "class AgentBase(ABC):\n", " def __init__(self, env):\n", " super().__init__()\n", " self.env = env\n", " self.parameters = {}\n", " self.hyperparameters = {}\n", " self.derived_variables = {}\n", " self.env_info = env_stats(env)\n", "\n", " def _create_2num_dicts(self, obs_encoder=None, act_encoder=None):\n", " E = self.env_info\n", " n_states = np.prod(E[\"n_obs_per_dim\"])\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " # create action -> scalar dictionaries\n", " self._num2action = Dict()\n", " self._action2num = Dict(act_encoder)\n", " if n_actions != np.inf:\n", " self._action2num = {act: i for i, act in enumerate(E[\"action_ids\"])}\n", " self._num2action = {i: act for act, i in self._action2num.items()}\n", "\n", " # create obs -> scalar dictionaries\n", " self._num2obs = Dict()\n", " self._obs2num = Dict(obs_encoder)\n", " if n_states != np.inf:\n", " self._obs2num = {act: i for i, act in enumerate(E[\"obs_ids\"])}\n", " self._num2obs = {i: act for act, i in self._obs2num.items()}\n", "\n", " def flush_history(self):\n", " \"\"\"Clear the episode history\"\"\"\n", " for k, v in self.episode_history.items():\n", " self.episode_history[k] = []\n", "\n", " @abstractmethod\n", " def act(self, obs):\n", " \"\"\"Generate an action given the current observation\"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def greedy_policy(self, **kwargs):\n", " \"\"\"\n", " Take a greedy action.\n", "\n", " Returns\n", " -------\n", " total_reward : float\n", " The total reward on the episode.\n", " n_steps : float\n", " 
The total number of steps taken on the episode.\n", " \"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def run_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Run the agent on a single episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode\n", " render : bool\n", " Whether to render the episode during training\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode, averaged over the theta samples.\n", " steps : float\n", " The total number of steps taken on the episode, averaged over the\n", " theta samples.\n", " \"\"\"\n", " raise NotImplementedError\n", "\n", " @abstractmethod\n", " def update(self):\n", " r\"\"\"\n", " Update the agent parameters according to the rewards accrued on the\n", " current episode.\n", "\n", " Returns\n", " -------\n", " avg_reward : float\n", " The average reward earned by the best `retain_prcnt` theta samples\n", " on the current episode.\n", " \"\"\"\n", " raise NotImplementedError\n", "\n", "\n", "class CrossEntropyAgent(AgentBase):\n", " def __init__(self, env, n_samples_per_episode=500, retain_prcnt=0.2):\n", " r\"\"\"\n", " A cross-entropy method agent.\n", "\n", " Notes\n", " -----\n", " The cross-entropy method [1]_ [2]_ agent only operates on ``envs`` with\n", " discrete action spaces.\n", "\n", " On each episode the agent generates `n_theta_samples` of the parameters\n", " (:math:`\\theta`) for its behavior policy. The `i`'th sample at\n", " timestep `t` is:\n", "\n", " .. math::\n", "\n", " \\theta_i &= \\{\\mathbf{W}_i^{(t)}, \\mathbf{b}_i^{(t)} \\} \\\\\n", " \\theta_i &\\sim \\mathcal{N}(\\mu^{(t)}, \\Sigma^{(t)})\n", "\n", " Weights (:math:`\\mathbf{W}_i`) and bias (:math:`\\mathbf{b}_i`) are the\n", " parameters of the softmax policy:\n", "\n", " .. math::\n", "\n", " \\mathbf{z}_i &= \\text{obs} \\cdot \\mathbf{W}_i + \\mathbf{b}_i \\\\\n", " p(a_i^{(t + 1)}) &= \\frac{e^{\\mathbf{z}_i}}{\\sum_j e^{z_{ij}}} \\\\\n", " a^{(t + 1)} &= \\arg \\max_j p(a_j^{(t+1)})\n", "\n", " At the end of each episode, the agent takes the top `retain_prcnt`\n", " highest scoring :math:`\\theta` samples and combines them to generate\n", " the mean and variance of the distribution of :math:`\\theta` for the\n", " next episode:\n", "\n", " .. math::\n", "\n", " \\mu^{(t+1)} &= \\text{avg}(\\texttt{best_thetas}^{(t)}) \\\\\n", " \\Sigma^{(t+1)} &= \\text{var}(\\texttt{best_thetas}^{(t)})\n", "\n", " References\n", " ----------\n", " .. [1] Mannor, S., Rubinstein, R., & Gat, Y. (2003). The cross entropy\n", " method for fast policy search. In *Proceedings of the 20th Annual\n", " ICML, 20*.\n", " .. [2] Rubinstein, R. (1997). optimization of computer simulation\n", " models with rare events, *European Journal of Operational Research,\n", " 99*, 89\u2013112.\n", "\n", " Parameters\n", " ----------\n", " env : :meth:`gym.wrappers` or :meth:`gym.envs` instance\n", " The environment to run the agent on.\n", " n_samples_per_episode : int\n", " The number of theta samples to evaluate on each episode. Default is 500.\n", " retain_prcnt: float\n", " The percentage of `n_samples_per_episode` to use when calculating\n", " the parameter update at the end of the episode. 
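"        # A minimal sketch of a single cross-entropy-method iteration in plain\n",
"        # NumPy (illustrative only, not part of the class API). It assumes a\n",
"        # hypothetical `score(theta)` function returning the episode reward for a\n",
"        # flat parameter vector, plus assumed sizes `D`, `n_samples`, `n_retain`:\n",
"        #\n",
"        #   mu, var = np.zeros(D), np.ones(D)\n",
"        #   thetas = np.random.multivariate_normal(mu, np.diag(var), n_samples)\n",
"        #   rewards = np.array([score(t) for t in thetas])\n",
"        #   best = thetas[np.argsort(rewards)[::-1][:n_retain]]\n",
"        #   mu, var = best.mean(axis=0), best.var(axis=0)   # params for next episode\n",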
Default is 0.2.\n", " \"\"\"\n", " super().__init__(env)\n", "\n", " self.retain_prcnt = retain_prcnt\n", " self.n_samples_per_episode = n_samples_per_episode\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " E = self.env_info\n", " assert not E[\"continuous_actions\"], \"Action space must be discrete\"\n", "\n", " self._create_2num_dicts()\n", " b_len = np.prod(E[\"n_actions_per_dim\"])\n", " W_len = b_len * np.prod(E[\"obs_dim\"])\n", " theta_dim = b_len + W_len\n", "\n", " # init mean and variance for mv gaussian with dimensions theta_dim\n", " theta_mean = np.random.rand(theta_dim)\n", " theta_var = np.ones(theta_dim)\n", "\n", " self.parameters = {\"theta_mean\": theta_mean, \"theta_var\": theta_var}\n", " self.derived_variables = {\n", " \"b_len\": b_len,\n", " \"W_len\": W_len,\n", " \"W_samples\": [],\n", " \"b_samples\": [],\n", " \"episode_num\": 0,\n", " \"cumulative_rewards\": [],\n", " }\n", "\n", " self.hyperparameters = {\n", " \"agent\": \"CrossEntropyAgent\",\n", " \"retain_prcnt\": self.retain_prcnt,\n", " \"n_samples_per_episode\": self.n_samples_per_episode,\n", " }\n", "\n", " self.episode_history = {\"rewards\": [], \"state_actions\": []}\n", "\n", " def act(self, obs):\n", " r\"\"\"\n", " Generate actions according to a softmax policy.\n", "\n", " Notes\n", " -----\n", " The softmax policy assumes that the pmf over actions in state :math:`x_t` is\n", " given by:\n", "\n", " .. math::\n", "\n", " \\pi(a | x^{(t)}) = \\text{softmax}(\n", " \\text{obs}^{(t)} \\cdot \\mathbf{W}_i^{(t)} + \\mathbf{b}_i^{(t)} )\n", "\n", " where :math:`\\mathbf{W}` is a learned weight matrix, `obs` is the observation\n", " at timestep `t`, and **b** is a learned bias vector.\n", "\n", " Parameters\n", " ----------\n", " obs : int or :py:class:`ndarray `\n", " An observation from the environment.\n", "\n", " Returns\n", " -------\n", " action : int, float, or :py:class:`ndarray `\n", " An action sampled from the distribution over actions defined by the\n", " softmax policy.\n", " \"\"\"\n", " E, P = self.env_info, self.parameters\n", " W, b = P[\"W\"], P[\"b\"]\n", "\n", " s = self._obs2num[obs]\n", " s = np.array([s]) if E[\"obs_dim\"] == 1 else s\n", "\n", " # compute softmax\n", " Z = s.T @ W + b\n", " e_Z = np.exp(Z - np.max(Z, axis=-1, keepdims=True))\n", " action_probs = e_Z / e_Z.sum(axis=-1, keepdims=True)\n", "\n", " # sample action\n", " a = np.random.multinomial(1, action_probs).argmax()\n", " return self._num2action[a]\n", "\n", " def run_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Run the agent on a single episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode\n", " render : bool\n", " Whether to render the episode during training\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode, averaged over the theta samples.\n", " steps : float\n", " The total number of steps taken on the episode, averaged over the\n", " theta samples.\n", " \"\"\"\n", " self._sample_thetas()\n", "\n", " E, D = self.env_info, self.derived_variables\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", " W_len, obs_dim = D[\"W_len\"], E[\"obs_dim\"]\n", " steps, rewards = [], []\n", "\n", " for theta in D[\"theta_samples\"]:\n", " W = theta[:W_len].reshape(obs_dim, n_actions)\n", " b = theta[W_len:]\n", "\n", " total_rwd, n_steps = self._episode(W, b, max_steps, render)\n", " rewards.append(total_rwd)\n", " steps.append(n_steps)\n", "\n", " # return the average 
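"        # A minimal sketch of the softmax policy used by `act` above, written as a\n",
"        # standalone helper (illustrative only); `obs_vec`, `W`, and `b` are assumed\n",
"        # to be a 1D observation vector and its policy parameters:\n",
"        #\n",
"        #   def softmax_action(obs_vec, W, b):\n",
"        #       z = obs_vec @ W + b\n",
"        #       p = np.exp(z - z.max())\n",
"        #       p /= p.sum()\n",
"        #       return np.random.multinomial(1, p).argmax()\n",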
reward and average number of steps across all\n", " # samples on the current episode\n", " D[\"episode_num\"] += 1\n", " D[\"cumulative_rewards\"] = rewards\n", " return np.mean(D[\"cumulative_rewards\"]), np.mean(steps)\n", "\n", " def _episode(self, W, b, max_steps, render):\n", " \"\"\"\n", " Run the agent for an episode.\n", "\n", " Parameters\n", " ----------\n", " W : :py:class:`ndarray ` of shape `(obs_dim, n_actions)`\n", " The weights for the softmax policy.\n", " b : :py:class:`ndarray ` of shape `(bias_len, )`\n", " The bias for the softmax policy.\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during training.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The total number of steps taken on the episode.\n", " \"\"\"\n", " rwds, sa = [], []\n", " H = self.episode_history\n", " total_reward, n_steps = 0.0, 1\n", " obs = self.env.reset()\n", "\n", " self.parameters[\"W\"] = W\n", " self.parameters[\"b\"] = b\n", "\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " n_steps += 1\n", " action = self.act(obs)\n", " s, a = self._obs2num[obs], self._action2num[action]\n", " sa.append((s, a))\n", "\n", " obs, reward, done, _ = self.env.step(action)\n", " rwds.append(reward)\n", " total_reward += reward\n", "\n", " if done:\n", " break\n", "\n", " H[\"rewards\"].append(rwds)\n", " H[\"state_actions\"].append(sa)\n", " return total_reward, n_steps\n", "\n", " def update(self):\n", " r\"\"\"\n", " Update :math:`\\mu` and :math:`\\Sigma` according to the rewards accrued on\n", " the current episode.\n", "\n", " Returns\n", " -------\n", " avg_reward : float\n", " The average reward earned by the best `retain_prcnt` theta samples\n", " on the current episode.\n", " \"\"\"\n", " D, P = self.derived_variables, self.parameters\n", " n_retain = int(self.retain_prcnt * self.n_samples_per_episode)\n", "\n", " # sort the cumulative rewards for each theta sample from greatest to least\n", " sorted_y_val_idxs = np.argsort(D[\"cumulative_rewards\"])[::-1]\n", " top_idxs = sorted_y_val_idxs[:n_retain]\n", "\n", " # update theta_mean and theta_var with the best theta value\n", " P[\"theta_mean\"] = np.mean(D[\"theta_samples\"][top_idxs], axis=0)\n", " P[\"theta_var\"] = np.var(D[\"theta_samples\"][top_idxs], axis=0)\n", "\n", " def _sample_thetas(self):\n", " \"\"\"\n", " Sample `n_samples_per_episode` thetas from a multivariate Gaussian with\n", " mean `theta_mean` and covariance `diag(theta_var)`\n", " \"\"\"\n", " P, N = self.parameters, self.n_samples_per_episode\n", " Mu, Sigma = P[\"theta_mean\"], np.diag(P[\"theta_var\"])\n", " samples = np.random.multivariate_normal(Mu, Sigma, N)\n", " self.derived_variables[\"theta_samples\"] = samples\n", "\n", " def greedy_policy(self, max_steps, render=True):\n", " \"\"\"\n", " Execute a greedy policy using the current agent parameters.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during execution.\n", "\n", " Returns\n", " -------\n", " total_reward : float\n", " The total reward on the episode.\n", " n_steps : float\n", " The total number of steps taken on the episode.\n", " \"\"\"\n", " E, D, P = self.env_info, self.derived_variables, self.parameters\n", " Mu, Sigma = P[\"theta_mean\"], np.diag(P[\"theta_var\"])\n", " sample = 
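"        # A hypothetical usage sketch (the environment name is an assumption; any\n",
"        # gym env with a discrete action space applies):\n",
"        #\n",
"        #   import gym\n",
"        #   env = gym.make('LunarLander-v2')\n",
"        #   agent = CrossEntropyAgent(env, n_samples_per_episode=50, retain_prcnt=0.2)\n",
"        #   for _ in range(10):\n",
"        #       avg_reward, avg_steps = agent.run_episode(max_steps=300)\n",
"        #       agent.update()\n",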
np.random.multivariate_normal(Mu, Sigma, 1)\n", "\n", " W_len, obs_dim = D[\"W_len\"], E[\"obs_dim\"]\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " W = sample[0, :W_len].reshape(obs_dim, n_actions)\n", " b = sample[0, W_len:]\n", " total_reward, n_steps = self._episode(W, b, max_steps, render)\n", " return total_reward, n_steps\n", "\n", "\n", "class MonteCarloAgent(AgentBase):\n", " def __init__(self, env, off_policy=False, temporal_discount=0.9, epsilon=0.1):\n", " \"\"\"\n", " A Monte-Carlo learning agent trained using either first-visit Monte\n", " Carlo updates (on-policy) or incremental weighted importance sampling\n", " (off-policy).\n", "\n", " Parameters\n", " ----------\n", " env : :class:`gym.wrappers` or :class:`gym.envs` instance\n", " The environment to run the agent on.\n", " off_policy : bool\n", " Whether to use a behavior policy separate from the target policy\n", " during training. If False, use the same epsilon-soft policy for\n", " both behavior and target policies. Default is False.\n", " temporal_discount : float between [0, 1]\n", " The discount factor used for downweighting future rewards. Smaller\n", " values result in greater discounting of future rewards. Default is\n", " 0.9.\n", " epsilon : float between [0, 1]\n", " The epsilon value in the epsilon-soft policy. Larger values\n", " encourage greater exploration during training. Default is 0.1.\n", " \"\"\"\n", " super().__init__(env)\n", "\n", " self.epsilon = epsilon\n", " self.off_policy = off_policy\n", " self.temporal_discount = temporal_discount\n", "\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " E = self.env_info\n", " assert not E[\"continuous_actions\"], \"Action space must be discrete\"\n", " assert not E[\"continuous_observations\"], \"Observation space must be discrete\"\n", "\n", " n_states = np.prod(E[\"n_obs_per_dim\"])\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " self._create_2num_dicts()\n", "\n", " # behavior policy is stochastic, epsilon-soft policy\n", " self.behavior_policy = self.target_policy = self._epsilon_soft_policy\n", " if self.off_policy:\n", " self.parameters[\"C\"] = np.zeros((n_states, n_actions))\n", "\n", " # target policy is deterministic, greedy policy\n", " self.target_policy = self._greedy\n", "\n", " # initialize Q function\n", " self.parameters[\"Q\"] = np.random.rand(n_states, n_actions)\n", "\n", " # initialize returns object for each state-action pair\n", " self.derived_variables = {\n", " \"returns\": {(s, a): [] for s in range(n_states) for a in range(n_actions)},\n", " \"episode_num\": 0,\n", " }\n", "\n", " self.hyperparameters = {\n", " \"agent\": \"MonteCarloAgent\",\n", " \"epsilon\": self.epsilon,\n", " \"off_policy\": self.off_policy,\n", " \"temporal_discount\": self.temporal_discount,\n", " }\n", "\n", " self.episode_history = {\"state_actions\": [], \"rewards\": []}\n", "\n", " def _epsilon_soft_policy(self, s, a=None):\n", " r\"\"\"\n", " Epsilon-soft exploration policy.\n", "\n", " Notes\n", " -----\n", " Soft policies are necessary for first-visit Monte Carlo methods, as\n", " they require continual exploration (i.e., each state-action pair must\n", " have nonzero probability of occurring).\n", "\n", " In epsilon-soft policies, :math:`\\pi(a \\mid s) > 0` for all :math:`s\n", " \\in S` and all :math:`a \\in A(s)` at the start of training. 
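"        # A worked toy example of the epsilon-soft probabilities described here,\n",
"        # assuming 4 actions, epsilon = 0.1, and a Q row `Q_row` for the state:\n",
"        #\n",
"        #   Q_row = np.array([0.1, 0.5, 0.2, 0.2])\n",
"        #   probs = np.full(4, 0.1 / 4)\n",
"        #   probs[Q_row.argmax()] = 1 - 0.1 + 0.1 / 4\n",
"        #   # probs -> [0.025, 0.925, 0.025, 0.025]; sums to 1\n",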
As learning\n", " progresses, :math:`pi` gradually shifts closer and closer to a\n", " deterministic optimal policy.\n", "\n", " In particular, we have:\n", "\n", " .. math::\n", "\n", " \\pi(a \\mid s) &=\n", " 1 - \\epsilon + \\frac{\\epsilon}{|A(s)|} &&\\text{if} a = a^*\n", " \\pi(a \\mid s) &=\n", " \\frac{\\epsilon}{|A(s)|} &&\\text{if} a \\neq a^*\n", "\n", " where :math:`|A(s)|` is the number of actions available in state `s`\n", " and :math:`a^* \\in A(s)` is the greedy action in state `s` (i.e.,\n", " :math:`a^* = \\arg \\max_a Q(s, a)`).\n", "\n", " Note that epsilon-greedy policies are instances of epsilon-soft\n", " policies, defined as policies for which :math:`\\pi(a|s) \\geq \\epsilon / |A(s)|`\n", " for all states and actions.\n", "\n", " Parameters\n", " ----------\n", " s : int, float, or tuple\n", " The state number for the current observation, as returned by\n", " ``_obs2num[obs]``.\n", " a : int, float, tuple, or None\n", " The action number in the current state, as returned by\n", " ``self._action2num[obs]``. If None, sample an action from the\n", " action probabilities in state `s`, otherwise, return the\n", " probability of action `a` under the epsilon-soft policy. Default is\n", " None.\n", "\n", " Returns\n", " -------\n", " action : int, float, or :py:class:`ndarray `\n", " If `a` is None, this is an action sampled from the distribution\n", " over actions defined by the epsilon-soft policy. If `a` is not\n", " None, this is the probability of `a` under the epsilon-soft policy.\n", " \"\"\"\n", " E, P = self.env_info, self.parameters\n", "\n", " # TODO: this assumes all actions are available in every state\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " a_star = P[\"Q\"][s, :].argmax()\n", " p_a_star = 1.0 - self.epsilon + (self.epsilon / n_actions)\n", " p_a = self.epsilon / n_actions\n", "\n", " action_probs = np.ones(n_actions) * p_a\n", " action_probs[a_star] = p_a_star\n", " np.testing.assert_allclose(np.sum(action_probs), 1)\n", "\n", " if a is not None:\n", " return action_probs[a]\n", "\n", " # sample action\n", " a = np.random.multinomial(1, action_probs).argmax()\n", " return self._num2action[a]\n", "\n", " def _greedy(self, s, a=None):\n", " \"\"\"\n", " A greedy behavior policy.\n", "\n", " Notes\n", " -----\n", " Only used when off-policy is True.\n", "\n", " Parameters\n", " ----------\n", " s : int, float, or tuple\n", " The state number for the current observation, as returned by\n", " ``self._obs2num[obs]``.\n", " a : int, float, or tuple\n", " The action number in the current state, as returned by\n", " ``self._action2num[obs]``. If None, sample an action from the action\n", " probabilities in state `s`, otherwise, return the probability of\n", " action `a` under the greedy policy. Default is None.\n", "\n", " Returns\n", " -------\n", " action : int, float, or :py:class:`ndarray `\n", " If `a` is None, this is an action sampled from the distribution\n", " over actions defined by the greedy policy. If `a` is not\n", " None, this is the probability of `a` under the greedy policy.\n", " \"\"\"\n", " a_star = self.parameters[\"Q\"][s, :].argmax()\n", " if a is None:\n", " out = self._num2action[a_star]\n", " else:\n", " out = 1 if a == a_star else 0\n", " return out\n", "\n", " def _on_policy_update(self):\n", " r\"\"\"\n", " Update the `Q` function using an on-policy first-visit Monte Carlo\n", " update.\n", "\n", " Notes\n", " -----\n", " The on-policy first-visit Monte Carlo update is\n", "\n", " .. 
math::\n", "\n", " Q'(s, a) \\leftarrow\n", " \\text{avg}(\\text{reward following first visit to } (s, a)\n", " \\text{ across all episodes})\n", "\n", " RL agents seek to learn action values conditional on subsequent optimal\n", " behavior, but they need to behave non-optimally in order to explore all\n", " actions (to find the optimal actions).\n", "\n", " The on-policy approach is a compromise -- it learns action values not\n", " for the optimal policy, but for a *near*-optimal policy that still\n", " explores (the epsilon-soft policy).\n", " \"\"\"\n", " D, P, HS = self.derived_variables, self.parameters, self.episode_history\n", "\n", " ep_rewards = HS[\"rewards\"]\n", " sa_tuples = set(HS[\"state_actions\"])\n", "\n", " locs = [HS[\"state_actions\"].index(sa) for sa in sa_tuples]\n", " cumulative_returns = [np.sum(ep_rewards[i:]) for i in locs]\n", "\n", " # update Q value with the average of the first-visit return across\n", " # episodes\n", " for (s, a), cr in zip(sa_tuples, cumulative_returns):\n", " D[\"returns\"][(s, a)].append(cr)\n", " P[\"Q\"][s, a] = np.mean(D[\"returns\"][(s, a)])\n", "\n", " def _off_policy_update(self):\n", " \"\"\"\n", " Update `Q` using weighted importance sampling.\n", "\n", " Notes\n", " -----\n", " In importance sampling updates, we account for the fact that we are\n", " updating a different policy from the one we used to generate behavior\n", " by weighting the accumulated rewards by the ratio of the probability of\n", " the trajectory under the target policy versus its probability under\n", " the behavior policies. This is known as the importance sampling weight.\n", "\n", " In weighted importance sampling, we scale the accumulated rewards for a\n", " trajectory by their importance sampling weight, then take the\n", " *weighted* average using the importance sampling weight. 
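"        # A minimal sketch of the backward recursion (illustrative only), assuming\n",
"        # `state_actions`, `rewards`, `gamma`, the tables `Q` and `C`, and the\n",
"        # probability functions `pi(s, a)` (target) and `b(s, a)` (behavior):\n",
"        #\n",
"        #   G, W = 0.0, 1.0\n",
"        #   for (s, a), r in zip(reversed(state_actions), reversed(rewards)):\n",
"        #       G = gamma * G + r\n",
"        #       C[s, a] += W\n",
"        #       Q[s, a] += (W / C[s, a]) * (G - Q[s, a])\n",
"        #       W *= pi(s, a) / b(s, a)   # importance ratio for the action taken\n",
"        #       if W == 0.0:\n",
"        #           break\n",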
This weighted\n", " average then becomes the value for the trajectory.\n", "\n", " W = importance sampling weight\n", " G_t = total discounted reward from time t until episode end\n", " C_n = sum of importance weights for the first n rewards\n", "\n", " This algorithm converges to Q* in the limit.\n", " \"\"\"\n", " P = self.parameters\n", " HS = self.episode_history\n", " ep_rewards = HS[\"rewards\"]\n", " T = len(ep_rewards)\n", "\n", " G, W = 0.0, 1.0\n", " for t in reversed(range(T)):\n", " s, a = HS[\"state_actions\"][t]\n", " G = self.temporal_discount * G + ep_rewards[t]\n", " P[\"C\"][s, a] += W\n", "\n", " # update Q(s, a) using weighted importance sampling\n", " P[\"Q\"][s, a] += (W / P[\"C\"][s, a]) * (G - P[\"Q\"][s, a])\n", "\n", " # multiply the importance sampling ratio by the current weight\n", " W *= self.target_policy(s, a) / self.behavior_policy(s, a)\n", "\n", " if W == 0.0:\n", " break\n", "\n", " def act(self, obs):\n", " r\"\"\"\n", " Execute the behavior policy--an :math:`\\epsilon`-soft policy used to\n", " generate actions during training.\n", "\n", " Parameters\n", " ----------\n", " obs : int, float, or :py:class:`ndarray ` as returned by ``env.step(action)``\n", " An observation from the environment.\n", "\n", " Returns\n", " -------\n", " action : int, float, or :py:class:`ndarray `\n", " An action sampled from the distribution over actions defined by the\n", " epsilon-soft policy.\n", " \"\"\" # noqa: E501\n", " s = self._obs2num[obs]\n", " return self.behavior_policy(s)\n", "\n", " def run_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Run the agent on a single episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode.\n", " render : bool\n", " Whether to render the episode during training.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " D = self.derived_variables\n", " total_rwd, n_steps = self._episode(max_steps, render)\n", "\n", " D[\"episode_num\"] += 1\n", " return total_rwd, n_steps\n", "\n", " def _episode(self, max_steps, render):\n", " \"\"\"\n", " Execute agent on an episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during training.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " obs = self.env.reset()\n", " HS = self.episode_history\n", " total_reward, n_steps = 0.0, 0\n", "\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " n_steps += 1\n", " action = self.act(obs)\n", "\n", " s = self._obs2num[obs]\n", " a = self._action2num[action]\n", "\n", " # store (state, action) tuple\n", " HS[\"state_actions\"].append((s, a))\n", "\n", " # take action\n", " obs, reward, done, info = self.env.step(action)\n", "\n", " # record rewards\n", " HS[\"rewards\"].append(reward)\n", " total_reward += reward\n", "\n", " if done:\n", " break\n", "\n", " return total_reward, n_steps\n", "\n", " def update(self):\n", " \"\"\"\n", " Update the parameters of the model following the completion of an\n", " episode. 
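"        # A hypothetical usage sketch (assumes a gym env with discrete observation\n",
"        # and action spaces, e.g. a tabular gridworld):\n",
"        #\n",
"        #   agent = MonteCarloAgent(env, off_policy=False, epsilon=0.1)\n",
"        #   for _ in range(500):\n",
"        #       agent.run_episode(max_steps=200)\n",
"        #       agent.update()   # first-visit MC backup, then the history is flushed\n",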
Flush the episode history after the update is complete.\n", " \"\"\"\n", " H = self.hyperparameters\n", " if H[\"off_policy\"]:\n", " self._off_policy_update()\n", " else:\n", " self._on_policy_update()\n", "\n", " self.flush_history()\n", "\n", " def greedy_policy(self, max_steps, render=True):\n", " \"\"\"\n", " Execute a greedy policy using the current agent parameters.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during execution.\n", "\n", " Returns\n", " -------\n", " total_reward : float\n", " The total reward on the episode.\n", " n_steps : float\n", " The total number of steps taken on the episode.\n", " \"\"\"\n", " H = self.episode_history\n", " obs = self.env.reset()\n", "\n", " total_reward, n_steps = 0.0, 0\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " n_steps += 1\n", " action = self._greedy(obs)\n", "\n", " s = self._obs2num[obs]\n", " a = self._action2num[action]\n", "\n", " # store (state, action) tuple\n", " H[\"state_actions\"].append((s, a))\n", "\n", " # take action\n", " obs, reward, done, info = self.env.step(action)\n", "\n", " # record rewards\n", " H[\"rewards\"].append(reward)\n", " total_reward += reward\n", "\n", " if done:\n", " break\n", "\n", " return total_reward, n_steps\n", "\n", "\n", "class TemporalDifferenceAgent(AgentBase):\n", " def __init__(\n", " self,\n", " env,\n", " lr=0.4,\n", " epsilon=0.1,\n", " n_tilings=8,\n", " obs_max=None,\n", " obs_min=None,\n", " grid_dims=[8, 8],\n", " off_policy=False,\n", " temporal_discount=0.99,\n", " ):\n", " r\"\"\"\n", " A temporal difference learning agent with expected SARSA (on-policy) [3]_ or\n", " TD(0) `Q`-learning (off-policy) [4]_ updates.\n", "\n", " Notes\n", " -----\n", " The expected SARSA on-policy TD(0) update is:\n", "\n", " .. math::\n", "\n", " Q(s, a) \\leftarrow Q(s, a) + \\eta \\left(\n", " r + \\gamma \\mathbb{E}_\\pi[Q(s', a') \\mid s'] - Q(s, a)\n", " \\right)\n", "\n", " and the TD(0) off-policy Q-learning upate is:\n", "\n", " .. math::\n", "\n", " Q(s, a) \\leftarrow Q(s, a) + \\eta (\n", " r + \\gamma \\max_a \\left\\{ Q(s', a) \\right\\} - Q(s, a)\n", " )\n", "\n", " where in each case we have taken action `a` in state `s`, received\n", " reward `r`, and transitioned into state :math:`s'`. In the above\n", " equations, :math:`\\eta` is a learning rate parameter, :math:`\\gamma` is\n", " a temporal discount factor, and :math:`\\mathbb{E}_\\pi[ Q[s', a'] \\mid\n", " s']` is the expected value under the current policy :math:`\\pi` of the\n", " Q function conditioned that we are in state :math:`s'`.\n", "\n", " Observe that the expected SARSA update can be used for both on- and\n", " off-policy methods. In an off-policy context, if the target policy is\n", " greedy and the expectation is taken wrt. the target policy then the\n", " expected SARSA update is exactly Q-learning.\n", "\n", " NB. For this implementation the agent requires a discrete action\n", " space, but will try to discretize the observation space via tiling if\n", " it is continuous.\n", "\n", " References\n", " ----------\n", " .. [3] Rummery, G. & Niranjan, M. (1994). *On-Line Q-learning Using\n", " Connectionist Systems*. Tech Report 166. Cambridge University\n", " Department of Engineering.\n", " .. [4] Watkins, C. (1989). Learning from delayed rewards. 
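"        # A worked toy comparison of the two TD(0) targets above (illustrative\n",
"        # only), assuming a reward `r`, discount `gamma`, next-state Q values\n",
"        # `Q_next`, and epsilon-soft next-state action probabilities `pi_probs`:\n",
"        #\n",
"        #   Q_next = np.array([1.0, 2.0, 0.5])\n",
"        #   pi_probs = np.array([0.05, 0.9, 0.05])    # greedy action gets most mass\n",
"        #   sarsa_target = r + gamma * (pi_probs * Q_next).sum()   # expected SARSA\n",
"        #   qlearn_target = r + gamma * Q_next.max()               # Q-learning\n",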
*PhD thesis,\n", " Cambridge University*.\n", "\n", " Parameters\n", " ----------\n", " env : gym.wrappers or gym.envs instance\n", " The environment to run the agent on.\n", " lr : float\n", " Learning rate for the Q function updates. Default is 0.05.\n", " epsilon : float between [0, 1]\n", " The epsilon value in the epsilon-soft policy. Larger values\n", " encourage greater exploration during training. Default is 0.1.\n", " n_tilings : int\n", " The number of overlapping tilings to use if the ``env`` observation\n", " space is continuous. Unused if observation space is discrete.\n", " Default is 8.\n", " obs_max : float or :py:class:`ndarray `\n", " The value to treat as the max value of the observation space when\n", " calculating the grid widths if the observation space is continuous.\n", " If None, use ``env.observation_space.high``. Unused if observation\n", " space is discrete. Default is None.\n", " obs_min : float or :py:class:`ndarray `\n", " The value to treat as the min value of the observation space when\n", " calculating grid widths if the observation space is continuous. If\n", " None, use ``env.observation_space.low``. Unused if observation\n", " space is discrete. Default is None.\n", " grid_dims : list\n", " The number of rows and columns in each tiling grid if the env\n", " observation space is continuous. Unused if observation space is\n", " discrete. Default is [8, 8].\n", " off_policy : bool\n", " Whether to use a behavior policy separate from the target policy\n", " during training. If False, use the same epsilon-soft policy for\n", " both behavior and target policies. Default is False.\n", " temporal_discount : float between [0, 1]\n", " The discount factor used for downweighting future rewards. Smaller\n", " values result in greater discounting of future rewards. 
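"        # A hypothetical usage sketch (the environment name is an assumption; any\n",
"        # gym env with a discrete action space works, and continuous observations\n",
"        # are discretized via tiling):\n",
"        #\n",
"        #   import gym\n",
"        #   env = gym.make('MountainCar-v0')\n",
"        #   agent = TemporalDifferenceAgent(env, lr=0.4, n_tilings=8, grid_dims=[8, 8])\n",
"        #   for _ in range(100):\n",
"        #       agent.train_episode(max_steps=200)   # online expected-SARSA updates\n",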
Default is\n", " 0.9.\n", " \"\"\"\n", " super().__init__(env)\n", "\n", " self.lr = lr\n", " self.obs_max = obs_max\n", " self.obs_min = obs_min\n", " self.epsilon = epsilon\n", " self.n_tilings = n_tilings\n", " self.grid_dims = grid_dims\n", " self.off_policy = off_policy\n", " self.temporal_discount = temporal_discount\n", "\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " E = self.env_info\n", " assert not E[\"continuous_actions\"], \"Action space must be discrete\"\n", "\n", " obs_encoder = None\n", " if E[\"continuous_observations\"]:\n", " obs_encoder, _ = tile_state_space(\n", " self.env,\n", " self.env_info,\n", " self.n_tilings,\n", " state_action=False,\n", " obs_max=self.obs_max,\n", " obs_min=self.obs_min,\n", " grid_size=self.grid_dims,\n", " )\n", "\n", " self._create_2num_dicts(obs_encoder=obs_encoder)\n", "\n", " # behavior policy is stochastic, epsilon-soft policy\n", " self.behavior_policy = self.target_policy = self._epsilon_soft_policy\n", " if self.off_policy:\n", " # target policy is deterministic, greedy policy\n", " self.target_policy = self._greedy\n", "\n", " # initialize Q function\n", " self.parameters[\"Q\"] = defaultdict(np.random.rand)\n", "\n", " # initialize returns object for each state-action pair\n", " self.derived_variables = {\"episode_num\": 0}\n", "\n", " self.hyperparameters = {\n", " \"agent\": \"TemporalDifferenceAgent\",\n", " \"lr\": self.lr,\n", " \"obs_max\": self.obs_max,\n", " \"obs_min\": self.obs_min,\n", " \"epsilon\": self.epsilon,\n", " \"n_tilings\": self.n_tilings,\n", " \"grid_dims\": self.grid_dims,\n", " \"off_policy\": self.off_policy,\n", " \"temporal_discount\": self.temporal_discount,\n", " }\n", "\n", " self.episode_history = {\"state_actions\": [], \"rewards\": []}\n", "\n", " def run_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Run the agent on a single episode without updating the priority queue\n", " or performing backups.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode\n", " render : bool\n", " Whether to render the episode during training\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode, averaged over the theta samples.\n", " steps : float\n", " The total number of steps taken on the episode, averaged over the\n", " theta samples.\n", " \"\"\"\n", " return self._episode(max_steps, render, update=False)\n", "\n", " def train_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Train the agent on a single episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode.\n", " render : bool\n", " Whether to render the episode during training.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " D = self.derived_variables\n", " total_rwd, n_steps = self._episode(max_steps, render, update=True)\n", "\n", " D[\"episode_num\"] += 1\n", "\n", " return total_rwd, n_steps\n", "\n", " def _episode(self, max_steps, render, update=True):\n", " \"\"\"\n", " Run or train the agent on an episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during training.\n", " update : bool\n", " Whether to perform the Q function backups after each step. 
Default\n", " is True.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " self.flush_history()\n", "\n", " obs = self.env.reset()\n", " HS = self.episode_history\n", "\n", " action = self.act(obs)\n", " s = self._obs2num[obs]\n", " a = self._action2num[action]\n", "\n", " # store initial (state, action) tuple\n", " HS[\"state_actions\"].append((s, a))\n", "\n", " total_reward, n_steps = 0.0, 0\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " # take action\n", " obs, reward, done, info = self.env.step(action)\n", " n_steps += 1\n", "\n", " # record rewards\n", " HS[\"rewards\"].append(reward)\n", " total_reward += reward\n", "\n", " # generate next state and action\n", " action = self.act(obs)\n", " s_ = self._obs2num[obs] if not done else None\n", " a_ = self._action2num[action]\n", "\n", " # store next (state, action) tuple\n", " HS[\"state_actions\"].append((s_, a_))\n", "\n", " if update:\n", " self.update()\n", "\n", " if done:\n", " break\n", "\n", " return total_reward, n_steps\n", "\n", " def _epsilon_soft_policy(self, s, a=None):\n", " r\"\"\"\n", " Epsilon-soft exploration policy.\n", "\n", " In epsilon-soft policies, :math:`\\pi(a|s) > 0` for all s \u2208 S and all a\n", " \u2208 A(s) at the start of training. As learning progresses, :math:`\\pi`\n", " gradually shifts closer and closer to a deterministic optimal policy.\n", "\n", " In particular, we have:\n", "\n", " pi(a|s) = 1 - epsilon + (epsilon / |A(s)|) IFF a == a*\n", " pi(a|s) = epsilon / |A(s)| IFF a != a*\n", "\n", " where\n", "\n", " |A(s)| is the number of actions available in state s\n", " a* \u2208 A(s) is the greedy action in state s (i.e., a* = argmax_a Q(s, a))\n", "\n", " Note that epsilon-greedy policies are instances of epsilon-soft\n", " policies, defined as policies for which pi(a|s) >= epsilon / |A(s)| for\n", " all states and actions.\n", "\n", " Parameters\n", " ----------\n", " s : int, float, or tuple\n", " The state number for the current observation, as returned by\n", " ``self._obs2num[obs]``\n", " a : int, float, or tuple\n", " The action number in the current state, as returned by\n", " self._action2num[obs]. If None, sample an action from the action\n", " probabilities in state s, otherwise, return the probability of\n", " action `a` under the epsilon-soft policy. 
Default is None.\n", "\n", " Returns\n", " -------\n", " If `a` is None:\n", " action : int, float, or :py:class:`ndarray ` as returned by `self._num2action`\n", " If `a` is None, returns an action sampled from the distribution\n", " over actions defined by the epsilon-soft policy.\n", "\n", " If `a` is not None:\n", " action_prob : float in range [0, 1]\n", " If `a` is not None, returns the probability of `a` under the\n", " epsilon-soft policy.\n", " \"\"\" # noqa: E501\n", " E, P = self.env_info, self.parameters\n", "\n", " # TODO: this assumes all actions are available in every state\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " a_star = np.argmax([P[\"Q\"][(s, aa)] for aa in range(n_actions)])\n", " p_a_star = 1.0 - self.epsilon + (self.epsilon / n_actions)\n", " p_a = self.epsilon / n_actions\n", "\n", " action_probs = np.ones(n_actions) * p_a\n", " action_probs[a_star] = p_a_star\n", " np.testing.assert_allclose(np.sum(action_probs), 1)\n", "\n", " if a is not None:\n", " return action_probs[a]\n", "\n", " # sample action\n", " a = np.random.multinomial(1, action_probs).argmax()\n", " return self._num2action[a]\n", "\n", " def _greedy(self, s, a=None):\n", " \"\"\"\n", " A greedy behavior policy. Only used when off-policy is true.\n", "\n", " Parameters\n", " ----------\n", " s : int, float, or tuple\n", " The state number for the current observation, as returned by\n", " ``self._obs2num[obs]``\n", " a : int, float, or tuple\n", " The action number in the current state, as returned by\n", " ``self._action2num[obs]``. If None, sample an action from the\n", " action probabilities in state `s`, otherwise, return the\n", " probability of action `a` under the greedy policy. Default is None.\n", "\n", " Returns\n", " -------\n", " If `a` is None:\n", " action : int, float, or :py:class:`ndarray ` as returned by ``self._num2action``\n", " If `a` is None, returns an action sampled from the distribution\n", " over actions defined by the greedy policy.\n", "\n", " If `a` is not None:\n", " action_prob : float in range [0, 1]\n", " If `a` is not None, returns the probability of `a` under the\n", " greedy policy.\n", " \"\"\" # noqa: E501\n", " P, E = self.parameters, self.env_info\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", " a_star = np.argmax([P[\"Q\"][(s, aa)] for aa in range(n_actions)])\n", " if a is None:\n", " out = self._num2action[a_star]\n", " else:\n", " out = 1 if a == a_star else 0\n", " return out\n", "\n", " def _on_policy_update(self, s, a, r, s_, a_):\n", " \"\"\"\n", " Update the Q function using the expected SARSA on-policy TD(0) update:\n", "\n", " Q[s, a] <- Q[s, a] + lr * [\n", " r + temporal_discount * E[Q[s', a'] | s'] - Q[s, a]\n", " ]\n", "\n", " where\n", "\n", " E[ Q[s', a'] | s'] is the expected value of the Q function over all\n", " a_ given that we're in state s' under the current policy\n", "\n", " NB. the expected SARSA update can be used for both on- and off-policy\n", " methods. In an off-policy context, if the target policy is greedy and\n", " the expectation is taken wrt. 
the target policy then the expected SARSA\n", " update is exactly Q-learning.\n", "\n", " Parameters\n", " ----------\n", " s : int as returned by `self._obs2num`\n", " The id for the state/observation at timestep t-1\n", " a : int as returned by `self._action2num`\n", " The id for the action taken at timestep t-1\n", " r : float\n", " The reward after taking action `a` in state `s` at timestep t-1\n", " s_ : int as returned by `self._obs2num`\n", " The id for the state/observation at timestep t\n", " a_ : int as returned by `self._action2num`\n", " The id for the action taken at timestep t\n", " \"\"\"\n", " Q, E, pi = self.parameters[\"Q\"], self.env_info, self.behavior_policy\n", "\n", " # TODO: this assumes that all actions are available in each state\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " # compute the expected value of Q(s', a') given that we are in state s'\n", " E_Q = np.sum([pi(s_, aa) * Q[(s_, aa)] for aa in range(n_actions)]) if s_ else 0\n", "\n", " # perform the expected SARSA TD(0) update\n", " qsa = Q[(s, a)]\n", " Q[(s, a)] = qsa + self.lr * (r + self.temporal_discount * E_Q - qsa)\n", "\n", " def _off_policy_update(self, s, a, r, s_):\n", " \"\"\"\n", " Update the `Q` function using the TD(0) Q-learning update:\n", "\n", " Q[s, a] <- Q[s, a] + lr * (\n", " r + temporal_discount * max_a { Q[s', a] } - Q[s, a]\n", " )\n", "\n", " Parameters\n", " ----------\n", " s : int as returned by `self._obs2num`\n", " The id for the state/observation at timestep `t-1`\n", " a : int as returned by `self._action2num`\n", " The id for the action taken at timestep `t-1`\n", " r : float\n", " The reward after taking action `a` in state `s` at timestep `t-1`\n", " s_ : int as returned by `self._obs2num`\n", " The id for the state/observation at timestep `t`\n", " \"\"\"\n", " Q, E = self.parameters[\"Q\"], self.env_info\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " qsa = Q[(s, a)]\n", " Qs_ = [Q[(s_, aa)] for aa in range(n_actions)] if s_ else [0]\n", " Q[(s, a)] = qsa + self.lr * (r + self.temporal_discount * np.max(Qs_) - qsa)\n", "\n", " def update(self):\n", " \"\"\"Update the parameters of the model online after each new state-action.\"\"\"\n", " H, HS = self.hyperparameters, self.episode_history\n", " (s, a), r = HS[\"state_actions\"][-2], HS[\"rewards\"][-1]\n", " s_, a_ = HS[\"state_actions\"][-1]\n", "\n", " if H[\"off_policy\"]:\n", " self._off_policy_update(s, a, r, s_)\n", " else:\n", " self._on_policy_update(s, a, r, s_, a_)\n", "\n", " def act(self, obs):\n", " r\"\"\"\n", " Execute the behavior policy--an :math:`\\epsilon`-soft policy used to\n", " generate actions during training.\n", "\n", " Parameters\n", " ----------\n", " obs : int, float, or :py:class:`ndarray ` as returned by ``env.step(action)``\n", " An observation from the environment.\n", "\n", " Returns\n", " -------\n", " action : int, float, or :py:class:`ndarray `\n", " An action sampled from the distribution over actions defined by the\n", " epsilon-soft policy.\n", " \"\"\" # noqa: E501\n", " s = self._obs2num[obs]\n", " return self.behavior_policy(s)\n", "\n", " def greedy_policy(self, max_steps, render=True):\n", " \"\"\"\n", " Execute a deterministic greedy policy using the current agent\n", " parameters.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during execution.\n", "\n", " Returns\n", " -------\n", " total_reward : float\n", " The 
total reward on the episode.\n", " n_steps : float\n", " The total number of steps taken on the episode.\n", " \"\"\"\n", " self.flush_history()\n", "\n", " H = self.episode_history\n", " obs = self.env.reset()\n", "\n", " total_reward, n_steps = 0.0, 0\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " s = self._obs2num[obs]\n", " action = self._greedy(s)\n", " a = self._action2num[action]\n", "\n", " # store (state, action) tuple\n", " H[\"state_actions\"].append((s, a))\n", "\n", " # take action\n", " obs, reward, done, info = self.env.step(action)\n", " n_steps += 1\n", "\n", " # record rewards\n", " H[\"rewards\"].append(reward)\n", " total_reward += reward\n", "\n", " if done:\n", " break\n", "\n", " return total_reward, n_steps\n", "\n", "\n", "class DynaAgent(AgentBase):\n", " def __init__(\n", " self,\n", " env,\n", " lr=0.4,\n", " epsilon=0.1,\n", " n_tilings=8,\n", " obs_max=None,\n", " obs_min=None,\n", " q_plus=False,\n", " grid_dims=[8, 8],\n", " explore_weight=0.05,\n", " temporal_discount=0.9,\n", " n_simulated_actions=50,\n", " ):\n", " r\"\"\"\n", " A Dyna-`Q` / Dyna-`Q+` agent [5]_ with full TD(0) `Q`-learning updates via\n", " prioritized-sweeping [6]_ .\n", "\n", " Notes\n", " -----\n", " This approach consists of three components: a planning method involving\n", " simulated actions, a direct RL method where the agent directly interacts\n", " with the environment, and a model-learning method where the agent\n", " learns to better represent the environment during planning.\n", "\n", " During planning, the agent performs random-sample one-step tabular\n", " Q-planning with prioritized sweeping. This entails using a priority\n", " queue to retrieve the state-action pairs from the agent's history which\n", " would stand to have the largest change to their Q-values if backed up.\n", " Specifically, for state action pair `(s, a)` the priority value is:\n", "\n", " .. math::\n", "\n", " P = \\sum_{s'} p(s') | r + \\gamma \\max_a \\{Q(s', a) \\} - Q(s, a) |\n", "\n", " which corresponds to the absolute magnitude of the TD(0) Q-learning\n", " backup for the pair.\n", "\n", " When the first pair in the queue is backed up, the effect on each of\n", " its predecessor pairs is computed. If the predecessor's priority is\n", " greater than a small threshold the pair is added to the queue and the\n", " process is repeated until either the queue is empty or we have exceeded\n", " `n_simulated_actions` updates. These backups occur without the agent\n", " taking any action in the environment and thus constitute simulations\n", " based on the agent's current model of the environment (i.e., its\n", " tabular state-action history).\n", "\n", " During the direct RL phase, the agent takes an action based on its\n", " current behavior policy and Q function and receives a reward from the\n", " environment. The agent logs this state-action-reward-new state tuple in\n", " its interaction table (i.e., environment model) and updates its Q\n", " function using a full-backup version of the Q-learning update:\n", "\n", " .. math::\n", "\n", " Q(s, a) \\leftarrow Q(s, a) + \\eta \\sum_{r, s'} p(r, s' \\mid s, a)\n", " \\left(r + \\gamma \\max_a \\{ Q(s', a) \\} - Q(s, a) \\right)\n", "\n", " References\n", " ----------\n", " .. [5] Sutton, R. (1990). Integrated architectures for learning,\n", " planning, and reacting based on approximating dynamic programming.\n", " In *Proceedings of the 7th Annual ICML*, 216-224.\n", " .. [6] Moore, A. & Atkeson, C. (1993). 
Prioritized sweeping:\n", " Reinforcement learning with less data and less time. *Machine\n", " Learning, 13(1)*, 103-130.\n", "\n", " Parameters\n", " ----------\n", " env : :class:`gym.wrappers` or :class:`gym.envs` instance\n", " The environment to run the agent on\n", " lr : float\n", " Learning rate for the `Q` function updates. Default is 0.05.\n", " epsilon : float between [0, 1]\n", " The epsilon value in the epsilon-soft policy. Larger values\n", " encourage greater exploration during training. Default is 0.1.\n", " n_tilings : int\n", " The number of overlapping tilings to use if the env observation\n", " space is continuous. Unused if observation space is discrete.\n", " Default is 8.\n", " obs_max : float or :py:class:`ndarray ` or None\n", " The value to treat as the max value of the observation space when\n", " calculating the grid widths if the observation space is continuous.\n", " If None, use :meth:`env.observation_space.high`. Unused if observation\n", " space is discrete. Default is None.\n", " obs_min : float or :py:class:`ndarray ` or None\n", " The value to treat as the min value of the observation space when\n", " calculating grid widths if the observation space is continuous. If\n", " None, use :meth:`env.observation_space.low`. Unused if observation\n", " space is discrete. Default is None.\n", " grid_dims : list\n", " The number of rows and columns in each tiling grid if the env\n", " observation space is continuous. Unused if observation space is\n", " discrete. Default is `[8, 8]`.\n", " q_plus : bool\n", " Whether to add incentives for visiting states that the agent hasn't\n", " encountered recently. Default is False.\n", " explore_weight : float\n", " Amount to incentivize exploring states that the agent hasn't\n", " recently visited. Only used if `q_plus` is True. Default is 0.05.\n", " temporal_discount : float between [0, 1]\n", " The discount factor used for downweighting future rewards. Smaller\n", " values result in greater discounting of future rewards. 
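"        # A hypothetical usage sketch of the Dyna-Q+ variant (the episode counts\n",
"        # are assumptions; any gym env with a discrete action space applies):\n",
"        #\n",
"        #   agent = DynaAgent(env, q_plus=True, explore_weight=0.05,\n",
"        #                     n_simulated_actions=50)\n",
"        #   for _ in range(200):\n",
"        #       agent.train_episode(max_steps=300)   # direct RL + simulated backups\n",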
Default is\n", " 0.9.\n", " n_simulated_actions : int\n", " THe number of simulated actions to perform for each \"real\" action.\n", " Default is 50.\n", " \"\"\"\n", " super().__init__(env)\n", "\n", " self.lr = lr\n", " self.q_plus = q_plus\n", " self.obs_max = obs_max\n", " self.obs_min = obs_min\n", " self.epsilon = epsilon\n", " self.n_tilings = n_tilings\n", " self.grid_dims = grid_dims\n", " self.explore_weight = explore_weight\n", " self.temporal_discount = temporal_discount\n", " self.n_simulated_actions = n_simulated_actions\n", "\n", " self._init_params()\n", "\n", " def _init_params(self):\n", " E = self.env_info\n", " assert not E[\"continuous_actions\"], \"Action space must be discrete\"\n", "\n", " obs_encoder = None\n", " if E[\"continuous_observations\"]:\n", " obs_encoder, _ = tile_state_space(\n", " self.env,\n", " self.env_info,\n", " self.n_tilings,\n", " state_action=False,\n", " obs_max=self.obs_max,\n", " obs_min=self.obs_min,\n", " grid_size=self.grid_dims,\n", " )\n", "\n", " self._create_2num_dicts(obs_encoder=obs_encoder)\n", " self.behavior_policy = self.target_policy = self._epsilon_soft_policy\n", "\n", " # initialize Q function and model\n", " self.parameters[\"Q\"] = defaultdict(np.random.rand)\n", " self.parameters[\"model\"] = EnvModel()\n", "\n", " # initialize returns object for each state-action pair\n", " self.derived_variables = {\n", " \"episode_num\": 0,\n", " \"sweep_queue\": {},\n", " \"visited\": set(),\n", " \"steps_since_last_visit\": defaultdict(lambda: 0),\n", " }\n", "\n", " if self.q_plus:\n", " self.derived_variables[\"steps_since_last_visit\"] = defaultdict(\n", " np.random.rand,\n", " )\n", "\n", " self.hyperparameters = {\n", " \"agent\": \"DynaAgent\",\n", " \"lr\": self.lr,\n", " \"q_plus\": self.q_plus,\n", " \"obs_max\": self.obs_max,\n", " \"obs_min\": self.obs_min,\n", " \"epsilon\": self.epsilon,\n", " \"n_tilings\": self.n_tilings,\n", " \"grid_dims\": self.grid_dims,\n", " \"explore_weight\": self.explore_weight,\n", " \"temporal_discount\": self.temporal_discount,\n", " \"n_simulated_actions\": self.n_simulated_actions,\n", " }\n", "\n", " self.episode_history = {\"state_actions\": [], \"rewards\": []}\n", "\n", " def act(self, obs):\n", " r\"\"\"\n", " Execute the behavior policy--an :math:`\\epsilon`-soft policy used to\n", " generate actions during training.\n", "\n", " Parameters\n", " ----------\n", " obs : int, float, or :py:class:`ndarray ` as returned by ``env.step(action)``\n", " An observation from the environment.\n", "\n", " Returns\n", " -------\n", " action : int, float, or :py:class:`ndarray `\n", " An action sampled from the distribution over actions defined by the\n", " epsilon-soft policy.\n", " \"\"\" # noqa: E501\n", " s = self._obs2num[obs]\n", " return self.behavior_policy(s)\n", "\n", " def _epsilon_soft_policy(self, s, a=None):\n", " \"\"\"\n", " Epsilon-soft exploration policy.\n", "\n", " In epsilon-soft policies, pi(a|s) > 0 for all s \u2208 S and all a \u2208 A(s) at\n", " the start of training. 
As learning progresses, pi gradually shifts\n", " closer and closer to a deterministic optimal policy.\n", "\n", " In particular, we have:\n", "\n", " pi(a|s) = 1 - epsilon + (epsilon / |A(s)|) IFF a == a*\n", " pi(a|s) = epsilon / |A(s)| IFF a != a*\n", "\n", " where\n", "\n", " |A(s)| is the number of actions available in state s\n", " a* \u2208 A(s) is the greedy action in state s (i.e., a* = argmax_a Q(s, a))\n", "\n", " Note that epsilon-greedy policies are instances of epsilon-soft\n", " policies, defined as policies for which pi(a|s) >= epsilon / |A(s)| for\n", " all states and actions.\n", "\n", " Parameters\n", " ----------\n", " s : int, float, or tuple\n", " The state number for the current observation, as returned by\n", " self._obs2num[obs]\n", " a : int, float, or tuple\n", " The action number in the current state, as returned by\n", " self._action2num[obs]. If None, sample an action from the action\n", " probabilities in state s, otherwise, return the probability of\n", " action `a` under the epsilon-soft policy. Default is None.\n", "\n", " Returns\n", " -------\n", " If `a` is None:\n", " action : int, float, or :py:class:`ndarray ` as returned by :meth:`_num2action`\n", " If `a` is None, returns an action sampled from the distribution\n", " over actions defined by the epsilon-soft policy.\n", "\n", " If `a` is not None:\n", " action_prob : float in range [0, 1]\n", " If `a` is not None, returns the probability of `a` under the\n", " epsilon-soft policy.\n", " \"\"\" # noqa: E501\n", " E, P = self.env_info, self.parameters\n", "\n", " # TODO: this assumes all actions are available in every state\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " a_star = np.argmax([P[\"Q\"][(s, aa)] for aa in range(n_actions)])\n", " p_a_star = 1.0 - self.epsilon + (self.epsilon / n_actions)\n", " p_a = self.epsilon / n_actions\n", "\n", " action_probs = np.ones(n_actions) * p_a\n", " action_probs[a_star] = p_a_star\n", " np.testing.assert_allclose(np.sum(action_probs), 1)\n", "\n", " if a is not None:\n", " return action_probs[a]\n", "\n", " # sample action\n", " a = np.random.multinomial(1, action_probs).argmax()\n", " return self._num2action[a]\n", "\n", " def _greedy(self, s, a=None):\n", " \"\"\"\n", " A greedy behavior policy.\n", "\n", " Parameters\n", " ----------\n", " s : int, float, or tuple\n", " The state number for the current observation, as returned by\n", " self._obs2num[obs]\n", " a : int, float, or tuple\n", " The action number in the current state, as returned by\n", " self._action2num[obs]. If None, sample an action from the action\n", " probabilities in state s, otherwise, return the probability of\n", " action `a` under the greedy policy. 
Default is None.\n", "\n", " Returns\n", " -------\n", " If `a` is None:\n", " action : int, float, or :py:class:`ndarray ` as returned by :meth:`_num2action`\n", " If `a` is None, returns an action sampled from the distribution\n", " over actions defined by the greedy policy.\n", "\n", " If `a` is not None:\n", " action_prob : float in range [0, 1]\n", " If `a` is not None, returns the probability of `a` under the\n", " greedy policy.\n", " \"\"\" # noqa: E501\n", " E, Q = self.env_info, self.parameters[\"Q\"]\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", " a_star = np.argmax([Q[(s, aa)] for aa in range(n_actions)])\n", " if a is None:\n", " out = self._num2action[a_star]\n", " else:\n", " out = 1 if a == a_star else 0\n", " return out\n", "\n", " def update(self):\n", " \"\"\"\n", " Update the priority queue with the most recent (state, action) pair and\n", " perform random-sample one-step tabular Q-planning.\n", "\n", " Notes\n", " -----\n", " The planning algorithm uses a priority queue to retrieve the\n", " state-action pairs from the agent's history which will result in the\n", " largest change to its `Q`-value if backed up. When the first pair in\n", " the queue is backed up, the effect on each of its predecessor pairs is\n", " computed. If the predecessor's priority is greater than a small\n", " threshold the pair is added to the queue and the process is repeated\n", " until either the queue is empty or we exceed `n_simulated_actions`\n", " updates.\n", " \"\"\"\n", " s, a = self.episode_history[\"state_actions\"][-1]\n", " self._update_queue(s, a)\n", " self._simulate_behavior()\n", "\n", " def _update_queue(self, s, a):\n", " \"\"\"\n", " Update the priority queue by calculating the priority for (s, a) and\n", " inserting it into the queue if it exceeds a fixed (small) threshold.\n", "\n", " Parameters\n", " ----------\n", " s : int as returned by `self._obs2num`\n", " The id for the state/observation\n", " a : int as returned by `self._action2num`\n", " The id for the action taken from state `s`\n", " \"\"\"\n", " sweep_queue = self.derived_variables[\"sweep_queue\"]\n", "\n", " # TODO: what's a good threshold here?\n", " priority = self._calc_priority(s, a)\n", " if priority >= 0.001:\n", " if (s, a) in sweep_queue:\n", " sweep_queue[(s, a)] = max(priority, sweep_queue[(s, a)])\n", " else:\n", " sweep_queue[(s, a)] = priority\n", "\n", " def _calc_priority(self, s, a):\n", " \"\"\"\n", " Compute the \"priority\" for state-action pair (s, a). 
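"        # A minimal sketch of the priority computation for a toy (s, a) pair with\n",
"        # two modeled outcomes (illustrative only), assuming `Q` maps (state,\n",
"        # action) tuples to values and `gamma`, `n_actions`, `s1`, `s2` are given:\n",
"        #\n",
"        #   outcomes = [((1.0, s1), 0.75), ((0.0, s2), 0.25)]   # ((reward, s_), prob)\n",
"        #   priority = sum(\n",
"        #       p * abs(r + gamma * max(Q[(s_, a_)] for a_ in range(n_actions)) - Q[(s, a)])\n",
"        #       for (r, s_), p in outcomes\n",
"        #   )\n",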
The priority P is\n", " defined as:\n", "\n", " P = sum_{s_} p(s_) * abs(r + temporal_discount * max_a {Q[s_, a]} - Q[s, a])\n", "\n", " which corresponds to the absolute magnitude of the TD(0) Q-learning\n", " backup for (s, a).\n", "\n", " Parameters\n", " ----------\n", " s : int as returned by `self._obs2num`\n", " The id for the state/observation\n", " a : int as returned by `self._action2num`\n", " The id for the action taken from state `s`\n", "\n", " Returns\n", " -------\n", " priority : float\n", " The absolute magnitude of the full-backup TD(0) Q-learning update\n", " for (s, a)\n", " \"\"\"\n", " priority = 0.0\n", " E = self.env_info\n", " Q = self.parameters[\"Q\"]\n", " env_model = self.parameters[\"model\"]\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " outcome_probs = env_model.outcome_probs(s, a)\n", " for (r, s_), p_rs_ in outcome_probs:\n", " max_q = np.max([Q[(s_, aa)] for aa in range(n_actions)])\n", " P = p_rs_ * (r + self.temporal_discount * max_q - Q[(s, a)])\n", " priority += np.abs(P)\n", " return priority\n", "\n", " def _simulate_behavior(self):\n", " \"\"\"\n", " Perform random-sample one-step tabular Q-planning with prioritized\n", " sweeping.\n", "\n", " Notes\n", " -----\n", " This approach uses a priority queue to retrieve the state-action pairs\n", " from the agent's history with largest change to their Q-values if\n", " backed up. When the first pair in the queue is backed up, the effect on\n", " each of its predecessor pairs is computed. If the predecessor's\n", " priority is greater than a small threshold the pair is added to the\n", " queue and the process is repeated until either the queue is empty or we\n", " have exceeded a `n_simulated_actions` updates.\n", " \"\"\"\n", " env_model = self.parameters[\"model\"]\n", " sweep_queue = self.derived_variables[\"sweep_queue\"]\n", " for _ in range(self.n_simulated_actions):\n", " if len(sweep_queue) == 0:\n", " break\n", "\n", " # select (s, a) pair with the largest update (priority)\n", " sq_items = list(sweep_queue.items())\n", " (s_sim, a_sim), _ = sorted(sq_items, key=lambda x: x[1], reverse=True)[0]\n", "\n", " # remove entry from queue\n", " del sweep_queue[(s_sim, a_sim)]\n", "\n", " # update Q function for (s_sim, a_sim) using the full-backup\n", " # version of the TD(0) Q-learning update\n", " self._update(s_sim, a_sim)\n", "\n", " # get all (_s, _a) pairs that lead to s_sim (ie., s_sim's predecessors)\n", " pairs = env_model.state_action_pairs_leading_to_outcome(s_sim)\n", "\n", " # add predecessors to queue if their priority exceeds thresh\n", " for (_s, _a) in pairs:\n", " self._update_queue(_s, _a)\n", "\n", " def _update(self, s, a):\n", " \"\"\"\n", " Update Q using a full-backup version of the TD(0) Q-learning update:\n", "\n", " Q(s, a) = Q(s, a) + lr *\n", " sum_{r, s'} [\n", " p(r, s' | s, a) * (r + gamma * max_a { Q(s', a) } - Q(s, a))\n", " ]\n", "\n", " Parameters\n", " ----------\n", " s : int as returned by ``self._obs2num``\n", " The id for the state/observation\n", " a : int as returned by ``self._action2num``\n", " The id for the action taken from state `s`\n", " \"\"\"\n", " update = 0.0\n", " env_model = self.parameters[\"model\"]\n", " E, D, Q = self.env_info, self.derived_variables, self.parameters[\"Q\"]\n", " n_actions = np.prod(E[\"n_actions_per_dim\"])\n", "\n", " # sample rewards from the model\n", " outcome_probs = env_model.outcome_probs(s, a)\n", " for (r, s_), p_rs_ in outcome_probs:\n", " # encourage visiting long-untried actions by adding a 
\"bonus\"\n", " # reward proportional to the sqrt of the time since last visit\n", " if self.q_plus:\n", " r += self.explore_weight * np.sqrt(D[\"steps_since_last_visit\"][(s, a)])\n", "\n", " max_q = np.max([Q[(s_, a_)] for a_ in range(n_actions)])\n", " update += p_rs_ * (r + self.temporal_discount * max_q - Q[(s, a)])\n", "\n", " # update Q value for (s, a) pair\n", " Q[(s, a)] += self.lr * update\n", "\n", " def run_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Run the agent on a single episode without performing `Q`-function\n", " backups.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode.\n", " render : bool\n", " Whether to render the episode during training.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " return self._episode(max_steps, render, update=False)\n", "\n", " def train_episode(self, max_steps, render=False):\n", " \"\"\"\n", " Train the agent on a single episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run an episode.\n", " render : bool\n", " Whether to render the episode during training.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " D = self.derived_variables\n", " total_rwd, n_steps = self._episode(max_steps, render, update=True)\n", " D[\"episode_num\"] += 1\n", " return total_rwd, n_steps\n", "\n", " def _episode(self, max_steps, render, update=True):\n", " \"\"\"\n", " Run or train the agent on an episode.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during training.\n", " update : bool\n", " Whether to perform the `Q` function backups after each step.\n", " Default is True.\n", "\n", " Returns\n", " -------\n", " reward : float\n", " The total reward on the episode.\n", " steps : float\n", " The number of steps taken on the episode.\n", " \"\"\"\n", " self.flush_history()\n", "\n", " obs = self.env.reset()\n", " env_model = self.parameters[\"model\"]\n", " HS, D = self.episode_history, self.derived_variables\n", "\n", " action = self.act(obs)\n", " s = self._obs2num[obs]\n", " a = self._action2num[action]\n", "\n", " # store initial (state, action) tuple\n", " HS[\"state_actions\"].append((s, a))\n", "\n", " total_reward, n_steps = 0.0, 0\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " # take action\n", " obs, reward, done, info = self.env.step(action)\n", " n_steps += 1\n", "\n", " # record rewards\n", " HS[\"rewards\"].append(reward)\n", " total_reward += reward\n", "\n", " # generate next state and action\n", " action = self.act(obs)\n", " s_ = self._obs2num[obs] if not done else None\n", " a_ = self._action2num[action]\n", "\n", " # update model\n", " env_model[(s, a, reward, s_)] += 1\n", "\n", " # update history counter\n", " for k in D[\"steps_since_last_visit\"].keys():\n", " D[\"steps_since_last_visit\"][k] += 1\n", " D[\"steps_since_last_visit\"][(s, a)] = 0\n", "\n", " if update:\n", " self.update()\n", "\n", " # store next (state, action) tuple\n", " HS[\"state_actions\"].append((s_, a_))\n", " s, a = s_, a_\n", "\n", " if done:\n", " break\n", "\n", " return total_reward, n_steps\n", "\n", " 
def greedy_policy(self, max_steps, render=True):\n", " \"\"\"\n", " Execute a deterministic greedy policy using the current agent\n", " parameters.\n", "\n", " Parameters\n", " ----------\n", " max_steps : int\n", " The maximum number of steps to run the episode.\n", " render : bool\n", " Whether to render the episode during execution.\n", "\n", " Returns\n", " -------\n", " total_reward : float\n", " The total reward on the episode.\n", " n_steps : float\n", " The total number of steps taken on the episode.\n", " \"\"\"\n", " self.flush_history()\n", "\n", " H = self.episode_history\n", " obs = self.env.reset()\n", "\n", " total_reward, n_steps = 0.0, 0\n", " for i in range(max_steps):\n", " if render:\n", " self.env.render()\n", "\n", " s = self._obs2num[obs]\n", " action = self._greedy(s)\n", " a = self._action2num[action]\n", "\n", " # store (state, action) tuple\n", " H[\"state_actions\"].append((s, a))\n", "\n", " # take action\n", " obs, reward, done, info = self.env.step(action)\n", " n_steps += 1\n", "\n", " # record rewards\n", " H[\"rewards\"].append(reward)\n", " total_reward += reward\n", "\n", " if done:\n", " break\n", "\n", " return total_reward, n_steps\n"]} {"path": "numpy_ml/rl_models/trainer.py", "content": ["from time import time\n", "import numpy as np\n", "\n", "\n", "class Trainer(object):\n", " def __init__(self, agent, env):\n", " \"\"\"\n", " An object to facilitate agent training and evaluation.\n", "\n", " Parameters\n", " ----------\n", " agent : :class:`AgentBase` instance\n", " The agent to train.\n", " env : ``gym.wrappers`` or ``gym.envs`` instance\n", " The environment to run the agent on.\n", " \"\"\"\n", " self.env = env\n", " self.agent = agent\n", " self.rewards = {\"total\": [], \"smooth_total\": [], \"n_steps\": [], \"duration\": []}\n", "\n", " def _train_episode(self, max_steps, render_every=None):\n", " t0 = time()\n", " if \"train_episode\" in dir(self.agent):\n", " # online training updates over the course of the episode\n", " reward, n_steps = self.agent.train_episode(max_steps)\n", " else:\n", " # offline training updates upon completion of the episode\n", " reward, n_steps = self.agent.run_episode(max_steps)\n", " self.agent.update()\n", " duration = time() - t0\n", " return reward, duration, n_steps\n", "\n", " def train(\n", " self,\n", " n_episodes,\n", " max_steps,\n", " seed=None,\n", " plot=True,\n", " verbose=True,\n", " render_every=None,\n", " smooth_factor=0.05,\n", " ):\n", " \"\"\"\n", " Train an agent on an OpenAI gym environment, logging training\n", " statistics along the way.\n", "\n", " Parameters\n", " ----------\n", " n_episodes : int\n", " The number of episodes to train the agent across.\n", " max_steps : int\n", " The maximum number of steps the agent can take on each episode.\n", " seed : int or None\n", " A seed for the random number generator. Default is None.\n", " plot : bool\n", " Whether to generate a plot of the cumulative reward as a function\n", " of training episode. Default is True.\n", " verbose : bool\n", " Whether to print intermediate run statistics to stdout during\n", " training. Default is True.\n", " smooth_factor : float in [0, 1]\n", " The amount to smooth the cumulative reward across episodes. 
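# A short illustration of the exponential smoothing recurrence used for
# `smooth_total` in `Trainer.train` below; the reward sequence here is made up.
import numpy as np

rewards = [10.0, 12.0, 8.0, 15.0]
sf = 0.05                         # smooth_factor

smooth = rewards[0]
smoothed = [smooth]
for r in rewards[1:]:
    smooth = (1 - sf) * smooth + sf * r
    smoothed.append(smooth)

print(np.round(smoothed, 3))      # [10.  10.1  9.995  10.245]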
Larger\n", " values correspond to less smoothing.\n", " \"\"\"\n", " if seed:\n", " np.random.seed(seed)\n", " self.env.seed(seed=seed)\n", "\n", " t0 = time()\n", " render_every = np.inf if render_every is None else render_every\n", " sf = smooth_factor\n", "\n", " for ep in range(n_episodes):\n", " tot_rwd, duration, n_steps = self._train_episode(max_steps)\n", " smooth_tot = tot_rwd if ep == 0 else (1 - sf) * smooth_tot + sf * tot_rwd\n", "\n", " if verbose:\n", " fstr = \"[Ep. {:2}] {:<6.2f} Steps | Total Reward: {:<7.2f}\"\n", " fstr += \" | Smoothed Total: {:<7.2f} | Duration: {:<6.2f}s\"\n", " print(fstr.format(ep + 1, n_steps, tot_rwd, smooth_tot, duration))\n", "\n", " if (ep + 1) % render_every == 0:\n", " fstr = \"\\tGreedy policy total reward: {:.2f}, n_steps: {:.2f}\"\n", " total, n_steps = self.agent.greedy_policy(max_steps)\n", " print(fstr.format(total, n_steps))\n", "\n", " self.rewards[\"total\"].append(tot_rwd)\n", " self.rewards[\"n_steps\"].append(n_steps)\n", " self.rewards[\"duration\"].append(duration)\n", " self.rewards[\"smooth_total\"].append(smooth_tot)\n", "\n", " train_time = (time() - t0) / 60\n", " fstr = \"Training took {:.2f} mins [{:.2f}s/episode]\"\n", " print(fstr.format(train_time, np.mean(self.rewards[\"duration\"])))\n", "\n", " rwd_greedy, n_steps = self.agent.greedy_policy(max_steps, render=False)\n", " fstr = \"Final greedy reward: {:.2f} | n_steps: {:.2f}\"\n", " print(fstr.format(rwd_greedy, n_steps))\n", "\n", " if plot:\n", " self.plot_rewards(rwd_greedy)\n", "\n", " def plot_rewards(self, rwd_greedy):\n", " \"\"\"\n", " Plot the cumulative reward per episode as a function of episode number.\n", "\n", " Notes\n", " -----\n", " Saves plot to the file ``./img/-.png``\n", "\n", " Parameters\n", " ----------\n", " rwd_greedy : float\n", " The cumulative reward earned with a final execution of a greedy\n", " target policy.\n", " \"\"\"\n", " try:\n", " import matplotlib.pyplot as plt\n", " import seaborn as sns\n", "\n", " # https://seaborn.pydata.org/generated/seaborn.set_context.html\n", " # https://seaborn.pydata.org/generated/seaborn.set_style.html\n", " sns.set_style(\"white\")\n", " sns.set_context(\"notebook\", font_scale=1)\n", " except:\n", " fstr = \"Error importing `matplotlib` and `seaborn` -- plotting functionality is disabled\"\n", " raise ImportError(fstr)\n", "\n", " R = self.rewards\n", " fig, ax = plt.subplots()\n", " x = np.arange(len(R[\"total\"]))\n", " y = R[\"smooth_total\"]\n", " y_raw = R[\"total\"]\n", "\n", " ax.plot(x, y, label=\"smoothed\")\n", " ax.plot(x, y_raw, alpha=0.5, label=\"raw\")\n", " ax.axhline(y=rwd_greedy, xmin=min(x), xmax=max(x), ls=\":\", label=\"final greedy\")\n", " ax.legend()\n", " sns.despine()\n", "\n", " env = self.agent.env_info[\"id\"]\n", " agent = self.agent.hyperparameters[\"agent\"]\n", "\n", " ax.set_xlabel(\"Episode\")\n", " ax.set_ylabel(\"Cumulative reward\")\n", " ax.set_title(\"{} on '{}'\".format(agent, env))\n", " plt.savefig(\"img/{}-{}.png\".format(agent, env))\n", " plt.close(\"all\")\n"]} {"path": "numpy_ml/rl_models/rl_utils.py", "content": ["\"\"\"Utilities for training and evaluating RL models on OpenAI gym environments\"\"\"\n", "import warnings\n", "from itertools import product\n", "from collections import defaultdict\n", "\n", "import numpy as np\n", "\n", "from numpy_ml.utils.testing import DependencyWarning\n", "from numpy_ml.rl_models.tiles.tiles3 import tiles, IHT\n", "\n", "NO_PD = False\n", "try:\n", " import pandas as pd\n", "except ModuleNotFoundError:\n", " NO_PD 
= True\n", "\n", "try:\n", " import gym\n", "except ModuleNotFoundError:\n", " fstr = (\n", " \"Agents in `numpy_ml.rl_models` use the OpenAI gym for training. \"\n", " \"To install the gym environments, run `pip install gym`. For more\"\n", " \" information, see https://github.com/openai/gym.\"\n", " )\n", " warnings.warn(fstr, DependencyWarning)\n", "\n", "\n", "class EnvModel(object):\n", " \"\"\"\n", " A simple tabular environment model that maintains the counts of each\n", " reward-outcome pair given the state and action that preceded them. The\n", " model can be queried with\n", "\n", " >>> M = EnvModel()\n", " >>> M[(state, action, reward, next_state)] += 1\n", " >>> M[(state, action, reward, next_state)]\n", " 1\n", " >>> M.state_action_pairs()\n", " [(state, action)]\n", " >>> M.outcome_probs(state, action)\n", " [(next_state, 1)]\n", " \"\"\"\n", "\n", " def __init__(self):\n", " super(EnvModel, self).__init__()\n", " self._model = defaultdict(lambda: defaultdict(lambda: 0))\n", "\n", " def __setitem__(self, key, value):\n", " \"\"\"Set self[key] to value\"\"\"\n", " s, a, r, s_ = key\n", " self._model[(s, a)][(r, s_)] = value\n", "\n", " def __getitem__(self, key):\n", " \"\"\"Return the value associated with key\"\"\"\n", " s, a, r, s_ = key\n", " return self._model[(s, a)][(r, s_)]\n", "\n", " def __contains__(self, key):\n", " \"\"\"True if EnvModel contains `key`, else False\"\"\"\n", " s, a, r, s_ = key\n", " p1 = (s, a) in self.state_action_pairs()\n", " p2 = (r, s_) in self.reward_outcome_pairs()\n", " return p1 and p2\n", "\n", " def state_action_pairs(self):\n", " \"\"\"Return all (state, action) pairs in the environment model\"\"\"\n", " return list(self._model.keys())\n", "\n", " def reward_outcome_pairs(self, s, a):\n", " \"\"\"\n", " Return all (reward, next_state) pairs associated with taking action `a`\n", " in state `s`.\n", " \"\"\"\n", " return list(self._model[(s, a)].keys())\n", "\n", " def outcome_probs(self, s, a):\n", " \"\"\"\n", " Return the probability under the environment model of each outcome\n", " state after taking action `a` in state `s`.\n", "\n", " Parameters\n", " ----------\n", " s : int as returned by ``self._obs2num``\n", " The id for the state/observation.\n", " a : int as returned by ``self._action2num``\n", " The id for the action taken from state `s`.\n", "\n", " Returns\n", " -------\n", " outcome_probs : list of (state, prob) tuples\n", " A list of each possible outcome and its associated probability\n", " under the model.\n", " \"\"\"\n", " items = list(self._model[(s, a)].items())\n", " total_count = np.sum([c for (_, c) in items])\n", " outcome_probs = [c / total_count for (_, c) in items]\n", " outcomes = [p for (p, _) in items]\n", " return list(zip(outcomes, outcome_probs))\n", "\n", " def state_action_pairs_leading_to_outcome(self, outcome):\n", " \"\"\"\n", " Return all (state, action) pairs that have a nonzero probability of\n", " producing `outcome` under the current model.\n", "\n", " Parameters\n", " ----------\n", " outcome : int\n", " The outcome state.\n", "\n", " Returns\n", " -------\n", " pairs : list of (state, action) tuples\n", " A list of all (state, action) pairs with a nonzero probability of\n", " producing `outcome` under the model.\n", " \"\"\"\n", " pairs = []\n", " for sa in self.state_action_pairs():\n", " outcomes = [o for (r, o) in self.reward_outcome_pairs(*sa)]\n", " if outcome in outcomes:\n", " pairs.append(sa)\n", " return pairs\n", "\n", "\n", "def tile_state_space(\n", " env,\n", " env_stats,\n", " 
n_tilings,\n", " obs_max=None,\n", " obs_min=None,\n", " state_action=False,\n", " grid_size=(4, 4),\n", "):\n", " \"\"\"\n", " Return a function to encode the continous observations generated by `env`\n", " in terms of a collection of `n_tilings` overlapping tilings (each with\n", " dimension `grid_size`) of the state space.\n", "\n", " Arguments\n", " ---------\n", " env : ``gym.wrappers.time_limit.TimeLimit`` instance\n", " An openAI environment.\n", " n_tilings : int\n", " The number of overlapping tilings to use. Should be a power of 2. This\n", " determines the dimension of the discretized tile-encoded state vector.\n", " obs_max : float or np.ndarray\n", " The value to treat as the max value of the observation space when\n", " calculating the grid widths. If None, use\n", " ``env.observation_space.high``. Default is None.\n", " obs_min : float or np.ndarray\n", " The value to treat as the min value of the observation space when\n", " calculating the grid widths. If None, use\n", " ``env.observation_space.low``. Default is None.\n", " state_action : bool\n", " Whether to use tile coding to encode state-action values (True) or just\n", " state values (False). Default is False.\n", " grid_size : list of length 2\n", " A list of ints representing the coarseness of the tilings. E.g., a\n", " `grid_size` of [4, 4] would mean each tiling consisted of a 4x4 tile\n", " grid. Default is [4, 4].\n", "\n", " Returns\n", " -------\n", " encode_obs_as_tile : function\n", " A function which takes as input continous observation vector and\n", " returns a set of the indices of the active tiles in the tile coded\n", " observation space.\n", " n_states : int\n", " An integer reflecting the total number of unique states possible under\n", " this tile coding regimen.\n", " \"\"\"\n", " obs_max = np.nan_to_num(env.observation_space.high) if obs_max is None else obs_max\n", " obs_min = np.nan_to_num(env.observation_space.low) if obs_min is None else obs_min\n", "\n", " if state_action:\n", " if env_stats[\"tuple_action\"]:\n", " n = [space.n - 1.0 for space in env.action_spaces.spaces]\n", " else:\n", " n = [env.action_space.n]\n", "\n", " obs_max = np.concatenate([obs_max, n])\n", " obs_min = np.concatenate([obs_min, np.zeros_like(n)])\n", "\n", " obs_range = obs_max - obs_min\n", " scale = 1.0 / obs_range\n", "\n", " # scale (state-)observation vector\n", " scale_obs = lambda obs: obs * scale # noqa: E731\n", "\n", " n_tiles = np.prod(grid_size) * n_tilings\n", " n_states = np.prod([n_tiles - i for i in range(n_tilings)])\n", " iht = IHT(16384)\n", "\n", " def encode_obs_as_tile(obs):\n", " obs = scale_obs(obs)\n", " return tuple(tiles(iht, n_tilings, obs))\n", "\n", " return encode_obs_as_tile, n_states\n", "\n", "\n", "def get_gym_environs():\n", " \"\"\"List all valid OpenAI ``gym`` environment ids\"\"\"\n", " return [e.id for e in gym.envs.registry.all()]\n", "\n", "\n", "def get_gym_stats():\n", " \"\"\"Return a pandas DataFrame of the environment IDs.\"\"\"\n", " df = []\n", " for e in gym.envs.registry.all():\n", " print(e.id)\n", " df.append(env_stats(gym.make(e.id)))\n", " cols = [\n", " \"id\",\n", " \"continuous_actions\",\n", " \"continuous_observations\",\n", " \"action_dim\",\n", " # \"action_ids\",\n", " \"deterministic\",\n", " \"multidim_actions\",\n", " \"multidim_observations\",\n", " \"n_actions_per_dim\",\n", " \"n_obs_per_dim\",\n", " \"obs_dim\",\n", " # \"obs_ids\",\n", " \"seed\",\n", " \"tuple_actions\",\n", " \"tuple_observations\",\n", " ]\n", " return df if NO_PD else 
pd.DataFrame(df)[cols]\n", "\n", "\n", "def is_tuple(env):\n", " \"\"\"\n", " Check if the action and observation spaces for `env` are instances of\n", " ``gym.spaces.Tuple`` or ``gym.spaces.Dict``.\n", "\n", " Notes\n", " -----\n", " A tuple space is a tuple of *several* (possibly multidimensional)\n", " action/observation spaces. For our purposes, a tuple space is necessarily\n", " multidimensional.\n", "\n", " Returns\n", " -------\n", " tuple_action : bool\n", " Whether the `env`'s action space is an instance of ``gym.spaces.Tuple``\n", " or ``gym.spaces.Dict``.\n", " tuple_obs : bool\n", " Whether the `env`'s observation space is an instance of\n", " ``gym.spaces.Tuple`` or ``gym.spaces.Dict``.\n", " \"\"\"\n", " tuple_space, dict_space = gym.spaces.Tuple, gym.spaces.dict.Dict\n", " tuple_action = isinstance(env.action_space, (tuple_space, dict_space))\n", " tuple_obs = isinstance(env.observation_space, (tuple_space, dict_space))\n", " return tuple_action, tuple_obs\n", "\n", "\n", "def is_multidimensional(env):\n", " \"\"\"\n", " Check if the action and observation spaces for `env` are multidimensional\n", " or ``Tuple`` spaces.\n", "\n", " Notes\n", " -----\n", " A multidimensional space is any space whose actions / observations have\n", " more than one element in them. This includes ``Tuple`` spaces, but also\n", " includes single action/observation spaces with several dimensions.\n", "\n", " Parameters\n", " ----------\n", " env : ``gym.wrappers`` or ``gym.envs`` instance\n", " The environment to evaluate.\n", "\n", " Returns\n", " -------\n", " md_action : bool\n", " Whether the `env`'s action space is multidimensional.\n", " md_obs : bool\n", " Whether the `env`'s observation space is multidimensional.\n", " tuple_action : bool\n", " Whether the `env`'s action space is a ``Tuple`` instance.\n", " tuple_obs : bool\n", " Whether the `env`'s observation space is a ``Tuple`` instance.\n", " \"\"\"\n", " md_action, md_obs = True, True\n", " tuple_action, tuple_obs = is_tuple(env)\n", " if not tuple_action:\n", " act = env.action_space.sample()\n", " md_action = isinstance(act, (list, tuple, np.ndarray)) and len(act) > 1\n", "\n", " if not tuple_obs:\n", " OS = env.observation_space\n", " obs = OS.low if \"low\" in dir(OS) else OS.sample() # sample causes problems\n", " md_obs = isinstance(obs, (list, tuple, np.ndarray)) and len(obs) > 1\n", " return md_action, md_obs, tuple_action, tuple_obs\n", "\n", "\n", "def is_continuous(env, tuple_action, tuple_obs):\n", " \"\"\"\n", " Check if an `env`'s observation and action spaces are continuous.\n", "\n", " Parameters\n", " ----------\n", " env : ``gym.wrappers`` or ``gym.envs`` instance\n", " The environment to evaluate.\n", " tuple_action : bool\n", " Whether the `env`'s action space is an instance of `gym.spaces.Tuple`\n", " or `gym.spaces.Dict`.\n", " tuple_obs : bool\n", " Whether the `env`'s observation space is an instance of `gym.spaces.Tuple`\n", " or `gym.spaces.Dict`.\n", "\n", " Returns\n", " -------\n", " cont_action : bool\n", " Whether the `env`'s action space is continuous.\n", " cont_obs : bool\n", " Whether the `env`'s observation space is continuous.\n", " \"\"\"\n", " Continuous = gym.spaces.box.Box\n", " if tuple_obs:\n", " spaces = env.observation_space.spaces\n", " cont_obs = all(isinstance(s, Continuous) for s in spaces)\n", " else:\n", " cont_obs = isinstance(env.observation_space, Continuous)\n", "\n", " if tuple_action:\n", " spaces = env.action_space.spaces\n", " cont_action = all(isinstance(s, Continuous) for s 
in spaces)\n", " else:\n", " cont_action = isinstance(env.action_space, Continuous)\n", " return cont_action, cont_obs\n", "\n", "\n", "def action_stats(env, md_action, cont_action):\n", " \"\"\"\n", " Get information on `env`'s action space.\n", "\n", " Parameters\n", " ----------\n", " md_action : bool\n", " Whether the `env`'s action space is multidimensional.\n", " cont_action : bool\n", " Whether the `env`'s action space is continuous.\n", "\n", " Returns\n", " -------\n", " n_actions_per_dim : list of length (action_dim,)\n", " The number of possible actions for each dimension of the action space.\n", " action_ids : list or None\n", " A list of all valid actions within the space. If `cont_action` is\n", " True, this value will be None.\n", " action_dim : int or None\n", " The number of dimensions in a single action.\n", " \"\"\"\n", " if cont_action:\n", " action_dim = 1\n", " action_ids = None\n", " n_actions_per_dim = [np.inf]\n", "\n", " if md_action:\n", " action_dim = env.action_space.shape[0]\n", " n_actions_per_dim = [np.inf for _ in range(action_dim)]\n", " else:\n", " if md_action:\n", " n_actions_per_dim = [\n", " space.n if hasattr(space, \"n\") else np.inf\n", " for space in env.action_space.spaces\n", " ]\n", " action_ids = (\n", " None\n", " if np.inf in n_actions_per_dim\n", " else list(product(*[range(i) for i in n_actions_per_dim]))\n", " )\n", " action_dim = len(n_actions_per_dim)\n", " else:\n", " action_dim = 1\n", " n_actions_per_dim = [env.action_space.n]\n", " action_ids = list(range(n_actions_per_dim[0]))\n", " return n_actions_per_dim, action_ids, action_dim\n", "\n", "\n", "def obs_stats(env, md_obs, cont_obs):\n", " \"\"\"\n", " Get information on the observation space for `env`.\n", "\n", " Parameters\n", " ----------\n", " env : ``gym.wrappers`` or ``gym.envs`` instance\n", " The environment to evaluate.\n", " md_obs : bool\n", " Whether the `env`'s action space is multidimensional.\n", " cont_obs : bool\n", " Whether the `env`'s observation space is multidimensional.\n", "\n", " Returns\n", " -------\n", " n_obs_per_dim : list of length (obs_dim,)\n", " The number of possible observation classes for each dimension of the\n", " observation space.\n", " obs_ids : list or None\n", " A list of all valid observations within the space. 
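# How the `action_ids` / `obs_ids` enumerations in this module are built when
# every dimension is discrete: the Cartesian product of per-dimension ranges.
# The counts below are made-up examples.
from itertools import product

n_actions_per_dim = [3, 2]
action_ids = list(product(*[range(i) for i in n_actions_per_dim]))
print(action_ids)   # [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]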
If `cont_obs` is\n", " True, this value will be None.\n", " obs_dim : int or None\n", " The number of dimensions in a single observation.\n", " \"\"\"\n", " if cont_obs:\n", " obs_ids = None\n", " obs_dim = env.observation_space.shape[0]\n", " n_obs_per_dim = [np.inf for _ in range(obs_dim)]\n", " else:\n", " if md_obs:\n", " n_obs_per_dim = [\n", " space.n if hasattr(space, \"n\") else np.inf\n", " for space in env.observation_space.spaces\n", " ]\n", " obs_ids = (\n", " None\n", " if np.inf in n_obs_per_dim\n", " else list(product(*[range(i) for i in n_obs_per_dim]))\n", " )\n", " obs_dim = len(n_obs_per_dim)\n", " else:\n", " obs_dim = 1\n", " n_obs_per_dim = [env.observation_space.n]\n", " obs_ids = list(range(n_obs_per_dim[0]))\n", "\n", " return n_obs_per_dim, obs_ids, obs_dim\n", "\n", "\n", "def env_stats(env):\n", " \"\"\"\n", " Compute statistics for the current environment.\n", "\n", " Parameters\n", " ----------\n", " env : ``gym.wrappers`` or ``gym.envs`` instance\n", " The environment to evaluate.\n", "\n", " Returns\n", " -------\n", " env_info : dict\n", " A dictionary containing information about the action and observation\n", " spaces of `env`.\n", " \"\"\"\n", " md_action, md_obs, tuple_action, tuple_obs = is_multidimensional(env)\n", " cont_action, cont_obs = is_continuous(env, tuple_action, tuple_obs)\n", "\n", " n_actions_per_dim, action_ids, action_dim = action_stats(\n", " env, md_action, cont_action,\n", " )\n", " n_obs_per_dim, obs_ids, obs_dim = obs_stats(env, md_obs, cont_obs)\n", "\n", " env_info = {\n", " \"id\": env.spec.id,\n", " \"seed\": env.spec.seed if \"seed\" in dir(env.spec) else None,\n", " \"deterministic\": bool(~env.spec.nondeterministic),\n", " \"tuple_actions\": tuple_action,\n", " \"tuple_observations\": tuple_obs,\n", " \"multidim_actions\": md_action,\n", " \"multidim_observations\": md_obs,\n", " \"continuous_actions\": cont_action,\n", " \"continuous_observations\": cont_obs,\n", " \"n_actions_per_dim\": n_actions_per_dim,\n", " \"action_dim\": action_dim,\n", " \"n_obs_per_dim\": n_obs_per_dim,\n", " \"obs_dim\": obs_dim,\n", " \"action_ids\": action_ids,\n", " \"obs_ids\": obs_ids,\n", " }\n", "\n", " return env_info\n"]} {"path": "numpy_ml/rl_models/tiles/__init__.py", "content": ["from . import tiles3\n"]} {"path": "numpy_ml/rl_models/tiles/tiles3.py", "content": ["\"\"\"\n", "Tile Coding Software version 3.0beta\n", "by Rich Sutton\n", "based on a program created by Steph Schaeffer and others\n", "External documentation and recommendations on the use of this code is available in the\n", "reinforcement learning textbook by Sutton and Barto, and on the web.\n", "These need to be understood before this code is.\n", "\n", "This software is for Python 3 or more.\n", "\n", "This is an implementation of grid-style tile codings, based originally on\n", "the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed.\n", "Here we provide a function, \"tiles\", that maps floating and integer\n", "variables to a list of tiles, and a second function \"tiles-wrap\" that does the same while\n", "wrapping some floats to provided widths (the lower wrap value is always 0).\n", "\n", "The float variables will be gridded at unit intervals, so generalization\n", "will be by approximately 1 in each direction, and any scaling will have\n", "to be done externally before calling tiles.\n", "\n", "Num-tilings should be a power of 2, e.g., 16. 
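# A small usage sketch of the tile coder below (import path as used in
# rl_utils.py above). Observations must be scaled externally so that a
# displacement of roughly 1.0 spans one tile.
from numpy_ml.rl_models.tiles.tiles3 import tiles, IHT

iht = IHT(1024)                        # index hash table with 1024 slots
active = tiles(iht, 8, [3.6, 7.21])    # 8 tilings -> 8 active tile indices
print(active)                          # e.g. [0, 1, 2, 3, 4, 5, 6, 7] on the first call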
To make the offsetting work properly, it should\n", "also be greater than or equal to four times the number of floats.\n", "\n", "The first argument is either an index hash table of a given size (created by (make-iht size)),\n", "an integer \"size\" (range of the indices from 0), or nil (for testing, indicating that the tile\n", "coordinates are to be returned without being converted to indices).\n", "\"\"\"\n", "\n", "from math import floor\n", "from itertools import zip_longest\n", "\n", "\n", "basehash = hash\n", "\n", "\n", "class IHT:\n", " \"Structure to handle collisions\"\n", "\n", " def __init__(self, sizeval):\n", " self.size = sizeval\n", " self.overfullCount = 0\n", " self.dictionary = {}\n", "\n", " def __str__(self):\n", " \"Prepares a string for printing whenever this object is printed\"\n", " return (\n", " \"Collision table:\"\n", " + \" size:\"\n", " + str(self.size)\n", " + \" overfullCount:\"\n", " + str(self.overfullCount)\n", " + \" dictionary:\"\n", " + str(len(self.dictionary))\n", " + \" items\"\n", " )\n", "\n", " def count(self):\n", " return len(self.dictionary)\n", "\n", " def fullp(self):\n", " return len(self.dictionary) >= self.size\n", "\n", " def getindex(self, obj, readonly=False):\n", " d = self.dictionary\n", " if obj in d:\n", " return d[obj]\n", " elif readonly:\n", " return None\n", " size = self.size\n", " count = self.count()\n", " if count >= size:\n", " if self.overfullCount == 0:\n", " print(\"IHT full, starting to allow collisions\")\n", " self.overfullCount += 1\n", " return basehash(obj) % self.size\n", " else:\n", " d[obj] = count\n", " return count\n", "\n", "\n", "def hashcoords(coordinates, m, readonly=False):\n", " if type(m) == IHT:\n", " return m.getindex(tuple(coordinates), readonly)\n", " if type(m) == int:\n", " return basehash(tuple(coordinates)) % m\n", " if m == None:\n", " return coordinates\n", "\n", "\n", "def tiles(ihtORsize, numtilings, floats, ints=[], readonly=False):\n", " \"\"\"returns num-tilings tile indices corresponding to the floats and ints\"\"\"\n", " qfloats = [floor(f * numtilings) for f in floats]\n", " Tiles = []\n", " for tiling in range(numtilings):\n", " tilingX2 = tiling * 2\n", " coords = [tiling]\n", " b = tiling\n", " for q in qfloats:\n", " coords.append((q + b) // numtilings)\n", " b += tilingX2\n", " coords.extend(ints)\n", " Tiles.append(hashcoords(coords, ihtORsize, readonly))\n", " return Tiles\n", "\n", "\n", "def tileswrap(ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False):\n", " \"\"\"returns num-tilings tile indices corresponding to the floats and ints,\n", " wrapping some floats\"\"\"\n", " qfloats = [floor(f * numtilings) for f in floats]\n", " Tiles = []\n", " for tiling in range(numtilings):\n", " tilingX2 = tiling * 2\n", " coords = [tiling]\n", " b = tiling\n", " for q, width in zip_longest(qfloats, wrapwidths):\n", " c = (q + b % numtilings) // numtilings\n", " coords.append(c % width if width else c)\n", " b += tilingX2\n", " coords.extend(ints)\n", " Tiles.append(hashcoords(coords, ihtORsize, readonly))\n", " return Tiles\n"]} {"path": "numpy_ml/preprocessing/dsp.py", "content": ["import numpy as np\n", "from numpy.lib.stride_tricks import as_strided\n", "\n", "from ..utils.windows import WindowInitializer\n", "\n", "#######################################################################\n", "# Signal Resampling #\n", "#######################################################################\n", "\n", "\n", "def batch_resample(X, new_dim, mode=\"bilinear\"):\n", " 
\"\"\"\n", " Resample each image (or similar grid-based 2D signal) in a batch to\n", " `new_dim` using the specified resampling strategy.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(n_ex, in_rows, in_cols, in_channels)`\n", " An input image volume\n", " new_dim : 2-tuple of `(out_rows, out_cols)`\n", " The dimension to resample each image to\n", " mode : {'bilinear', 'neighbor'}\n", " The resampling strategy to employ. Default is 'bilinear'.\n", "\n", " Returns\n", " -------\n", " resampled : :py:class:`ndarray ` of shape `(n_ex, out_rows, out_cols, in_channels)`\n", " The resampled image volume.\n", " \"\"\"\n", " if mode == \"bilinear\":\n", " interpolate = bilinear_interpolate\n", " elif mode == \"neighbor\":\n", " interpolate = nn_interpolate_2D\n", " else:\n", " raise NotImplementedError(\"Unrecognized resampling mode: {}\".format(mode))\n", "\n", " out_rows, out_cols = new_dim\n", " n_ex, in_rows, in_cols, n_in = X.shape\n", "\n", " # compute coordinates to resample\n", " x = np.tile(np.linspace(0, in_cols - 2, out_cols), out_rows)\n", " y = np.repeat(np.linspace(0, in_rows - 2, out_rows), out_cols)\n", "\n", " # resample each image\n", " resampled = []\n", " for i in range(n_ex):\n", " r = interpolate(X[i, ...], x, y)\n", " r = r.reshape(out_rows, out_cols, n_in)\n", " resampled.append(r)\n", " return np.dstack(resampled)\n", "\n", "\n", "def nn_interpolate_2D(X, x, y):\n", " \"\"\"\n", " Estimates of the pixel values at the coordinates (x, y) in `X` using a\n", " nearest neighbor interpolation strategy.\n", "\n", " Notes\n", " -----\n", " Assumes the current entries in `X` reflect equally-spaced samples from a 2D\n", " integer grid.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(in_rows, in_cols, in_channels)`\n", " An input image sampled along a grid of `in_rows` by `in_cols`.\n", " x : list of length `k`\n", " A list of x-coordinates for the samples we wish to generate\n", " y : list of length `k`\n", " A list of y-coordinates for the samples we wish to generate\n", "\n", " Returns\n", " -------\n", " samples : :py:class:`ndarray ` of shape `(k, in_channels)`\n", " The samples for each (x,y) coordinate computed via nearest neighbor\n", " interpolation\n", " \"\"\"\n", " nx, ny = np.around(x), np.around(y)\n", " nx = np.clip(nx, 0, X.shape[1] - 1).astype(int)\n", " ny = np.clip(ny, 0, X.shape[0] - 1).astype(int)\n", " return X[ny, nx, :]\n", "\n", "\n", "def nn_interpolate_1D(X, t):\n", " \"\"\"\n", " Estimates of the signal values at `X[t]` using a nearest neighbor\n", " interpolation strategy.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(in_length, in_channels)`\n", " An input image sampled along an integer `in_length`\n", " t : list of length `k`\n", " A list of coordinates for the samples we wish to generate\n", "\n", " Returns\n", " -------\n", " samples : :py:class:`ndarray ` of shape `(k, in_channels)`\n", " The samples for each (x,y) coordinate computed via nearest neighbor\n", " interpolation\n", " \"\"\"\n", " nt = np.clip(np.around(t), 0, X.shape[0] - 1).astype(int)\n", " return X[nt, :]\n", "\n", "\n", "def bilinear_interpolate(X, x, y):\n", " \"\"\"\n", " Estimates of the pixel values at the coordinates (x, y) in `X` via bilinear\n", " interpolation.\n", "\n", " Notes\n", " -----\n", " Assumes the current entries in X reflect equally-spaced\n", " samples from a 2D integer grid.\n", "\n", " Modified from https://bit.ly/2NMb1Dr\n", "\n", " Parameters\n", " 
----------\n", " X : :py:class:`ndarray ` of shape `(in_rows, in_cols, in_channels)`\n", " An input image sampled along a grid of `in_rows` by `in_cols`.\n", " x : list of length `k`\n", " A list of x-coordinates for the samples we wish to generate\n", " y : list of length `k`\n", " A list of y-coordinates for the samples we wish to generate\n", "\n", " Returns\n", " -------\n", " samples : list of length `(k, in_channels)`\n", " The samples for each (x,y) coordinate computed via bilinear\n", " interpolation\n", " \"\"\"\n", " x0 = np.floor(x).astype(int)\n", " y0 = np.floor(y).astype(int)\n", " x1 = x0 + 1\n", " y1 = y0 + 1\n", "\n", " x0 = np.clip(x0, 0, X.shape[1] - 1)\n", " y0 = np.clip(y0, 0, X.shape[0] - 1)\n", " x1 = np.clip(x1, 0, X.shape[1] - 1)\n", " y1 = np.clip(y1, 0, X.shape[0] - 1)\n", "\n", " Ia = X[y0, x0, :].T\n", " Ib = X[y1, x0, :].T\n", " Ic = X[y0, x1, :].T\n", " Id = X[y1, x1, :].T\n", "\n", " wa = (x1 - x) * (y1 - y)\n", " wb = (x1 - x) * (y - y0)\n", " wc = (x - x0) * (y1 - y)\n", " wd = (x - x0) * (y - y0)\n", "\n", " return (Ia * wa).T + (Ib * wb).T + (Ic * wc).T + (Id * wd).T\n", "\n", "\n", "#######################################################################\n", "# Fourier Decomposition #\n", "#######################################################################\n", "\n", "\n", "def DCT(frame, orthonormal=True):\n", " \"\"\"\n", " A naive :math:`O(N^2)` implementation of the 1D discrete cosine transform-II\n", " (DCT-II).\n", "\n", " Notes\n", " -----\n", " For a signal :math:`\\mathbf{x} = [x_1, \\ldots, x_N]` consisting of `N`\n", " samples, the `k` th DCT coefficient, :math:`c_k`, is\n", "\n", " .. math::\n", "\n", " c_k = 2 \\sum_{n=0}^{N-1} x_n \\cos(\\pi k (2 n + 1) / (2 N))\n", "\n", " where `k` ranges from :math:`0, \\ldots, N-1`.\n", "\n", " The DCT is highly similar to the DFT -- whereas in a DFT the basis\n", " functions are sinusoids, in a DCT they are restricted solely to cosines. A\n", " signal's DCT representation tends to have more of its energy concentrated\n", " in a smaller number of coefficients when compared to the DFT, and is thus\n", " commonly used for signal compression. [1]\n", "\n", " .. [1] Smoother signals can be accurately approximated using fewer DFT / DCT\n", " coefficients, resulting in a higher compression ratio. The DCT naturally\n", " yields a continuous extension at the signal boundaries due its use of\n", " even basis functions (cosine). This in turn produces a smoother\n", " extension in comparison to DFT or DCT approximations, resulting in a\n", " higher compression.\n", "\n", " Parameters\n", " ----------\n", " frame : :py:class:`ndarray ` of shape `(N,)`\n", " A signal frame consisting of N samples\n", " orthonormal : bool\n", " Scale to ensure the coefficient vector is orthonormal. 
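# A quick check of the energy-compaction claim above, assuming this module is
# importable as `numpy_ml.preprocessing.dsp`: for a smooth ramp signal, the
# leading DCT coefficients capture a larger fraction of the total energy than
# the leading DFT coefficients do.
import numpy as np
from numpy_ml.preprocessing.dsp import DCT, DFT

x = np.linspace(0.0, 1.0, 32)                       # a smooth, non-periodic signal
dct_energy = np.cumsum(DCT(x) ** 2)
dft_energy = np.cumsum(np.abs(DFT(x, positive_only=False)) ** 2)

print(dct_energy[1] / dct_energy[-1])               # close to 1: two DCT coefficients suffice
print(dft_energy[1] / dft_energy[-1])               # noticeably smaller for the DFT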
Default is True.\n", "\n", " Returns\n", " -------\n", " dct : :py:class:`ndarray ` of shape `(N,)`\n", " The discrete cosine transform of the samples in `frame`.\n", " \"\"\"\n", " N = len(frame)\n", " out = np.zeros_like(frame)\n", " for k in range(N):\n", " for (n, xn) in enumerate(frame):\n", " out[k] += xn * np.cos(np.pi * k * (2 * n + 1) / (2 * N))\n", " scale = np.sqrt(1 / (4 * N)) if k == 0 else np.sqrt(1 / (2 * N))\n", " out[k] *= 2 * scale if orthonormal else 2\n", " return out\n", "\n", "\n", "def __DCT2(frame):\n", " \"\"\"Currently broken\"\"\"\n", " N = len(frame) # window length\n", "\n", " k = np.arange(N, dtype=float)\n", " F = k.reshape(1, -1) * k.reshape(-1, 1)\n", " K = np.divide(F, k, out=np.zeros_like(F), where=F != 0)\n", "\n", " FC = np.cos(F * np.pi / N + K * np.pi / 2 * N)\n", " return 2 * (FC @ frame)\n", "\n", "\n", "def DFT(frame, positive_only=True):\n", " \"\"\"\n", " A naive :math:`O(N^2)` implementation of the 1D discrete Fourier transform (DFT).\n", "\n", " Notes\n", " -----\n", " The Fourier transform decomposes a signal into a linear combination of\n", " sinusoids (ie., basis elements in the space of continuous periodic\n", " functions). For a sequence :math:`\\mathbf{x} = [x_1, \\ldots, x_N]` of N\n", " evenly spaced samples, the `k` th DFT coefficient is given by:\n", "\n", " .. math::\n", "\n", " c_k = \\sum_{n=0}^{N-1} x_n \\exp(-2 \\pi i k n / N)\n", "\n", " where `i` is the imaginary unit, `k` is an index ranging from `0, ..., N-1`,\n", " and :math:`X_k` is the complex coefficient representing the phase\n", " (imaginary part) and amplitude (real part) of the `k` th sinusoid in the\n", " DFT spectrum. The frequency of the `k` th sinusoid is :math:`(k 2 \\pi / N)`\n", " radians per sample.\n", "\n", " When applied to a real-valued input, the negative frequency terms are the\n", " complex conjugates of the positive-frequency terms and the overall spectrum\n", " is symmetric (excluding the first index, which contains the zero-frequency\n", " / intercept term).\n", "\n", " Parameters\n", " ----------\n", " frame : :py:class:`ndarray ` of shape `(N,)`\n", " A signal frame consisting of N samples\n", " positive_only : bool\n", " Whether to only return the coefficients for the positive frequency\n", " terms. Default is True.\n", "\n", " Returns\n", " -------\n", " spectrum : :py:class:`ndarray ` of shape `(N,)` or `(N // 2 + 1,)` if `real_only`\n", " The coefficients of the frequency spectrum for `frame`, including\n", " imaginary components.\n", " \"\"\"\n", " N = len(frame) # window length\n", "\n", " # F[i,j] = coefficient for basis vector i, timestep j (i.e., k * n)\n", " F = np.arange(N).reshape(1, -1) * np.arange(N).reshape(-1, 1)\n", " F = np.exp(F * (-1j * 2 * np.pi / N))\n", "\n", " # vdot only operates on vectors (rather than ndarrays), so we have to\n", " # loop over each basis vector in F explicitly\n", " spectrum = np.array([np.vdot(f, frame) for f in F])\n", " return spectrum[: (N // 2) + 1] if positive_only else spectrum\n", "\n", "\n", "def dft_bins(N, fs=44000, positive_only=True):\n", " \"\"\"\n", " Calc the frequency bin centers for a DFT with `N` coefficients.\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The number of frequency bins in the DFT\n", " fs : int\n", " The sample rate/frequency of the signal (in Hz). Default is 44000.\n", " positive_only : bool\n", " Whether to only return the bins for the positive frequency\n", " terms. 
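# A sanity check of the naive DFT in this module against NumPy's FFT, assuming
# the module is importable as `numpy_ml.preprocessing.dsp`. Only magnitudes are
# compared, since phase-sign conventions can differ between implementations.
import numpy as np
from numpy_ml.preprocessing.dsp import DFT

x = np.random.rand(64)
full = DFT(x, positive_only=False)
pos = DFT(x)                                   # N // 2 + 1 positive-frequency terms

assert np.allclose(np.abs(full), np.abs(np.fft.fft(x)))
assert np.allclose(np.abs(pos), np.abs(np.fft.rfft(x)))
print(pos.shape)                               # (33,)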
Default is True.\n", "\n", " Returns\n", " -------\n", " bins : :py:class:`ndarray ` of shape `(N,)` or `(N // 2 + 1,)` if `positive_only`\n", " The frequency bin centers associated with each coefficient in the\n", " DFT spectrum\n", " \"\"\"\n", " if positive_only:\n", " freq_bins = np.linspace(0, fs / 2, 1 + N // 2, endpoint=True)\n", " else:\n", " l, r = (1 + (N - 1) / 2, (1 - N) / 2) if N % 2 else (N / 2, -N / 2)\n", " freq_bins = np.r_[np.arange(l), np.arange(r, 0)] * fs / N\n", " return freq_bins\n", "\n", "\n", "def magnitude_spectrum(frames):\n", " \"\"\"\n", " Compute the magnitude spectrum (i.e., absolute value of the DFT spectrum)\n", " for each frame in `frames`. Assumes each frame is real-valued only.\n", "\n", " Parameters\n", " ----------\n", " frames : :py:class:`ndarray ` of shape `(M, N)`\n", " A sequence of `M` frames each consisting of `N` samples\n", "\n", " Returns\n", " -------\n", " magnitude_spec : :py:class:`ndarray ` of shape `(M, N // 2 + 1)`\n", " The magnitude spectrum for each frame in `frames`. Only includes the\n", " coefficients for the positive spectrum frequencies.\n", " \"\"\"\n", " return np.vstack([np.abs(DFT(frame, positive_only=True)) for frame in frames])\n", "\n", "\n", "def power_spectrum(frames, scale=False):\n", " \"\"\"\n", " Compute the power spectrum for a signal represented as a collection of\n", " frames. Assumes each frame is real-valued only.\n", "\n", " The power spectrum is simply the square of the magnitude spectrum, possibly\n", " scaled by the number of FFT bins. It measures how the energy of the signal\n", " is distributed over the frequency domain.\n", "\n", " Parameters\n", " ----------\n", " frames : :py:class:`ndarray ` of shape `(M, N)`\n", " A sequence of `M` frames each consisting of `N` samples\n", " scale : bool\n", " Whether the scale by the number of DFT bins. Default is False.\n", "\n", " Returns\n", " -------\n", " power_spec : :py:class:`ndarray ` of shape `(M, N // 2 + 1)`\n", " The power spectrum for each frame in `frames`. Only includes the\n", " coefficients for the positive spectrum frequencies.\n", " \"\"\"\n", " scaler = frames.shape[1] // 2 + 1 if scale else 1\n", " return (1 / scaler) * magnitude_spectrum(frames) ** 2\n", "\n", "\n", "#######################################################################\n", "# Preprocessing Utils #\n", "#######################################################################\n", "\n", "\n", "def to_frames(x, frame_width, stride, writeable=False):\n", " \"\"\"\n", " Convert a 1D signal x into overlapping windows of width `frame_width` using\n", " a hop length of `stride`.\n", "\n", " Notes\n", " -----\n", " If ``(len(x) - frame_width) % stride != 0`` then some number of the samples\n", " in x will be dropped. Specifically::\n", "\n", " n_dropped_frames = len(x) - frame_width - stride * (n_frames - 1)\n", "\n", " where::\n", "\n", " n_frames = (len(x) - frame_width) // stride + 1\n", "\n", " This method uses low-level stride manipulation to avoid creating an\n", " additional copy of `x`. 
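# A worked instance of the frame-count bookkeeping described above: with 1000
# samples, 256-sample frames, and a hop of 128, the final 104 samples are
# dropped.
N, frame_width, stride = 1000, 256, 128

n_frames = (N - frame_width) // stride + 1
n_dropped = N - frame_width - stride * (n_frames - 1)
print(n_frames, n_dropped)   # 6 104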
The downside is that if ``writeable`=True``,\n", " modifying the `frame` output can result in unexpected behavior:\n", "\n", " >>> out = to_frames(np.arange(6), 5, 1)\n", " >>> out\n", " array([[0, 1, 2, 3, 4],\n", " [1, 2, 3, 4, 5]])\n", " >>> out[0, 1] = 99\n", " >>> out\n", " array([[ 0, 99, 2, 3, 4],\n", " [99, 2, 3, 4, 5]])\n", "\n", " Parameters\n", " ----------\n", " x : :py:class:`ndarray ` of shape `(N,)`\n", " A 1D signal consisting of N samples\n", " frame_width : int\n", " The width of a single frame window in samples\n", " stride : int\n", " The hop size / number of samples advanced between consecutive frames\n", " writeable : bool\n", " If set to False, the returned array will be readonly. Otherwise it will\n", " be writable if `x` was. It is advisable to set this to False whenever\n", " possible to avoid unexpected behavior (see NB 2 above). Default is False.\n", "\n", " Returns\n", " -------\n", " frame: :py:class:`ndarray ` of shape `(n_frames, frame_width)`\n", " The collection of overlapping frames stacked into a matrix\n", " \"\"\"\n", " assert x.ndim == 1\n", " assert stride >= 1\n", " assert len(x) >= frame_width\n", "\n", " # get the size for an element in x in bits\n", " byte = x.itemsize\n", " n_frames = (len(x) - frame_width) // stride + 1\n", " return as_strided(\n", " x,\n", " shape=(n_frames, frame_width),\n", " strides=(byte * stride, byte),\n", " writeable=writeable,\n", " )\n", "\n", "\n", "def autocorrelate1D(x):\n", " \"\"\"\n", " Autocorrelate a 1D signal `x` with itself.\n", "\n", " Notes\n", " -----\n", " The `k` th term in the 1 dimensional autocorrelation is\n", "\n", " .. math::\n", "\n", " a_k = \\sum_n x_{n + k} x_n\n", "\n", " NB. This is a naive :math:`O(N^2)` implementation. For a faster :math:`O(N\n", " \\log N)` approach using the FFT, see [1].\n", "\n", " References\n", " ----------\n", " .. [1] https://en.wikipedia.org/wiki/Autocorrelation#Efficient%computation\n", "\n", " Parameters\n", " ----------\n", " x : :py:class:`ndarray ` of shape `(N,)`\n", " A 1D signal consisting of N samples\n", "\n", " Returns\n", " -------\n", " auto : :py:class:`ndarray ` of shape `(N,)`\n", " The autocorrelation of `x` with itself\n", " \"\"\"\n", " N = len(x)\n", " auto = np.zeros(N)\n", " for k in range(N):\n", " for n in range(N - k):\n", " auto[k] += x[n + k] * x[n]\n", " return auto\n", "\n", "\n", "#######################################################################\n", "# Filters #\n", "#######################################################################\n", "\n", "\n", "def preemphasis(x, alpha):\n", " \"\"\"\n", " Increase the amplitude of high frequency bands + decrease the amplitude of\n", " lower bands.\n", "\n", " Notes\n", " -----\n", " Preemphasis filtering is (was?) a common transform in speech processing,\n", " where higher frequencies tend to be more useful during signal\n", " disambiguation.\n", "\n", " .. math::\n", "\n", " \\\\text{preemphasis}( x_t ) = x_t - \\\\alpha x_{t-1}\n", "\n", " Parameters\n", " ----------\n", " x : :py:class:`ndarray ` of shape `(N,)`\n", " A 1D signal consisting of `N` samples\n", " alpha : float in [0, 1)\n", " The preemphasis coefficient. 
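# Two quick numeric checks for the utilities defined in this file, assuming the
# module is importable as `numpy_ml.preprocessing.dsp`.
import numpy as np
from numpy_ml.preprocessing.dsp import autocorrelate1D, preemphasis

# preemphasis: each sample has alpha times its predecessor subtracted from it
x = np.array([1.0, 2.0, 3.0, 4.0])
print(preemphasis(x, alpha=0.95))        # [1.   1.05 1.1  1.15]

# the naive O(N^2) autocorrelation matches the non-negative lags of the full
# cross-correlation of a signal with itself
y = np.random.rand(100)
assert np.allclose(autocorrelate1D(y), np.correlate(y, y, mode="full")[len(y) - 1:])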
A value of 0 corresponds to no\n", " filtering\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(N,)`\n", " The filtered signal\n", " \"\"\"\n", " return np.concatenate([x[:1], x[1:] - alpha * x[:-1]])\n", "\n", "\n", "def cepstral_lifter(mfccs, D):\n", " \"\"\"\n", " A simple sinusoidal filter applied in the Mel-frequency domain.\n", "\n", " Notes\n", " -----\n", " Cepstral lifting helps to smooth the spectral envelope and dampen the\n", " magnitude of the higher MFCC coefficients while keeping the other\n", " coefficients unchanged. The filter function is:\n", "\n", " .. math::\n", "\n", " \\\\text{lifter}( x_n ) = x_n \\left(1 + \\\\frac{D \\sin(\\pi n / D)}{2}\\\\right)\n", "\n", " Parameters\n", " ----------\n", " mfccs : :py:class:`ndarray ` of shape `(G, C)`\n", " Matrix of Mel cepstral coefficients. Rows correspond to frames, columns\n", " to cepstral coefficients\n", " D : int in :math:`[0, +\\infty]`\n", " The filter coefficient. 0 corresponds to no filtering, larger values\n", " correspond to greater amounts of smoothing\n", "\n", " Returns\n", " -------\n", " out : :py:class:`ndarray ` of shape `(G, C)`\n", " The lifter'd MFCC coefficients\n", " \"\"\"\n", " if D == 0:\n", " return mfccs\n", " n = np.arange(mfccs.shape[1])\n", " return mfccs * (1 + (D / 2) * np.sin(np.pi * n / D))\n", "\n", "\n", "def mel_spectrogram(\n", " x,\n", " window_duration=0.025,\n", " stride_duration=0.01,\n", " mean_normalize=True,\n", " window=\"hamming\",\n", " n_filters=20,\n", " center=True,\n", " alpha=0.95,\n", " fs=44000,\n", "):\n", " \"\"\"\n", " Apply the Mel-filterbank to the power spectrum for a signal `x`.\n", "\n", " Notes\n", " -----\n", " The Mel spectrogram is the projection of the power spectrum of the framed\n", " and windowed signal onto the basis set provided by the Mel filterbank.\n", "\n", " Parameters\n", " ----------\n", " x : :py:class:`ndarray ` of shape `(N,)`\n", " A 1D signal consisting of N samples\n", " window_duration : float\n", " The duration of each frame / window (in seconds). Default is 0.025.\n", " stride_duration : float\n", " The duration of the hop between consecutive windows (in seconds).\n", " Default is 0.01.\n", " mean_normalize : bool\n", " Whether to subtract the coefficient means from the final filter values\n", " to improve the signal-to-noise ratio. Default is True.\n", " window : {'hamming', 'hann', 'blackman_harris'}\n", " The windowing function to apply to the signal before FFT. Default is\n", " 'hamming'.\n", " n_filters : int\n", " The number of mel filters to include in the filterbank. Default is 20.\n", " center : bool\n", " Whether to the `k` th frame of the signal should *begin* at index ``x[k *\n", " stride_len]`` (center = False) or be *centered* at ``x[k * stride_len]``\n", " (center = True). Default is False.\n", " alpha : float in [0, 1)\n", " The coefficient for the preemphasis filter. A value of 0 corresponds to\n", " no filtering. Default is 0.95.\n", " fs : int\n", " The sample rate/frequency for the signal. Default is 44000.\n", "\n", " Returns\n", " -------\n", " filter_energies : :py:class:`ndarray ` of shape `(G, n_filters)`\n", " The (possibly mean_normalized) power for each filter in the Mel\n", " filterbank (i.e., the Mel spectrogram). 
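# The cepstral lifter defined earlier in this file rescales the n-th cepstral
# column by a fixed sinusoidal weight, 1 + (D / 2) * sin(pi * n / D). A small
# illustration for the default D = 22 and 13 coefficients.
import numpy as np

D, n = 22, np.arange(13)
weights = 1 + (D / 2) * np.sin(np.pi * n / D)
print(weights[0], weights[11])   # 1.0 (n = 0 left unchanged), 12.0 (peak at n = D / 2)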
Rows correspond to frames,\n", " columns to filters\n", " energy_per_frame : :py:class:`ndarray ` of shape `(G,)`\n", " The total energy in each frame of the signal\n", " \"\"\"\n", " eps = np.finfo(float).eps\n", " window_fn = WindowInitializer()(window)\n", "\n", " stride = round(stride_duration * fs)\n", " frame_width = round(window_duration * fs)\n", " N = frame_width\n", "\n", " # add a preemphasis filter to the raw signal\n", " x = preemphasis(x, alpha)\n", "\n", " # convert signal to overlapping frames and apply a window function\n", " x = np.pad(x, N // 2, \"reflect\") if center else x\n", " frames = to_frames(x, frame_width, stride, fs)\n", "\n", " window = np.tile(window_fn(frame_width), (frames.shape[0], 1))\n", " frames = frames * window\n", "\n", " # compute the power spectrum\n", " power_spec = power_spectrum(frames)\n", " energy_per_frame = np.sum(power_spec, axis=1)\n", " energy_per_frame[energy_per_frame == 0] = eps\n", "\n", " # compute the power at each filter in the Mel filterbank\n", " fbank = mel_filterbank(N, n_filters=n_filters, fs=fs)\n", " filter_energies = power_spec @ fbank.T\n", " filter_energies -= np.mean(filter_energies, axis=0) if mean_normalize else 0\n", " filter_energies[filter_energies == 0] = eps\n", " return filter_energies, energy_per_frame\n", "\n", "\n", "#######################################################################\n", "# Mel-Frequency Features #\n", "#######################################################################\n", "\n", "\n", "def mfcc(\n", " x,\n", " fs=44000,\n", " n_mfccs=13,\n", " alpha=0.95,\n", " center=True,\n", " n_filters=20,\n", " window=\"hann\",\n", " normalize=True,\n", " lifter_coef=22,\n", " stride_duration=0.01,\n", " window_duration=0.025,\n", " replace_intercept=True,\n", "):\n", " \"\"\"\n", " Compute the Mel-frequency cepstral coefficients (MFCC) for a signal.\n", "\n", " Notes\n", " -----\n", " Computing MFCC features proceeds in the following stages:\n", "\n", " 1. Convert the signal into overlapping frames and apply a window fn\n", " 2. Compute the power spectrum at each frame\n", " 3. Apply the mel filterbank to the power spectra to get mel filterbank powers\n", " 4. Take the logarithm of the mel filterbank powers at each frame\n", " 5. Take the discrete cosine transform (DCT) of the log filterbank\n", " energies and retain only the first k coefficients to further reduce\n", " the dimensionality\n", "\n", " MFCCs were developed in the context of HMM-GMM automatic speech recognition\n", " (ASR) systems and can be used to provide a somewhat speaker/pitch\n", " invariant representation of phonemes.\n", "\n", " Parameters\n", " ----------\n", " x : :py:class:`ndarray ` of shape `(N,)`\n", " A 1D signal consisting of N samples\n", " fs : int\n", " The sample rate/frequency for the signal. Default is 44000.\n", " n_mfccs : int\n", " The number of cepstral coefficients to return (including the intercept\n", " coefficient). Default is 13.\n", " alpha : float in [0, 1)\n", " The preemphasis coefficient. A value of 0 corresponds to no\n", " filtering. Default is 0.95.\n", " center : bool\n", " Whether to the kth frame of the signal should *begin* at index ``x[k *\n", " stride_len]`` (center = False) or be *centered* at ``x[k * stride_len]``\n", " (center = True). Default is True.\n", " n_filters : int\n", " The number of filters to include in the Mel filterbank. Default is 20.\n", " normalize : bool\n", " Whether to mean-normalize the MFCC values. 
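# A minimal end-to-end usage sketch of the MFCC pipeline described above,
# assuming this module is importable as `numpy_ml.preprocessing.dsp`. The
# synthetic 440 Hz tone and 8 kHz sample rate are illustrative choices only.
import numpy as np
from numpy_ml.preprocessing.dsp import mfcc

fs = 8000
t = np.arange(fs) / fs                   # one second of samples
x = np.sin(2 * np.pi * 440 * t)          # a 440 Hz test tone

coeffs = mfcc(x, fs=fs, n_mfccs=13)
print(coeffs.shape)                      # (n_frames, n_mfccs) == (101, 13)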
Default is True.\n", " lifter_coef : int in :math:[0, + \\infty]`\n", " The cepstral filter coefficient. 0 corresponds to no filtering, larger\n", " values correspond to greater amounts of smoothing. Default is 22.\n", " window : {'hamming', 'hann', 'blackman_harris'}\n", " The windowing function to apply to the signal before taking the DFT.\n", " Default is 'hann'.\n", " stride_duration : float\n", " The duration of the hop between consecutive windows (in seconds).\n", " Default is 0.01.\n", " window_duration : float\n", " The duration of each frame / window (in seconds). Default is 0.025.\n", " replace_intercept : bool\n", " Replace the first MFCC coefficient (the intercept term) with the\n", " log of the total frame energy instead. Default is True.\n", "\n", " Returns\n", " -------\n", " mfccs : :py:class:`ndarray ` of shape `(G, C)`\n", " Matrix of Mel-frequency cepstral coefficients. Rows correspond to\n", " frames, columns to cepstral coefficients\n", " \"\"\"\n", " # map the power spectrum for the (framed + windowed representation of) `x`\n", " # onto the mel scale\n", " filter_energies, frame_energies = mel_spectrogram(\n", " x=x,\n", " fs=fs,\n", " alpha=alpha,\n", " center=center,\n", " window=window,\n", " n_filters=n_filters,\n", " mean_normalize=False,\n", " window_duration=window_duration,\n", " stride_duration=stride_duration,\n", " )\n", "\n", " log_energies = 10 * np.log10(filter_energies)\n", "\n", " # perform a DCT on the log-mel coefficients to further reduce the data\n", " # dimensionality -- the early DCT coefficients will capture the majority of\n", " # the data, allowing us to discard coefficients > n_mfccs\n", " mfccs = np.array([DCT(frame) for frame in log_energies])[:, :n_mfccs]\n", "\n", " mfccs = cepstral_lifter(mfccs, D=lifter_coef)\n", " mfccs -= np.mean(mfccs, axis=0) if normalize else 0\n", "\n", " if replace_intercept:\n", " # the 0th MFCC coefficient doesn't tell us anything about the spectrum;\n", " # replace it with the log of the frame energy for something more\n", " # informative\n", " mfccs[:, 0] = np.log(frame_energies)\n", " return mfccs\n", "\n", "\n", "def mel2hz(mel, formula=\"htk\"):\n", " \"\"\"\n", " Convert the mel-scale representation of a signal into Hz\n", "\n", " Parameters\n", " ----------\n", " mel : :py:class:`ndarray ` of shape `(N, \\*)`\n", " An array of mel frequencies to convert\n", " formula : {\"htk\", \"slaney\"}\n", " The Mel formula to use. \"htk\" uses the formula used by the Hidden\n", " Markov Model Toolkit, and described in O'Shaughnessy (1987). \"slaney\"\n", " uses the formula used in the MATLAB auditory toolbox (Slaney, 1998).\n", " Default is 'htk'\n", "\n", " Returns\n", " -------\n", " hz : :py:class:`ndarray ` of shape `(N, \\*)`\n", " The frequencies of the items in `mel`, in Hz\n", " \"\"\"\n", " fstr = \"formula must be either 'htk' or 'slaney' but got '{}'\"\n", " assert formula in [\"htk\", \"slaney\"], fstr.format(formula)\n", " if formula == \"htk\":\n", " return 700 * (10 ** (mel / 2595) - 1)\n", " raise NotImplementedError(\"slaney\")\n", "\n", "\n", "def hz2mel(hz, formula=\"htk\"):\n", " \"\"\"\n", " Convert the frequency representaiton of a signal in Hz into the mel scale.\n", "\n", " Parameters\n", " ----------\n", " hz : :py:class:`ndarray ` of shape `(N, \\*)`\n", " The frequencies of the items in `mel`, in Hz\n", " formula : {\"htk\", \"slaney\"}\n", " The Mel formula to use. \"htk\" uses the formula used by the Hidden\n", " Markov Model Toolkit, and described in O'Shaughnessy (1987). 
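# A quick round-trip check of the HTK mel conversions defined here: `hz2mel`
# and `mel2hz` should invert one another (import path assumed as above).
import numpy as np
from numpy_ml.preprocessing.dsp import hz2mel, mel2hz

hz = np.array([0.0, 440.0, 1000.0, 8000.0])
assert np.allclose(mel2hz(hz2mel(hz)), hz)
print(np.round(hz2mel(hz), 1))   # roughly [0., 549.6, 1000., 2840.]; 1 kHz maps to ~1000 mels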
\"slaney\"\n", " uses the formula used in the MATLAB auditory toolbox (Slaney, 1998).\n", " Default is 'htk'.\n", "\n", " Returns\n", " -------\n", " mel : :py:class:`ndarray ` of shape `(N, \\*)`\n", " An array of mel frequencies to convert.\n", " \"\"\"\n", " fstr = \"formula must be either 'htk' or 'slaney' but got '{}'\"\n", " assert formula in [\"htk\", \"slaney\"], fstr.format(formula)\n", "\n", " if formula == \"htk\":\n", " return 2595 * np.log10(1 + hz / 700)\n", " raise NotImplementedError(\"slaney\")\n", "\n", "\n", "def mel_filterbank(\n", " N, n_filters=20, fs=44000, min_freq=0, max_freq=None, normalize=True\n", "):\n", " \"\"\"\n", " Compute the filters in a Mel filterbank and return the corresponding\n", " transformation matrix\n", "\n", " Notes\n", " -----\n", " The Mel scale is a perceptual scale designed to simulate the way the human\n", " ear works. Pitches judged by listeners to be equal in perceptual /\n", " psychological distance have equal distance on the Mel scale. Practically,\n", " this corresponds to a scale with higher resolution at low frequencies and\n", " lower resolution at higher (> 500 Hz) frequencies.\n", "\n", " Each filter in the Mel filterbank is triangular with a response of 1 at its\n", " center and a linear decay on both sides until it reaches the center\n", " frequency of the next adjacent filter.\n", "\n", " This implementation is based on code in the (superb) LibROSA package [1].\n", "\n", " References\n", " ----------\n", " .. [1] McFee et al. (2015). \"librosa: Audio and music signal analysis in\n", " Python\", *Proceedings of the 14th Python in Science Conference*\n", " https://librosa.github.io\n", "\n", " Parameters\n", " ----------\n", " N : int\n", " The number of DFT bins\n", " n_filters : int\n", " The number of mel filters to include in the filterbank. Default is 20.\n", " min_freq : int\n", " Minimum filter frequency (in Hz). Default is 0.\n", " max_freq : int\n", " Maximum filter frequency (in Hz). Default is 0.\n", " fs : int\n", " The sample rate/frequency for the signal. Default is 44000.\n", " normalize : bool\n", " If True, scale the Mel filter weights by their area in Mel space.\n", " Default is True.\n", "\n", " Returns\n", " -------\n", " fbank : :py:class:`ndarray ` of shape `(n_filters, N // 2 + 1)`\n", " The mel-filterbank transformation matrix. Rows correspond to filters,\n", " columns to DFT bins.\n", " \"\"\"\n", " max_freq = fs / 2 if max_freq is None else max_freq\n", " min_mel, max_mel = hz2mel(min_freq), hz2mel(max_freq)\n", "\n", " fbank = np.zeros((n_filters, N // 2 + 1))\n", "\n", " # uniformly spaced values on the mel scale, translated back into Hz\n", " mel_bins = mel2hz(np.linspace(min_mel, max_mel, n_filters + 2))\n", "\n", " # the centers of the frequency bins for the DFT\n", " hz_bins = dft_bins(N, fs)\n", "\n", " mel_spacing = np.diff(mel_bins)\n", "\n", " # ramps[i] = mel_bins[i] - hz_bins\n", " ramps = mel_bins.reshape(-1, 1) - hz_bins.reshape(1, -1)\n", " for i in range(n_filters):\n", " # calc the filter values on the left and right across the bins ...\n", " left = -ramps[i] / mel_spacing[i]\n", " right = ramps[i + 2] / mel_spacing[i + 1]\n", "\n", " # .. and set them zero when they cross the x-axis\n", " fbank[i] = np.maximum(0, np.minimum(left, right))\n", "\n", " if normalize:\n", " energy_norm = 2.0 / (mel_bins[2 : n_filters + 2] - mel_bins[:n_filters])\n", " fbank *= energy_norm[:, np.newaxis]\n", "\n", " return fbank\n"]} {"path": "numpy_ml/preprocessing/__init__.py", "content": ["from . 
import general\n", "from . import nlp\n", "from . import dsp\n"]} {"path": "numpy_ml/preprocessing/nlp.py", "content": ["\"\"\"Common preprocessing utilities for working with text data\"\"\"\n", "import re\n", "import heapq\n", "import os.path as op\n", "from collections import Counter, OrderedDict, defaultdict\n", "\n", "import numpy as np\n", "\n", "\n", "# This list of English stop words is taken from the \"Glasgow Information\n", "# Retrieval Group\". The original list can be found at\n", "# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words\n", "_STOP_WORDS = set(\n", " (\n", " \"a about above across after afterwards again against all almost alone \"\n", " \"along already also although always am among amongst amoungst amount an \"\n", " \"and another any anyhow anyone anything anyway anywhere are around as at \"\n", " \"back be became because become becomes becoming been before beforehand \"\n", " \"behind being below beside besides between beyond bill both bottom but by \"\n", " \"call can cannot cant co con could couldnt cry de describe detail do done \"\n", " \"down due during each eg eight either eleven else elsewhere empty enough \"\n", " \"etc even ever every everyone everything everywhere except few fifteen \"\n", " \"fifty fill find fire first five for former formerly forty found four from \"\n", " \"front full further get give go had has hasnt have he hence her here \"\n", " \"hereafter hereby herein hereupon hers herself him himself his how however \"\n", " \"hundred i ie if in inc indeed interest into is it its itself keep last \"\n", " \"latter latterly least less ltd made many may me meanwhile might mill mine \"\n", " \"more moreover most mostly move much must my myself name namely neither \"\n", " \"never nevertheless next nine no nobody none noone nor not nothing now \"\n", " \"nowhere of off often on once one only onto or other others otherwise our \"\n", " \"ours ourselves out over own part per perhaps please put rather re same see \"\n", " \"seem seemed seeming seems serious several she should show side since \"\n", " \"sincere six sixty so some somehow someone something sometime sometimes \"\n", " \"somewhere still such system take ten than that the their them themselves \"\n", " \"then thence there thereafter thereby therefore therein thereupon these \"\n", " \"they thick thin third this those though three through throughout thru thus \"\n", " \"to together too top toward towards twelve twenty two un under until up \"\n", " \"upon us very via was we well were what whatever when whence whenever where \"\n", " \"whereafter whereas whereby wherein whereupon wherever whether which while \"\n", " \"whither who whoever whole whom whose why will with within without would \"\n", " \"yet you your yours yourself yourselves\"\n", " ).split(\" \"),\n", ")\n", "\n", "_WORD_REGEX = re.compile(r\"(?u)\\b\\w\\w+\\b\") # sklearn default\n", "_WORD_REGEX_W_PUNC = re.compile(r\"(?u)\\w+|[^a-zA-Z0-9\\s]\")\n", "_WORD_REGEX_W_PUNC_AND_WHITESPACE = re.compile(r\"(?u)s?\\w+\\s?|\\s?[^a-zA-Z0-9\\s]\\s?\")\n", "\n", "_PUNC_BYTE_REGEX = re.compile(\n", " r\"(33|34|35|36|37|38|39|40|41|42|43|44|45|\"\n", " r\"46|47|58|59|60|61|62|63|64|91|92|93|94|\"\n", " r\"95|96|123|124|125|126)\",\n", ")\n", "_PUNCTUATION = \"!\\\"#$%&'()*+,-./:;<=>?@[\\\\]^_`{|}~\"\n", "_PUNC_TABLE = str.maketrans(\"\", \"\", _PUNCTUATION)\n", "\n", "\n", "def ngrams(sequence, N):\n", " \"\"\"Return all `N`-grams of the elements in `sequence`\"\"\"\n", " assert N >= 1\n", " return list(zip(*[sequence[i:] for 
i in range(N)]))\n", "\n", "\n", "def tokenize_whitespace(\n", "    line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs,\n", "):\n", "    \"\"\"\n", "    Split a string at any whitespace characters, optionally removing\n", "    punctuation and stop-words in the process.\n", "    \"\"\"\n", "    line = line.lower() if lowercase else line\n", "    words = line.split()\n", "    words = [strip_punctuation(w) for w in words] if filter_punctuation else words\n", "    return remove_stop_words(words) if filter_stopwords else words\n", "\n", "\n", "def tokenize_words(\n", "    line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs,\n", "):\n", "    \"\"\"\n", "    Split a string into individual words, optionally removing punctuation and\n", "    stop-words in the process.\n", "    \"\"\"\n", "    REGEX = _WORD_REGEX if filter_punctuation else _WORD_REGEX_W_PUNC\n", "    words = REGEX.findall(line.lower() if lowercase else line)\n", "    return remove_stop_words(words) if filter_stopwords else words\n", "\n", "\n", "def tokenize_words_bytes(\n", "    line,\n", "    lowercase=True,\n", "    filter_stopwords=True,\n", "    filter_punctuation=True,\n", "    encoding=\"utf-8\",\n", "    **kwargs,\n", "):\n", "    \"\"\"\n", "    Split a string into individual words, optionally removing punctuation and\n", "    stop-words in the process. Translate each word into a list of bytes.\n", "    \"\"\"\n", "    words = tokenize_words(\n", "        line,\n", "        lowercase=lowercase,\n", "        filter_stopwords=filter_stopwords,\n", "        filter_punctuation=filter_punctuation,\n", "        **kwargs,\n", "    )\n", "    words = [\" \".join([str(i) for i in w.encode(encoding)]) for w in words]\n", "    return words\n", "\n", "\n", "def tokenize_bytes_raw(line, encoding=\"utf-8\", splitter=None, **kwargs):\n", "    \"\"\"\n", "    Convert the characters in `line` to a collection of bytes. Each byte is\n", "    represented in decimal as an integer between 0 and 255.\n", "\n", "    Parameters\n", "    ----------\n", "    line : str\n", "        The string to tokenize.\n", "    encoding : str\n", "        The encoding scheme for the characters in `line`. Default is `'utf-8'`.\n", "    splitter : {'punctuation', None}\n", "        If `'punctuation'`, split the string at any punctuation character\n", "        before encoding into bytes. If None, do not split `line` at all.\n", "        Default is None.\n", "\n", "    Returns\n", "    -------\n", "    bytes : list\n", "        A list of the byte-encoded characters in `line`. 
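A usage sketch for the word-level tokenizers and `ngrams`, assuming `numpy_ml` is installed so the module path shown in this file is importable; the example sentence is an illustrative assumption and the exact output depends on the stop-word list defined earlier.

```python
from numpy_ml.preprocessing.nlp import ngrams, tokenize_words

line = "The quick brown fox -- it jumped over 2 lazy dogs!"

words = tokenize_words(line, lowercase=True, filter_stopwords=True)
print(words)             # punctuation, single-character tokens, and stop words removed
print(ngrams(words, 2))  # adjacent word pairs, e.g. ('quick', 'brown'), ...
```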
Each item in the list\n", " is a string of space-separated integers between 0 and 255 representing\n", " the bytes encoding the characters in `line`.\n", " \"\"\"\n", " byte_str = [\" \".join([str(i) for i in line.encode(encoding)])]\n", " if splitter == \"punctuation\":\n", " byte_str = _PUNC_BYTE_REGEX.sub(r\"-\\1-\", byte_str[0]).split(\"-\")\n", " return byte_str\n", "\n", "\n", "def bytes_to_chars(byte_list, encoding=\"utf-8\"):\n", " \"\"\"\n", " Decode bytes (represented as an integer between 0 and 255) to characters in\n", " the specified encoding.\n", " \"\"\"\n", " hex_array = [hex(a).replace(\"0x\", \"\") for a in byte_list]\n", " hex_array = \" \".join([h if len(h) > 1 else f\"0{h}\" for h in hex_array])\n", " return bytearray.fromhex(hex_array).decode(encoding)\n", "\n", "\n", "def tokenize_chars(line, lowercase=True, filter_punctuation=True, **kwargs):\n", " \"\"\"\n", " Split a string into individual characters, optionally removing punctuation\n", " and stop-words in the process.\n", " \"\"\"\n", " line = line.lower() if lowercase else line\n", " line = strip_punctuation(line) if filter_punctuation else line\n", " chars = list(re.sub(\" {2,}\", \" \", line).strip())\n", " return chars\n", "\n", "\n", "def remove_stop_words(words):\n", " \"\"\"Remove stop words from a list of word strings\"\"\"\n", " return [w for w in words if w.lower() not in _STOP_WORDS]\n", "\n", "\n", "def strip_punctuation(line):\n", " \"\"\"Remove punctuation from a string\"\"\"\n", " return line.translate(_PUNC_TABLE).strip()\n", "\n", "\n", "#######################################################################\n", "# Byte-Pair Encoder #\n", "#######################################################################\n", "\n", "\n", "class BytePairEncoder(object):\n", " def __init__(self, max_merges=3000, encoding=\"utf-8\"):\n", " \"\"\"\n", " A byte-pair encoder for sub-word embeddings.\n", "\n", " Notes\n", " -----\n", " Byte-pair encoding [1][2] is a compression algorithm that iteratively\n", " replaces the most frequently ocurring byte pairs in a set of documents\n", " with a new, single token. It has gained popularity as a preprocessing\n", " step for many NLP tasks due to its simplicity and expressiveness: using\n", " a base coebook of just 256 unique tokens (bytes), any string can be\n", " encoded.\n", "\n", " References\n", " ----------\n", " .. [1] Gage, P. (1994). A new algorithm for data compression. *C\n", " Users Journal, 12(2)*, 23\u201338.\n", " .. [2] Sennrich, R., Haddow, B., & Birch, A. (2015). Neural machine\n", " translation of rare words with subword units, *Proceedings of the\n", " 54th Annual Meeting of the Association for Computational\n", " Linguistics,* 1715-1725.\n", "\n", " Parameters\n", " ----------\n", " max_merges : int\n", " The maximum number of byte pair merges to perform during the\n", " :meth:`fit` operation. Default is 3000.\n", " encoding : str\n", " The encoding scheme for the documents used to train the encoder.\n", " Default is `'utf-8'`.\n", " \"\"\"\n", " self.parameters = {\n", " \"max_merges\": max_merges,\n", " \"encoding\": encoding,\n", " }\n", "\n", " # initialize the byte <-> token and token <-> byte dictionaries. bytes\n", " # are represented in decimal as integers between 0 and 255. 
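A small round-trip sketch for `tokenize_bytes_raw` and `bytes_to_chars`, again assuming the package import path shown above; the sample string is an illustrative assumption.

```python
from numpy_ml.preprocessing.nlp import bytes_to_chars, tokenize_bytes_raw

byte_tokens = tokenize_bytes_raw("Héllo, world", encoding="utf-8")
print(byte_tokens)  # ['72 195 169 108 108 111 44 32 119 111 114 108 100']

# recover the original string from the decimal byte values
ints = [int(b) for b in byte_tokens[0].split(" ")]
print(bytes_to_chars(ints, encoding="utf-8"))  # "Héllo, world"
```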
there is a\n", " # 1:1 correspondence between token and byte representations up to 255.\n", " self.byte2token = OrderedDict({i: i for i in range(256)})\n", " self.token2byte = OrderedDict({v: k for k, v in self.byte2token.items()})\n", "\n", " def fit(self, corpus_fps, encoding=\"utf-8\"):\n", " \"\"\"\n", " Train a byte pair codebook on a set of documents.\n", "\n", " Parameters\n", " ----------\n", " corpus_fps : str or list of strs\n", " The filepath / list of filepaths for the document(s) to be used to\n", " learn the byte pair codebook.\n", " encoding : str\n", " The text encoding for documents. Common entries are either 'utf-8'\n", " (no header byte), or 'utf-8-sig' (header byte). Default is\n", " 'utf-8'.\n", " \"\"\"\n", " vocab = (\n", " Vocabulary(\n", " lowercase=False,\n", " min_count=None,\n", " max_tokens=None,\n", " filter_stopwords=False,\n", " filter_punctuation=False,\n", " tokenizer=\"bytes\",\n", " )\n", " .fit(corpus_fps, encoding=encoding)\n", " .counts\n", " )\n", "\n", " # iteratively merge the most common byte bigram across the documents\n", " for _ in range(self.parameters[\"max_merges\"]):\n", " pair_counts = self._get_counts(vocab)\n", " most_common_bigram = max(pair_counts, key=pair_counts.get)\n", " vocab = self._merge(most_common_bigram, vocab)\n", "\n", " token_bytes = set()\n", " for k in vocab.keys():\n", " token_bytes = token_bytes.union([w for w in k.split(\" \") if \"-\" in w])\n", "\n", " for i, t in enumerate(token_bytes):\n", " byte_tuple = tuple(int(j) for j in t.split(\"-\"))\n", " self.token2byte[256 + i] = byte_tuple\n", " self.byte2token[byte_tuple] = 256 + i\n", "\n", " return self\n", "\n", " def _get_counts(self, vocab):\n", " \"\"\"Collect bigram counts for the tokens in vocab\"\"\"\n", " pair_counts = defaultdict(int)\n", " for word, count in vocab.items():\n", " pairs = ngrams(word.split(\" \"), 2)\n", " for p in pairs:\n", " pair_counts[p] += count\n", " return pair_counts\n", "\n", " def _merge(self, bigram, vocab):\n", " \"\"\"Replace `bigram` with a single token and update vocab accordingly\"\"\"\n", " v_out = {}\n", " bg = re.escape(\" \".join(bigram))\n", " bigram_regex = re.compile(r\"(?>> B = BytePairEncoder(max_merges=100).fit(\"./example.txt\")\n", " >>> encoded_tokens = B.transform(\"Hello! 
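A minimal, self-contained sketch of the merge loop that `fit`, `_get_counts`, and `_merge` implement: count adjacent symbol pairs across a byte-level vocabulary, then fuse the most frequent pair into a single "-"-joined symbol. The toy vocabulary and the plain-Python merge below are illustrative assumptions, not the class's own regex-based implementation.

```python
from collections import Counter

# toy vocabulary: space-separated byte symbols (one entry per word) -> corpus count
vocab = {"108 111 119": 5, "108 111 119 101 114": 2, "110 101 119 101 114": 6}

def bigram_counts(vocab):
    counts = Counter()
    for word, n in vocab.items():
        syms = word.split(" ")
        for pair in zip(syms, syms[1:]):
            counts[pair] += n
    return counts

def merge(pair, vocab):
    # fuse adjacent occurrences of `pair` into a single "-"-joined symbol
    a, b = pair
    out = {}
    for word, n in vocab.items():
        syms, merged, i = word.split(" "), [], 0
        while i < len(syms):
            if i + 1 < len(syms) and (syms[i], syms[i + 1]) == (a, b):
                merged.append(a + "-" + b)
                i += 2
            else:
                merged.append(syms[i])
                i += 1
        out[" ".join(merged)] = n
    return out

for _ in range(3):  # a handful of merges, cf. `max_merges`
    counts = bigram_counts(vocab)
    vocab = merge(max(counts, key=counts.get), vocab)

print(vocab)  # merged pairs now appear as single "byte-byte" symbols
```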
How are you \ud83d\ude01 ?\")\n", " >>> encoded_tokens\n", " [[72, 879, 474, ...]]\n", " \"\"\"\n", " if isinstance(text, str):\n", " text = [text]\n", " return [self._transform(string) for string in text]\n", "\n", " def _transform(self, text):\n", " \"\"\"Transform a single text string to a list of byte-pair IDs\"\"\"\n", " P = self.parameters\n", " _bytes = tokenize_bytes_raw(text, encoding=P[\"encoding\"])\n", "\n", " encoded = []\n", " for w in _bytes:\n", " l, r = 0, len(w)\n", " w = [int(i) for i in w.split(\" \")]\n", "\n", " while l < len(w):\n", " candidate = tuple(w[l:r])\n", "\n", " if len(candidate) > 1 and candidate in self.byte2token:\n", " # candidate is a collection of several bytes and is in our\n", " # vocab\n", " encoded.append(self.byte2token[candidate])\n", " l, r = r, len(w)\n", " elif len(candidate) == 1:\n", " # candidate is a single byte and should always be in our\n", " # vocab\n", " encoded.append(candidate[0])\n", " l, r = r, len(w)\n", " else:\n", " # candidate is not in vocab, so we decrease our context\n", " # window by 1 and try again\n", " r -= 1\n", " return encoded\n", "\n", " def inverse_transform(self, codes):\n", " \"\"\"\n", " Transform an encoded sequence of byte pair codeword IDs back into\n", " human-readable text.\n", "\n", " Parameters\n", " ----------\n", " codes : list of `N` lists\n", " A list of `N` lists. Each sublist is a collection of integer\n", " byte-pair token IDs representing a particular text string.\n", "\n", " Returns\n", " -------\n", " text: list of `N` strings\n", " The decoded strings corresponding to the `N` sublists in `codes`.\n", "\n", " Examples\n", " --------\n", " >>> B = BytePairEncoder(max_merges=100).fit(\"./example.txt\")\n", " >>> encoded_tokens = B.transform(\"Hello! How are you \ud83d\ude01 ?\")\n", " >>> encoded_tokens\n", " [[72, 879, 474, ...]]\n", " >>> B.inverse_transform(encoded_tokens)\n", " [\"Hello! 
How are you \ud83d\ude01 ?\"]\n", " \"\"\"\n", " if isinstance(codes[0], int):\n", " codes = [codes]\n", "\n", " decoded = []\n", " P = self.parameters\n", "\n", " for code in codes:\n", " _bytes = [self.token2byte[t] if t > 255 else [t] for t in code]\n", " _bytes = [b for blist in _bytes for b in blist]\n", " decoded.append(bytes_to_chars(_bytes, encoding=P[\"encoding\"]))\n", " return decoded\n", "\n", " @property\n", " def codebook(self):\n", " \"\"\"\n", " A list of the learned byte pair codewords, decoded into human-readable\n", " format\n", " \"\"\"\n", " return [\n", " self.inverse_transform(t)[0]\n", " for t in self.byte2token.keys()\n", " if isinstance(t, tuple)\n", " ]\n", "\n", " @property\n", " def tokens(self):\n", " \"\"\"A list of the byte pair codeword IDs\"\"\"\n", " return list(self.token2byte.keys())\n", "\n", "\n", "#######################################################################\n", "# Huffman Tree #\n", "#######################################################################\n", "\n", "\n", "class Node(object):\n", " def __init__(self, key, val):\n", " self.key = key\n", " self.val = val\n", " self.left = None\n", " self.right = None\n", "\n", " def __gt__(self, other):\n", " \"\"\"Greater than\"\"\"\n", " if not isinstance(other, Node):\n", " return -1\n", " return self.val > other.val\n", "\n", " def __ge__(self, other):\n", " \"\"\"Greater than or equal to\"\"\"\n", " if not isinstance(other, Node):\n", " return -1\n", " return self.val >= other.val\n", "\n", " def __lt__(self, other):\n", " \"\"\"Less than\"\"\"\n", " if not isinstance(other, Node):\n", " return -1\n", " return self.val < other.val\n", "\n", " def __le__(self, other):\n", " \"\"\"Less than or equal to\"\"\"\n", " if not isinstance(other, Node):\n", " return -1\n", " return self.val <= other.val\n", "\n", "\n", "class HuffmanEncoder(object):\n", " def fit(self, text):\n", " \"\"\"\n", " Build a Huffman tree for the tokens in `text` and compute each token's\n", " binary encoding.\n", "\n", " Notes\n", " -----\n", " In a Huffman code, tokens that occur more frequently are (generally)\n", " represented using fewer bits. Huffman codes produce the minimum expected\n", " codeword length among all methods for encoding tokens individually.\n", "\n", " Huffman codes correspond to paths through a binary tree, with 1\n", " corresponding to \"move right\" and 0 corresponding to \"move left\". In\n", " contrast to standard binary trees, the Huffman tree is constructed from the\n", " bottom up. Construction begins by initializing a min-heap priority queue\n", " consisting of each token in the corpus, with priority corresponding to the\n", " token frequency. At each step, the two most infrequent tokens in the corpus\n", " are removed and become the children of a parent pseudotoken whose\n", " \"frequency\" is the sum of the frequencies of its children. 
This new parent\n", " pseudotoken is added to the priority queue and the process is repeated\n", " recursively until no tokens remain.\n", "\n", " Parameters\n", " ----------\n", " text: list of strs or :class:`Vocabulary` instance\n", " The tokenized text or a pretrained :class:`Vocabulary` object to use for\n", " building the Huffman code.\n", " \"\"\"\n", " self._build_tree(text)\n", " self._generate_codes()\n", "\n", " def transform(self, text):\n", " \"\"\"\n", " Transform the words in `text` into their Huffman-code representations.\n", "\n", " Parameters\n", " ----------\n", " text: list of `N` strings\n", " The list of words to encode\n", "\n", " Returns\n", " -------\n", " codes : list of `N` binary strings\n", " The encoded words in `text`\n", " \"\"\"\n", " if isinstance(text, str):\n", " text = [text]\n", " for token in set(text):\n", " if token not in self._item2code:\n", " raise Warning(\"Token '{}' not in Huffman tree. Skipping\".format(token))\n", " return [self._item2code.get(t, None) for t in text]\n", "\n", " def inverse_transform(self, codes):\n", " \"\"\"\n", " Transform an encoded sequence of bit-strings back into words.\n", "\n", " Parameters\n", " ----------\n", " codes : list of `N` binary strings\n", " A list of encoded bit-strings, represented as strings.\n", "\n", " Returns\n", " -------\n", " text: list of `N` strings\n", " The decoded text.\n", " \"\"\"\n", " if isinstance(codes, str):\n", " codes = [codes]\n", " for code in set(codes):\n", " if code not in self._code2item:\n", " raise Warning(\"Code '{}' not in Huffman tree. Skipping\".format(code))\n", " return [self._code2item.get(c, None) for c in codes]\n", "\n", " @property\n", " def tokens(self):\n", " \"\"\"A list the unique tokens in `text`\"\"\"\n", " return list(self._item2code.keys())\n", "\n", " @property\n", " def codes(self):\n", " \"\"\"A list with the Huffman code for each unique token in `text`\"\"\"\n", " return list(self._code2item.keys())\n", "\n", " def _counter(self, text):\n", " counts = {}\n", " for item in text:\n", " counts[item] = counts.get(item, 0) + 1\n", " return counts\n", "\n", " def _build_tree(self, text):\n", " \"\"\"Construct Huffman Tree\"\"\"\n", " PQ = []\n", "\n", " if isinstance(text, Vocabulary):\n", " counts = text.counts\n", " else:\n", " counts = self._counter(text)\n", "\n", " for (k, c) in counts.items():\n", " PQ.append(Node(k, c))\n", "\n", " # create a priority queue with priority = item frequency\n", " heapq.heapify(PQ)\n", "\n", " while len(PQ) > 1:\n", " node1 = heapq.heappop(PQ) # item with smallest frequency\n", " node2 = heapq.heappop(PQ) # item with second smallest frequency\n", "\n", " parent = Node(None, node1.val + node2.val)\n", " parent.left = node1\n", " parent.right = node2\n", "\n", " heapq.heappush(PQ, parent)\n", "\n", " self._root = heapq.heappop(PQ)\n", "\n", " def _generate_codes(self):\n", " current_code = \"\"\n", " self._item2code = {}\n", " self._code2item = {}\n", " self._build_code(self._root, current_code)\n", "\n", " def _build_code(self, root, current_code):\n", " if root is None:\n", " return\n", "\n", " if root.key is not None:\n", " self._item2code[root.key] = current_code\n", " self._code2item[current_code] = root.key\n", " return\n", "\n", " # 0 = move left, 1 = move right\n", " self._build_code(root.left, current_code + \"0\")\n", " self._build_code(root.right, current_code + \"1\")\n", "\n", "\n", "#######################################################################\n", "# Containers #\n", 
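The bottom-up construction described in the Notes can be sketched compactly with `heapq`. This standalone toy (dict-based nodes with an integer tie-breaker in the heap entries) is an illustrative assumption rather than the class's own `Node`-based implementation.

```python
import heapq
from collections import Counter

def toy_huffman_codes(tokens):
    # min-heap keyed on frequency; the integer uid breaks ties before dict comparison
    heap = [(c, i, {"token": t}) for i, (t, c) in enumerate(Counter(tokens).items())]
    heapq.heapify(heap)
    uid = len(heap)
    while len(heap) > 1:
        f1, _, n1 = heapq.heappop(heap)  # least frequent
        f2, _, n2 = heapq.heappop(heap)  # second least frequent
        heapq.heappush(heap, (f1 + f2, uid, {"left": n1, "right": n2}))
        uid += 1

    codes = {}

    def walk(node, prefix=""):
        if "token" in node:
            codes[node["token"]] = prefix or "0"
            return
        walk(node["left"], prefix + "0")   # 0 = move left
        walk(node["right"], prefix + "1")  # 1 = move right

    walk(heap[0][2])
    return codes

text = "the cat sat on the mat and the cat ran".split()
print(toy_huffman_codes(text))  # frequent tokens tend to receive shorter codes
```

The `HuffmanEncoder` above wraps the same idea: after `H = HuffmanEncoder(); H.fit(text)`, calling `H.transform(["the", "cat"])` returns the corresponding bit-strings and `inverse_transform` maps them back.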
"#######################################################################\n", "\n", "\n", "class Token:\n", " def __init__(self, word):\n", " self.count = 0\n", " self.word = word\n", "\n", " def __repr__(self):\n", " \"\"\"A string representation of the token\"\"\"\n", " return \"Token(word='{}', count={})\".format(self.word, self.count)\n", "\n", "\n", "class TFIDFEncoder:\n", " def __init__(\n", " self,\n", " vocab=None,\n", " lowercase=True,\n", " min_count=0,\n", " smooth_idf=True,\n", " max_tokens=None,\n", " input_type=\"files\",\n", " filter_stopwords=True,\n", " filter_punctuation=True,\n", " tokenizer=\"words\",\n", " ):\n", " r\"\"\"\n", " An object for compiling and encoding the term-frequency\n", " inverse-document-frequency (TF-IDF) representation of the tokens in a\n", " text corpus.\n", "\n", " Notes\n", " -----\n", " TF-IDF is intended to reflect how important a word is to a document in\n", " a collection or corpus. For a word token `w` in a document `d`, and a\n", " corpus, :math:`D = \\{d_1, \\ldots, d_N\\}`, we have:\n", "\n", " .. math::\n", " \\text{TF}(w, d) &= \\text{num. occurences of }w \\text{ in document }d \\\\\n", " \\text{IDF}(w, D) &= \\log \\frac{|D|}{|\\{ d \\in D: t \\in d \\}|}\n", "\n", " Parameters\n", " ----------\n", " vocab : :class:`Vocabulary` object or list-like\n", " An existing vocabulary to filter the tokens in the corpus against.\n", " Default is None.\n", " lowercase : bool\n", " Whether to convert each string to lowercase before tokenization.\n", " Default is True.\n", " min_count : int\n", " Minimum number of times a token must occur in order to be included\n", " in vocab. Default is 0.\n", " smooth_idf : bool\n", " Whether to add 1 to the denominator of the IDF calculation to avoid\n", " divide-by-zero errors. Default is True.\n", " max_tokens : int\n", " Only add the `max_tokens` most frequent tokens that occur more\n", " than `min_count` to the vocabulary. If None, add all tokens\n", " greater that occur more than than `min_count`. Default is None.\n", " input_type : {'files', 'strings'}\n", " If 'files', the sequence input to `fit` is expected to be a list\n", " of filepaths. If 'strings', the input is expected to be a list of\n", " lists, each sublist containing the raw strings for a single\n", " document in the corpus. Default is 'filename'.\n", " filter_stopwords : bool\n", " Whether to remove stopwords before encoding the words in the\n", " corpus. Default is True.\n", " filter_punctuation : bool\n", " Whether to remove punctuation before encoding the words in the\n", " corpus. Default is True.\n", " tokenizer : {'whitespace', 'words', 'characters', 'bytes'}\n", " Strategy to follow when mapping strings to tokens. The\n", " `'whitespace'` tokenizer splits strings at whitespace characters.\n", " The `'words'` tokenizer splits strings using a \"word\" regex. 
The\n", " `'characters'` tokenizer splits strings into individual characters.\n", " The `'bytes'` tokenizer splits strings into a collection of\n", " individual bytes.\n", " \"\"\"\n", " # create a function to filter against words in the vocab\n", " self._filter_vocab = lambda words: words\n", " if isinstance(vocab, Vocabulary):\n", " self._filter_vocab = vocab.filter\n", " elif isinstance(vocab, (list, np.ndarray, set)):\n", " vocab = set(vocab)\n", " self._filter_vocab = lambda words: [\n", " w if w in vocab else \"\" for w in words\n", " ]\n", "\n", " if input_type not in [\"files\", \"strings\"]:\n", " fstr = \"`input_type` must be either 'files' or 'strings', but got {}\"\n", " raise ValueError(fstr.format(input_type))\n", "\n", " self._tokens = None\n", " self._idx2doc = None\n", " self.term_freq = None\n", " self.idx2token = None\n", " self.token2idx = None\n", " self.inv_doc_freq = None\n", "\n", " self.hyperparameters = {\n", " \"id\": \"TFIDFEncoder\",\n", " \"encoding\": None,\n", " \"vocab\": vocab\n", " if not isinstance(vocab, Vocabulary)\n", " else vocab.hyperparameters,\n", " \"lowercase\": lowercase,\n", " \"min_count\": min_count,\n", " \"input_type\": input_type,\n", " \"max_tokens\": max_tokens,\n", " \"smooth_idf\": smooth_idf,\n", " \"tokenizer\": tokenizer\n", " if not isinstance(vocab, Vocabulary)\n", " else vocab.hyperparameters[\"tokenizer\"],\n", " \"filter_stopwords\": filter_stopwords\n", " if not isinstance(vocab, Vocabulary)\n", " else vocab.hyperparameters[\"filter_stopwords\"],\n", " \"filter_punctuation\": filter_punctuation\n", " if not isinstance(vocab, Vocabulary)\n", " else vocab.hyperparameters[\"filter_punctuation\"],\n", " }\n", "\n", " def fit(self, corpus_seq, encoding=\"utf-8-sig\"):\n", " \"\"\"\n", " Compute term-frequencies and inverse document frequencies on a\n", " collection of documents.\n", "\n", " Parameters\n", " ----------\n", " corpus_seq : str or list of strs\n", " The filepath / list of filepaths / raw string contents of the\n", " document(s) to be encoded, in accordance with the `input_type`\n", " parameter passed to the :meth:`__init__` method. Each document is\n", " expected to be a string of tokens separated by whitespace.\n", " encoding : str\n", " Specifies the text encoding for corpus if `input_type` is `files`.\n", " Common entries are either 'utf-8' (no header byte), or 'utf-8-sig'\n", " (header byte). 
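A worked example of the TF and IDF definitions given in the Notes, on a toy count matrix. This is the unsmoothed form; the encoder itself additionally smooths the document frequency and adds 1, as described later in `_calc_idf`. The count matrix is an illustrative assumption.

```python
import numpy as np

# Term counts for a toy corpus of |D| = 3 documents over 4 vocabulary terms
tf = np.array(
    [
        [2, 1, 0, 0],  # document 1
        [1, 0, 3, 0],  # document 2
        [0, 0, 1, 1],  # document 3
    ],
    dtype=float,
)

doc_freq = (tf > 0).sum(axis=0)       # |{d in D : w in d}| for each term
idf = np.log(tf.shape[0] / doc_freq)  # IDF(w, D), unsmoothed
tfidf = tf * idf                      # one row per document, one column per term

print(np.round(idf, 3))    # rarer terms receive larger IDF weights
print(np.round(tfidf, 3))
```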
Default is 'utf-8-sig'.\n", "\n", " Returns\n", " -------\n", " self\n", " \"\"\"\n", " H = self.hyperparameters\n", "\n", " if isinstance(corpus_seq, str):\n", " corpus_seq = [corpus_seq]\n", "\n", " if H[\"input_type\"] == \"files\":\n", " for corpus_fp in corpus_seq:\n", " assert op.isfile(corpus_fp), \"{} does not exist\".format(corpus_fp)\n", "\n", " tokens = []\n", " idx2token, token2idx = {}, {}\n", "\n", " # encode special tokens\n", " for tt in [\"\", \"\", \"\"]:\n", " token2idx[tt] = len(tokens)\n", " idx2token[len(tokens)] = tt\n", " tokens.append(Token(tt))\n", "\n", " min_count = H[\"min_count\"]\n", " max_tokens = H[\"max_tokens\"]\n", " H[\"encoding\"] = encoding\n", "\n", " bol_ix = token2idx[\"\"]\n", " eol_ix = token2idx[\"\"]\n", " idx2doc, term_freq = {}, {}\n", "\n", " # encode the text in `corpus_fps` without any filtering ...\n", " for d_ix, doc in enumerate(corpus_seq):\n", " doc_count = {}\n", " idx2doc[d_ix] = doc if H[\"input_type\"] == \"files\" else None\n", " token2idx, idx2token, tokens, doc_count = self._encode_document(\n", " doc, token2idx, idx2token, tokens, doc_count, bol_ix, eol_ix,\n", " )\n", " term_freq[d_ix] = doc_count\n", "\n", " self._tokens = tokens\n", " self._idx2doc = idx2doc\n", " self.token2idx = token2idx\n", " self.idx2token = idx2token\n", " self.term_freq = term_freq\n", "\n", " # ... retain only the top `max_tokens` most frequent tokens, coding\n", " # everything else as ...\n", " if max_tokens is not None and len(tokens) > max_tokens:\n", " self._keep_top_n_tokens()\n", "\n", " # ... replace all words occurring less than `min_count` by ...\n", " if min(self._tokens, key=lambda t: t.count).count < min_count:\n", " self._drop_low_freq_tokens()\n", "\n", " # ... sort tokens alphabetically and reindex ...\n", " self._sort_tokens()\n", "\n", " # ... 
finally, calculate inverse document frequency\n", " self._calc_idf()\n", " return self\n", "\n", " def _encode_document(\n", " self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,\n", " ):\n", " \"\"\"Perform tokenization and compute token counts for a single document\"\"\"\n", " H = self.hyperparameters\n", " lowercase = H[\"lowercase\"]\n", " filter_stop = H[\"filter_stopwords\"]\n", " filter_punc = H[\"filter_punctuation\"]\n", "\n", " if H[\"input_type\"] == \"files\":\n", " with open(doc, \"r\", encoding=H[\"encoding\"]) as handle:\n", " doc = handle.read()\n", "\n", " tokenizer_dict = {\n", " \"words\": tokenize_words,\n", " \"characters\": tokenize_chars,\n", " \"whitespace\": tokenize_whitespace,\n", " \"bytes\": tokenize_bytes_raw,\n", " }\n", " tokenizer = tokenizer_dict[H[\"tokenizer\"]]\n", "\n", " n_words = 0\n", " lines = doc.split(\"\\n\")\n", " for line in lines:\n", " words = tokenizer(\n", " line,\n", " lowercase=lowercase,\n", " filter_stopwords=filter_stop,\n", " filter_punctuation=filter_punc,\n", " encoding=H[\"encoding\"],\n", " )\n", " words = self._filter_vocab(words)\n", " n_words += len(words)\n", "\n", " for ww in words:\n", " if ww not in word2idx:\n", " word2idx[ww] = len(tokens)\n", " idx2word[len(tokens)] = ww\n", " tokens.append(Token(ww))\n", "\n", " t_idx = word2idx[ww]\n", " tokens[t_idx].count += 1\n", " doc_count[t_idx] = doc_count.get(t_idx, 0) + 1\n", "\n", " # wrap line in and tags\n", " tokens[bol_ix].count += 1\n", " tokens[eol_ix].count += 1\n", "\n", " doc_count[bol_ix] = doc_count.get(bol_ix, 0) + 1\n", " doc_count[eol_ix] = doc_count.get(eol_ix, 0) + 1\n", " return word2idx, idx2word, tokens, doc_count\n", "\n", " def _keep_top_n_tokens(self):\n", " N = self.hyperparameters[\"max_tokens\"]\n", " doc_counts, word2idx, idx2word = {}, {}, {}\n", " tokens = sorted(self._tokens, key=lambda x: x.count, reverse=True)\n", "\n", " # reindex the top-N tokens...\n", " unk_ix = None\n", " for idx, tt in enumerate(tokens[:N]):\n", " word2idx[tt.word] = idx\n", " idx2word[idx] = tt.word\n", "\n", " if tt.word == \"\":\n", " unk_ix = idx\n", "\n", " # ... if isn't in the top-N, add it, replacing the Nth\n", " # most-frequent word and adjust the count accordingly ...\n", " if unk_ix is None:\n", " unk_ix = self.token2idx[\"\"]\n", " old_count = tokens[N - 1].count\n", " tokens[N - 1] = self._tokens[unk_ix]\n", " tokens[N - 1].count += old_count\n", " word2idx[\"\"] = N - 1\n", " idx2word[N - 1] = \"\"\n", "\n", " # ... and recode all dropped tokens as \"\"\n", " for tt in tokens[N:]:\n", " tokens[unk_ix].count += tt.count\n", "\n", " # ... 
finally, reindex the word counts for each document\n", " doc_counts = {}\n", " for d_ix in self.term_freq.keys():\n", " doc_counts[d_ix] = {}\n", " for old_ix, d_count in self.term_freq[d_ix].items():\n", " word = self.idx2token[old_ix]\n", " new_ix = word2idx.get(word, unk_ix)\n", " doc_counts[d_ix][new_ix] = doc_counts[d_ix].get(new_ix, 0) + d_count\n", "\n", " self._tokens = tokens[:N]\n", " self.token2idx = word2idx\n", " self.idx2token = idx2word\n", " self.term_freq = doc_counts\n", "\n", " assert len(self._tokens) <= N\n", "\n", " def _drop_low_freq_tokens(self):\n", " \"\"\"\n", " Replace all tokens that occur less than `min_count` with the ``\n", " token.\n", " \"\"\"\n", " H = self.hyperparameters\n", " unk_token = self._tokens[self.token2idx[\"\"]]\n", " eol_token = self._tokens[self.token2idx[\"\"]]\n", " bol_token = self._tokens[self.token2idx[\"\"]]\n", " tokens = [unk_token, eol_token, bol_token]\n", "\n", " unk_idx = 0\n", " word2idx = {\"\": 0, \"\": 1, \"\": 2}\n", " idx2word = {0: \"\", 1: \"\", 2: \"\"}\n", " special = {\"\", \"\", \"\"}\n", "\n", " for tt in self._tokens:\n", " if tt.word not in special:\n", " if tt.count < H[\"min_count\"]:\n", " tokens[unk_idx].count += tt.count\n", " else:\n", " word2idx[tt.word] = len(tokens)\n", " idx2word[len(tokens)] = tt.word\n", " tokens.append(tt)\n", "\n", " # reindex document counts\n", " doc_counts = {}\n", " for d_idx in self.term_freq.keys():\n", " doc_counts[d_idx] = {}\n", " for old_idx, d_count in self.term_freq[d_idx].items():\n", " word = self.idx2token[old_idx]\n", " new_idx = word2idx.get(word, unk_idx)\n", " doc_counts[d_idx][new_idx] = doc_counts[d_idx].get(new_idx, 0) + d_count\n", "\n", " self._tokens = tokens\n", " self.token2idx = word2idx\n", " self.idx2token = idx2word\n", " self.term_freq = doc_counts\n", "\n", " def _sort_tokens(self):\n", " # sort tokens alphabetically and recode\n", " ix = 0\n", " token2idx, idx2token, = (\n", " {},\n", " {},\n", " )\n", " special = [\"\", \"\", \"\"]\n", " words = sorted(self.token2idx.keys())\n", " term_freq = {d: {} for d in self.term_freq.keys()}\n", "\n", " for w in words:\n", " if w not in special:\n", " old_ix = self.token2idx[w]\n", " token2idx[w], idx2token[ix] = ix, w\n", " for d in self.term_freq.keys():\n", " if old_ix in self.term_freq[d]:\n", " count = self.term_freq[d][old_ix]\n", " term_freq[d][ix] = count\n", " ix += 1\n", "\n", " for w in special:\n", " token2idx[w] = len(token2idx)\n", " idx2token[len(idx2token)] = w\n", "\n", " self.token2idx = token2idx\n", " self.idx2token = idx2token\n", " self.term_freq = term_freq\n", " self.vocab_counts = Counter({t.word: t.count for t in self._tokens})\n", "\n", " def _calc_idf(self):\n", " \"\"\"\n", " Compute the (smoothed-) inverse-document frequency for each token in\n", " the corpus.\n", "\n", " For a word token `w`, the IDF is simply\n", "\n", " IDF(w) = log ( |D| / |{ d in D: w in d }| ) + 1\n", "\n", " where D is the set of all documents in the corpus,\n", "\n", " D = {d1, d2, ..., dD}\n", "\n", " If `smooth_idf` is True, we perform additive smoothing on the number of\n", " documents containing a given word, equivalent to pretending that there\n", " exists a final D+1st document that contains every word in the corpus:\n", "\n", " SmoothedIDF(w) = log ( |D| + 1 / [1 + |{ d in D: w in d }|] ) + 1\n", " \"\"\"\n", " inv_doc_freq = {}\n", " smooth_idf = self.hyperparameters[\"smooth_idf\"]\n", " tf, doc_idxs = self.term_freq, self._idx2doc.keys()\n", "\n", " D = len(self._idx2doc) + int(smooth_idf)\n", " 
for word, w_ix in self.token2idx.items():\n", " d_count = int(smooth_idf)\n", " d_count += np.sum([1 if w_ix in tf[d_ix] else 0 for d_ix in doc_idxs])\n", " inv_doc_freq[w_ix] = 1 if d_count == 0 else np.log(D / d_count) + 1\n", " self.inv_doc_freq = inv_doc_freq\n", "\n", " def transform(self, ignore_special_chars=True):\n", " \"\"\"\n", " Generate the term-frequency inverse-document-frequency encoding of a\n", " text corpus.\n", "\n", " Parameters\n", " ----------\n", " ignore_special_chars : bool\n", " Whether to drop columns corresponding to \"\", \"\", and\n", " \"\" tokens from the final tfidf encoding. Default is True.\n", "\n", " Returns\n", " -------\n", " tfidf : numpy array of shape `(D, M [- 3])`\n", " The encoded corpus, with each row corresponding to a single\n", " document, and each column corresponding to a token id. The mapping\n", " between column numbers and tokens is stored in the `idx2token`\n", " attribute IFF `ignore_special_chars` is False. Otherwise, the\n", " mappings are not accurate.\n", " \"\"\"\n", " D, N = len(self._idx2doc), len(self._tokens)\n", " tf = np.zeros((D, N))\n", " idf = np.zeros((D, N))\n", "\n", " for d_ix in self._idx2doc.keys():\n", " words, counts = zip(*self.term_freq[d_ix].items())\n", " docs = np.ones(len(words), dtype=int) * d_ix\n", " tf[docs, words] = counts\n", "\n", " words = sorted(self.idx2token.keys())\n", " idf = np.tile(np.array([self.inv_doc_freq[w] for w in words]), (D, 1))\n", " tfidf = tf * idf\n", "\n", " if ignore_special_chars:\n", " idxs = [\n", " self.token2idx[\"\"],\n", " self.token2idx[\"\"],\n", " self.token2idx[\"\"],\n", " ]\n", " tfidf = np.delete(tfidf, idxs, 1)\n", "\n", " return tfidf\n", "\n", "\n", "class Vocabulary:\n", " def __init__(\n", " self,\n", " lowercase=True,\n", " min_count=None,\n", " max_tokens=None,\n", " filter_stopwords=True,\n", " filter_punctuation=True,\n", " tokenizer=\"words\",\n", " ):\n", " \"\"\"\n", " An object for compiling and encoding the unique tokens in a text corpus.\n", "\n", " Parameters\n", " ----------\n", " lowercase : bool\n", " Whether to convert each string to lowercase before tokenization.\n", " Default is True.\n", " min_count : int\n", " Minimum number of times a token must occur in order to be included\n", " in vocab. If `None`, include all tokens from `corpus_fp` in vocab.\n", " Default is None.\n", " max_tokens : int\n", " Only add the `max_tokens` most frequent tokens that occur more\n", " than `min_count` to the vocabulary. If None, add all tokens\n", " that occur more than than `min_count`. Default is None.\n", " filter_stopwords : bool\n", " Whether to remove stopwords before encoding the words in the\n", " corpus. Default is True.\n", " filter_punctuation : bool\n", " Whether to remove punctuation before encoding the words in the\n", " corpus. Default is True.\n", " tokenizer : {'whitespace', 'words', 'characters', 'bytes'}\n", " Strategy to follow when mapping strings to tokens. The\n", " `'whitespace'` tokenizer splits strings at whitespace characters.\n", " The `'words'` tokenizer splits strings using a \"word\" regex. 
The\n", " `'characters'` tokenizer splits strings into individual characters.\n", " The `'bytes'` tokenizer splits strings into a collection of\n", " individual bytes.\n", " \"\"\"\n", " self.hyperparameters = {\n", " \"id\": \"Vocabulary\",\n", " \"encoding\": None,\n", " \"corpus_fps\": None,\n", " \"lowercase\": lowercase,\n", " \"min_count\": min_count,\n", " \"max_tokens\": max_tokens,\n", " \"filter_stopwords\": filter_stopwords,\n", " \"filter_punctuation\": filter_punctuation,\n", " \"tokenizer\": tokenizer,\n", " }\n", "\n", " def __len__(self):\n", " \"\"\"Return the number of tokens in the vocabulary\"\"\"\n", " return len(self._tokens)\n", "\n", " def __iter__(self):\n", " \"\"\"Return an iterator over the tokens in the vocabulary\"\"\"\n", " return iter(self._tokens)\n", "\n", " def __contains__(self, word):\n", " \"\"\"Assert whether `word` is a token in the vocabulary\"\"\"\n", " return word in self.token2idx\n", "\n", " def __getitem__(self, key):\n", " \"\"\"\n", " Return the token (if key is an integer) or the index (if key is a string)\n", " for the key in the vocabulary, if it exists.\n", " \"\"\"\n", " if isinstance(key, str):\n", " return self._tokens[self.token2idx[key]]\n", " if isinstance(key, int):\n", " return self._tokens[key]\n", "\n", " @property\n", " def n_tokens(self):\n", " \"\"\"The number of unique word tokens in the vocabulary\"\"\"\n", " return len(self.token2idx)\n", "\n", " @property\n", " def n_words(self):\n", " \"\"\"The total number of words in the corpus\"\"\"\n", " return sum(self.counts.values())\n", "\n", " @property\n", " def shape(self):\n", " \"\"\"The number of unique word tokens in the vocabulary\"\"\"\n", " return self._tokens.shape\n", "\n", " def most_common(self, n=5):\n", " \"\"\"Return the top `n` most common tokens in the corpus\"\"\"\n", " return self.counts.most_common()[:n]\n", "\n", " def words_with_count(self, k):\n", " \"\"\"Return all tokens that occur `k` times in the corpus\"\"\"\n", " return [w for w, c in self.counts.items() if c == k]\n", "\n", " def filter(self, words, unk=True): # noqa: A003\n", " \"\"\"\n", " Filter (or replace) any word in `words` that is not present in\n", " `Vocabulary`.\n", "\n", " Parameters\n", " ----------\n", " words : list of strs\n", " A list of words to filter\n", " unk : bool\n", " Whether to replace any out of vocabulary words in `words` with the\n", " ```` token (True) or skip them entirely (False). Default is\n", " True.\n", "\n", " Returns\n", " -------\n", " filtered : list of strs\n", " The list of words filtered against the words in Vocabulary.\n", " \"\"\"\n", " if unk:\n", " return [w if w in self else \"\" for w in words]\n", " return [w for w in words if w in self]\n", "\n", " def words_to_indices(self, words):\n", " \"\"\"\n", " Convert the words in `words` to their token indices. If a word is not\n", " in the vocabulary, return the index for the ```` token\n", "\n", " Parameters\n", " ----------\n", " words : list of strs\n", " A list of words to filter\n", "\n", " Returns\n", " -------\n", " indices : list of ints\n", " The token indices for each word in `words`\n", " \"\"\"\n", " unk_ix = self.token2idx[\"\"]\n", " lowercase = self.hyperparameters[\"lowercase\"]\n", " words = [w.lower() for w in words] if lowercase else words\n", " return [self.token2idx[w] if w in self else unk_ix for w in words]\n", "\n", " def indices_to_words(self, indices):\n", " \"\"\"\n", " Convert the indices in `indices` to their word values. 
If an index is\n", " not in the vocabulary, return the ```` token.\n", "\n", " Parameters\n", " ----------\n", " indices : list of ints\n", " The token indices for each word in `words`\n", "\n", " Returns\n", " -------\n", " words : list of strs\n", " The word strings corresponding to each token index in `indices`\n", " \"\"\"\n", " unk = \"\"\n", " return [self.idx2token[i] if i in self.idx2token else unk for i in indices]\n", "\n", " def fit(self, corpus_fps, encoding=\"utf-8-sig\"):\n", " \"\"\"\n", " Compute the vocabulary across a collection of documents.\n", "\n", " Parameters\n", " ----------\n", " corpus_fps : str or list of strs\n", " The filepath / list of filepaths for the document(s) to be encoded.\n", " Each document is expected to be encoded as newline-separated\n", " string of text, with adjacent tokens separated by a whitespace\n", " character.\n", " encoding : str\n", " Specifies the text encoding for corpus. Common entries are either\n", " 'utf-8' (no header byte), or 'utf-8-sig' (header byte). Default is\n", " 'utf-8-sig'.\n", "\n", " Returns\n", " -------\n", " self\n", " \"\"\"\n", " if isinstance(corpus_fps, str):\n", " corpus_fps = [corpus_fps]\n", "\n", " for corpus_fp in corpus_fps:\n", " assert op.isfile(corpus_fp), \"{} does not exist\".format(corpus_fp)\n", "\n", " tokens = []\n", " H = self.hyperparameters\n", " idx2word, word2idx = {}, {}\n", "\n", " tokenizer_dict = {\n", " \"words\": tokenize_words,\n", " \"characters\": tokenize_chars,\n", " \"whitespace\": tokenize_whitespace,\n", " \"bytes\": tokenize_bytes_raw,\n", " }\n", "\n", " min_count = H[\"min_count\"]\n", " lowercase = H[\"lowercase\"]\n", " max_tokens = H[\"max_tokens\"]\n", " filter_stop = H[\"filter_stopwords\"]\n", " filter_punc = H[\"filter_punctuation\"]\n", " tokenizer = tokenizer_dict[H[\"tokenizer\"]]\n", "\n", " H[\"encoding\"] = encoding\n", " H[\"corpus_fps\"] = corpus_fps\n", "\n", " # encode special tokens\n", " for tt in [\"\", \"\", \"\"]:\n", " word2idx[tt] = len(tokens)\n", " idx2word[len(tokens)] = tt\n", " tokens.append(Token(tt))\n", "\n", " bol_ix = word2idx[\"\"]\n", " eol_ix = word2idx[\"\"]\n", "\n", " for d_ix, doc_fp in enumerate(corpus_fps):\n", " with open(doc_fp, \"r\", encoding=H[\"encoding\"]) as doc:\n", " for line in doc:\n", " words = tokenizer(\n", " line,\n", " lowercase=lowercase,\n", " filter_stopwords=filter_stop,\n", " filter_punctuation=filter_punc,\n", " encoding=H[\"encoding\"],\n", " )\n", "\n", " for ww in words:\n", " if ww not in word2idx:\n", " word2idx[ww] = len(tokens)\n", " idx2word[len(tokens)] = ww\n", " tokens.append(Token(ww))\n", "\n", " t_idx = word2idx[ww]\n", " tokens[t_idx].count += 1\n", "\n", " # wrap line in and tags\n", " tokens[bol_ix].count += 1\n", " tokens[eol_ix].count += 1\n", "\n", " self._tokens = tokens\n", " self.token2idx = word2idx\n", " self.idx2token = idx2word\n", "\n", " # replace all words occurring less than `min_count` by \n", " if min_count is not None:\n", " self._drop_low_freq_tokens()\n", "\n", " # retain only the top `max_tokens` most frequent tokens, coding\n", " # everything else as \n", " if max_tokens is not None and len(tokens) > max_tokens:\n", " self._keep_top_n_tokens()\n", "\n", " counts = {w: self._tokens[ix].count for w, ix in self.token2idx.items()}\n", " self.counts = Counter(counts)\n", " self._tokens = np.array(self._tokens)\n", " return self\n", "\n", " def _keep_top_n_tokens(self):\n", " word2idx, idx2word = {}, {}\n", " N = self.hyperparameters[\"max_tokens\"]\n", " tokens = 
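A usage sketch for `Vocabulary`. Since `fit` expects filepaths, the example writes a two-line toy corpus to disk first; the filename and corpus contents are illustrative assumptions, and the package import path is assumed as above.

```python
from numpy_ml.preprocessing.nlp import Vocabulary

with open("tiny_corpus.txt", "w", encoding="utf-8") as f:
    f.write("the cat sat on the mat\nthe dog sat on the log\n")

V = Vocabulary(lowercase=True, filter_stopwords=False)
V.fit("tiny_corpus.txt", encoding="utf-8")

print(len(V), V.most_common(3))                # vocab size and most frequent tokens
print(V.words_to_indices(["cat", "unicorn"]))  # OOV words map to the unknown token's index
```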
sorted(self._tokens, key=lambda x: x.count, reverse=True)\n", "\n", " # reindex the top-N tokens...\n", " unk_ix = None\n", " for idx, tt in enumerate(tokens[:N]):\n", " word2idx[tt.word] = idx\n", " idx2word[idx] = tt.word\n", "\n", " if tt.word == \"\":\n", " unk_ix = idx\n", "\n", " # ... if isn't in the top-N, add it, replacing the Nth\n", " # most-frequent word and adjusting the count accordingly ...\n", " if unk_ix is None:\n", " unk_ix = self.token2idx[\"\"]\n", " old_count = tokens[N - 1].count\n", " tokens[N - 1] = self._tokens[unk_ix]\n", " tokens[N - 1].count += old_count\n", " word2idx[\"\"] = N - 1\n", " idx2word[N - 1] = \"\"\n", "\n", " # ... and recode all dropped tokens as \"\"\n", " for tt in tokens[N:]:\n", " tokens[unk_ix].count += tt.count\n", "\n", " self._tokens = tokens[:N]\n", " self.token2idx = word2idx\n", " self.idx2token = idx2word\n", "\n", " assert len(self._tokens) <= N\n", "\n", " def _drop_low_freq_tokens(self):\n", " \"\"\"\n", " Replace all tokens that occur less than `min_count` with the ``\n", " token.\n", " \"\"\"\n", " unk_idx = 0\n", " unk_token = self._tokens[self.token2idx[\"\"]]\n", " eol_token = self._tokens[self.token2idx[\"\"]]\n", " bol_token = self._tokens[self.token2idx[\"\"]]\n", "\n", " H = self.hyperparameters\n", " tokens = [unk_token, eol_token, bol_token]\n", " word2idx = {\"\": 0, \"\": 1, \"\": 2}\n", " idx2word = {0: \"\", 1: \"\", 2: \"\"}\n", " special = {\"\", \"\", \"\"}\n", "\n", " for tt in self._tokens:\n", " if tt.word not in special:\n", " if tt.count < H[\"min_count\"]:\n", " tokens[unk_idx].count += tt.count\n", " else:\n", " word2idx[tt.word] = len(tokens)\n", " idx2word[len(tokens)] = tt.word\n", " tokens.append(tt)\n", "\n", " self._tokens = tokens\n", " self.token2idx = word2idx\n", " self.idx2token = idx2word\n"]} {"path": "numpy_ml/preprocessing/general.py", "content": ["import json\n", "import hashlib\n", "import warnings\n", "\n", "import numpy as np\n", "\n", "try:\n", " from scipy.sparse import csr_matrix\n", "\n", " _SCIPY = True\n", "except ImportError:\n", " warnings.warn(\"Scipy not installed. FeatureHasher can only create dense matrices\")\n", " _SCIPY = False\n", "\n", "\n", "def minibatch(X, batchsize=256, shuffle=True):\n", " \"\"\"\n", " Compute the minibatch indices for a training dataset.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, \\*)`\n", " The dataset to divide into minibatches. Assumes the first dimension\n", " represents the number of training examples.\n", " batchsize : int\n", " The desired size of each minibatch. Note, however, that if ``X.shape[0] %\n", " batchsize > 0`` then the final batch will contain fewer than batchsize\n", " entries. Default is 256.\n", " shuffle : bool\n", " Whether to shuffle the entries in the dataset before dividing into\n", " minibatches. 
Default is True.\n", "\n", " Returns\n", " -------\n", " mb_generator : generator\n", " A generator which yields the indices into `X` for each batch.\n", " n_batches: int\n", " The number of batches.\n", " \"\"\"\n", " N = X.shape[0]\n", " ix = np.arange(N)\n", " n_batches = int(np.ceil(N / batchsize))\n", "\n", " if shuffle:\n", " np.random.shuffle(ix)\n", "\n", " def mb_generator():\n", " for i in range(n_batches):\n", " yield ix[i * batchsize : (i + 1) * batchsize]\n", "\n", " return mb_generator(), n_batches\n", "\n", "\n", "class OneHotEncoder:\n", " def __init__(self):\n", " \"\"\"\n", " Convert between category labels and their one-hot vector\n", " representations.\n", "\n", " Parameters\n", " ----------\n", " categories : list of length `C`\n", " List of the unique category labels for the items to encode.\n", " \"\"\"\n", " self._is_fit = False\n", " self.hyperparameters = {}\n", " self.parameters = {\"categories\": None}\n", "\n", " def __call__(self, labels):\n", " return self.transform(labels)\n", "\n", " def fit(self, categories):\n", " \"\"\"\n", " Create mappings between columns and category labels.\n", "\n", " Parameters\n", " ----------\n", " categories : list of length `C`\n", " List of the unique category labels for the items to encode.\n", " \"\"\"\n", " self.parameters[\"categories\"] = categories\n", " self.cat2idx = {c: i for i, c in enumerate(categories)}\n", " self.idx2cat = {i: c for i, c in enumerate(categories)}\n", " self._is_fit = True\n", "\n", " def transform(self, labels, categories=None):\n", " \"\"\"\n", " Convert a list of labels into a one-hot encoding.\n", "\n", " Parameters\n", " ----------\n", " labels : list of length `N`\n", " A list of category labels.\n", " categories : list of length `C`\n", " List of the unique category labels for the items to encode. Default\n", " is None.\n", "\n", " Returns\n", " -------\n", " Y : :py:class:`ndarray ` of shape `(N, C)`\n", " The one-hot encoded labels. Each row corresponds to an example,\n", " with a single 1 in the column corresponding to the respective\n", " label.\n", " \"\"\"\n", " if not self._is_fit:\n", " categories = set(labels) if categories is None else categories\n", " self.fit(categories)\n", "\n", " unknown = list(set(labels) - set(self.cat2idx.keys()))\n", " assert len(unknown) == 0, \"Unrecognized label(s): {}\".format(unknown)\n", "\n", " N, C = len(labels), len(self.cat2idx)\n", " cols = np.array([self.cat2idx[c] for c in labels])\n", "\n", " Y = np.zeros((N, C))\n", " Y[np.arange(N), cols] = 1\n", " return Y\n", "\n", " def inverse_transform(self, Y):\n", " \"\"\"\n", " Convert a one-hot encoding back into the corresponding labels\n", "\n", " Parameters\n", " ----------\n", " Y : :py:class:`ndarray ` of shape `(N, C)`\n", " One-hot encoded labels. 
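A short usage sketch for `minibatch` and `OneHotEncoder`, assuming the package import path shown above; the random data and labels are illustrative assumptions.

```python
import numpy as np

from numpy_ml.preprocessing.general import OneHotEncoder, minibatch

X = np.random.rand(10, 4)
batches, n_batches = minibatch(X, batchsize=4, shuffle=False)
print(n_batches)                    # ceil(10 / 4) = 3
print([len(ix) for ix in batches])  # [4, 4, 2] -- the final batch is smaller

enc = OneHotEncoder()
Y = enc.transform(["cat", "dog", "cat"])  # fitting happens lazily on first transform
print(Y.shape)                            # (3, 2): one row per label, one column per category
print(enc.inverse_transform(Y))           # ['cat', 'dog', 'cat']
```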
Each row corresponds to an example, with a\n", " single 1 in the column associated with the label for that example\n", "\n", " Returns\n", " -------\n", " labels : list of length `N`\n", " The list of category labels corresponding to the nonzero columns in\n", " `Y`\n", " \"\"\"\n", " C = len(self.cat2idx)\n", " assert Y.ndim == 2, \"Y must be 2D, but has shape {}\".format(Y.shape)\n", " assert Y.shape[1] == C, \"Y must have {} columns, got {}\".format(C, Y.shape[1])\n", " return [self.idx2cat[ix] for ix in Y.nonzero()[1]]\n", "\n", "\n", "class Standardizer:\n", " def __init__(self, with_mean=True, with_std=True):\n", " \"\"\"\n", " Feature-wise standardization for vector inputs.\n", "\n", " Notes\n", " -----\n", " Due to the sensitivity of empirical mean and standard deviation\n", " calculations to extreme values, `Standardizer` cannot guarantee\n", " balanced feature scales in the presence of outliers. In particular,\n", " note that because outliers for each feature can have different\n", " magnitudes, the spread of the transformed data on each feature can be\n", " very different.\n", "\n", " Similar to sklearn, `Standardizer` uses a biased estimator for the\n", " standard deviation: ``numpy.std(x, ddof=0)``.\n", "\n", " Parameters\n", " ----------\n", " with_mean : bool\n", " Whether to scale samples to have 0 mean during transformation.\n", " Default is True.\n", " with_std : bool\n", " Whether to scale samples to have unit variance during\n", " transformation. Default is True.\n", " \"\"\"\n", " self.with_mean = with_mean\n", " self.with_std = with_std\n", " self._is_fit = False\n", "\n", " @property\n", " def hyperparameters(self):\n", " H = {\"with_mean\": self.with_mean, \"with_std\": self.with_std}\n", " return H\n", "\n", " @property\n", " def parameters(self):\n", " params = {\n", " \"mean\": self._mean if hasattr(self, \"mean\") else None,\n", " \"std\": self._std if hasattr(self, \"std\") else None,\n", " }\n", " return params\n", "\n", " def __call__(self, X):\n", " return self.transform(X)\n", "\n", " def fit(self, X):\n", " \"\"\"\n", " Store the feature-wise mean and standard deviation across the samples\n", " in `X` for future scaling.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " An array of N samples, each with dimensionality `C`\n", " \"\"\"\n", " if not isinstance(X, np.ndarray):\n", " X = np.array(X)\n", "\n", " if X.shape[0] < 2:\n", " raise ValueError(\"`X` must contain at least 2 samples\")\n", "\n", " std = np.ones(X.shape[1])\n", " mean = np.zeros(X.shape[1])\n", "\n", " if self.with_mean:\n", " mean = np.mean(X, axis=0)\n", "\n", " if self.with_std:\n", " std = np.std(X, axis=0, ddof=0)\n", "\n", " self._mean = mean\n", " self._std = std\n", " self._is_fit = True\n", "\n", " def transform(self, X):\n", " \"\"\"\n", " Standardize features by removing the mean and scaling to unit variance.\n", "\n", " For a sample `x`, the standardized score is calculated as:\n", "\n", " .. 
math::\n", "\n", " z = (x - u) / s\n", "\n", " where `u` is the mean of the training samples or zero if `with_mean` is\n", " False, and `s` is the standard deviation of the training samples or 1\n", " if `with_std` is False.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " An array of N samples, each with dimensionality `C`.\n", "\n", " Returns\n", " -------\n", " Z : :py:class:`ndarray ` of shape `(N, C)`\n", " The feature-wise standardized version of `X`.\n", " \"\"\"\n", " if not self._is_fit:\n", " raise Exception(\"Must call `fit` before using the `transform` method\")\n", " return (X - self._mean) / self._std\n", "\n", " def inverse_transform(self, Z):\n", " \"\"\"\n", " Convert a collection of standardized features back into the original\n", " feature space.\n", "\n", " For a standardized sample `z`, the unstandardized score is calculated as:\n", "\n", " .. math::\n", "\n", " x = z s + u\n", "\n", " where `u` is the mean of the training samples or zero if `with_mean` is\n", " False, and `s` is the standard deviation of the training samples or 1\n", " if `with_std` is False.\n", "\n", " Parameters\n", " ----------\n", " Z : :py:class:`ndarray ` of shape `(N, C)`\n", " An array of `N` standardized samples, each with dimensionality `C`.\n", "\n", " Returns\n", " -------\n", " X : :py:class:`ndarray ` of shape `(N, C)`\n", " The unstandardixed samples from `Z`.\n", " \"\"\"\n", " assert self._is_fit, \"Must fit `Standardizer` before calling inverse_transform\"\n", " P = self.parameters\n", " mean, std = P[\"mean\"], P[\"std\"]\n", " return Z * std + mean\n", "\n", "\n", "class FeatureHasher:\n", " def __init__(self, n_dim=256, sparse=True):\n", " \"\"\"\n", " Convert a collection of features to a fixed-dimensional matrix using\n", " the hashing trick.\n", "\n", " Notes\n", " -----\n", " Uses the md5 hash.\n", "\n", " Parameters\n", " ----------\n", " n_dim : int\n", " The dimensionality of each example in the output feature matrix.\n", " Small numbers of features are likely to cause hash collisions, but\n", " large numbers will cause larger overall parameter dimensions for\n", " any (linear) learning agent. Default is 256.\n", " sparse : bool\n", " Whether the resulting feature matrix should be a sparse\n", " :py:class:`csr_matrix ` or dense\n", " :py:class:`ndarray `. Default is True.\n", " \"\"\"\n", " self.n_dim = n_dim\n", " self.hash = hashlib.md5\n", " self.sparse = sparse and _SCIPY\n", "\n", " def encode(self, examples):\n", " \"\"\"\n", " Encode a collection of multi-featured examples into a\n", " `n_dim`-dimensional feature matrix via feature hashing.\n", "\n", " Notes\n", " -----\n", " Feature hashing works by applying a hash function to the features of an\n", " example and using the hash values as column indices in the resulting\n", " feature matrix. The entries at each hashed feature column correspond to\n", " the values for that example and feature. 
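A usage sketch for `Standardizer`; the toy matrix is an illustrative assumption. Per the docstrings, the inverse mapping is simply :math:`x = z s + u` with the stored feature-wise mean `u` and standard deviation `s`.

```python
import numpy as np

from numpy_ml.preprocessing.general import Standardizer

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])

S = Standardizer(with_mean=True, with_std=True)
S.fit(X)
Z = S.transform(X)

print(np.allclose(Z.mean(axis=0), 0))         # each feature is centered
print(np.allclose(Z.std(axis=0, ddof=0), 1))  # and scaled to unit (biased) variance
```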
For example, given the\n", " following two input examples:\n", "\n", " >>> examples = [\n", " {\"furry\": 1, \"quadruped\": 1, \"domesticated\": 1},\n", " {\"nocturnal\": 1, \"quadruped\": 1},\n", " ]\n", "\n", " and a hypothetical hash function `H` mapping strings to [0, 127], we have:\n", "\n", " >>> feature_mat = zeros(2, 128)\n", " >>> ex1_cols = [H(\"furry\"), H(\"quadruped\"), H(\"domesticated\")]\n", " >>> ex2_cols = [H(\"nocturnal\"), H(\"quadruped\")]\n", " >>> feat_mat[0, ex1_cols] = 1\n", " >>> feat_mat[1, ex2_cols] = 1\n", "\n", " To better handle hash collisions, it is common to multiply the feature\n", " value by the sign of the digest for the corresponding feature name.\n", "\n", " Parameters\n", " ----------\n", " examples : dict or list of dicts\n", " A collection of `N` examples, each represented as a dict where keys\n", " correspond to the feature name and values correspond to the feature\n", " value.\n", "\n", " Returns\n", " -------\n", " table : :py:class:`ndarray ` or :py:class:`csr_matrix ` of shape `(N, n_dim)`\n", " The encoded feature matrix\n", " \"\"\"\n", " if isinstance(examples, dict):\n", " examples = [examples]\n", "\n", " sparse = self.sparse\n", " return self._encode_sparse(examples) if sparse else self._encode_dense(examples)\n", "\n", " def _encode_dense(self, examples):\n", " N = len(examples)\n", " table = np.zeros(N, self.n_dim) # dense\n", "\n", " for row, feat_dict in enumerate(examples):\n", " for f_id, val in feat_dict.items():\n", " if isinstance(f_id, str):\n", " f_id = f_id.encode(\"utf-8\")\n", "\n", " # use json module to convert the feature id into a unique\n", " # string compatible with the buffer API (required by hashlib)\n", " if isinstance(f_id, (tuple, dict, list)):\n", " f_id = json.dumps(f_id, sort_keys=True).encode(\"utf-8\")\n", "\n", " h = int(self.hash(f_id).hexdigest(), base=16)\n", " col = h % self.n_dim\n", " table[row, col] += np.sign(h) * val\n", "\n", " return table\n", "\n", " def _encode_sparse(self, examples):\n", " N = len(examples)\n", " idxs, data = [], []\n", "\n", " for row, feat_dict in enumerate(examples):\n", " for f_id, val in feat_dict.items():\n", " if isinstance(f_id, str):\n", " f_id = f_id.encode(\"utf-8\")\n", "\n", " # use json module to convert the feature id into a unique\n", " # string compatible with the buffer API (required by hashlib)\n", " if isinstance(f_id, (tuple, dict, list)):\n", " f_id = json.dumps(f_id, sort_keys=True).encode(\"utf-8\")\n", "\n", " h = int(self.hash(f_id).hexdigest(), base=16)\n", " col = h % self.n_dim\n", " idxs.append((row, col))\n", " data.append(np.sign(h) * val)\n", "\n", " table = csr_matrix((data, zip(*idxs)), shape=(N, self.n_dim))\n", " return table\n"]} {"path": "numpy_ml/hmm/__init__.py", "content": ["from .hmm import *\n"]} {"path": "numpy_ml/hmm/hmm.py", "content": ["\"\"\"Hidden Markov model module\"\"\"\n", "\n", "import numpy as np\n", "from numpy_ml.utils.misc import logsumexp\n", "\n", "\n", "class MultinomialHMM:\n", " def __init__(self, A=None, B=None, pi=None, eps=None):\n", " r\"\"\"\n", " A simple hidden Markov model with multinomial emission distribution.\n", "\n", " Parameters\n", " ----------\n", " A : :py:class:`ndarray ` of shape `(N, N)` or None\n", " The transition matrix between latent states in the HMM. Index `i`,\n", " `j` gives the probability of transitioning from latent state `i` to\n", " latent state `j`. Default is None.\n", " B : :py:class:`ndarray ` of shape `(N, V)` or None\n", " The emission matrix. 
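A minimal dense sketch of the hashing trick described in the Notes above: the md5 digest of each feature name, taken modulo `n_dim`, selects the column, and the table is allocated with a shape tuple via `np.zeros((N, n_dim))`. The helper name and toy examples are assumptions for illustration, not the class's own `encode` method.

```python
import hashlib

import numpy as np

def toy_feature_hash(examples, n_dim=16):
    table = np.zeros((len(examples), n_dim))  # np.zeros expects a shape tuple
    for row, feats in enumerate(examples):
        for name, val in feats.items():
            h = int(hashlib.md5(name.encode("utf-8")).hexdigest(), base=16)
            # np.sign(h) mirrors the sign trick above; an md5 digest is
            # non-negative, so here it is always +1
            table[row, h % n_dim] += np.sign(h) * val
    return table

examples = [
    {"furry": 1, "quadruped": 1, "domesticated": 1},
    {"nocturnal": 1, "quadruped": 1},
]
T = toy_feature_hash(examples)
print(T.shape)                 # (2, 16)
print((T[0] != 0).sum() <= 3)  # at most 3 nonzero columns (fewer if hashes collide)
```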
Entry `i`, `j` gives the probability of latent\n", " state i emitting an observation of type `j`. Default is None.\n", " pi : :py:class:`ndarray ` of shape `(N,)` or None\n", " The prior probability of each latent state. If None, use a uniform\n", " prior over states. Default is None.\n", " eps : float or None\n", " Epsilon value to avoid :math:`\\log(0)` errors. If None, defaults to\n", " the machine epsilon. Default is None.\n", "\n", " Attributes\n", " ----------\n", " A : :py:class:`ndarray ` of shape `(N, N)`\n", " The transition matrix between latent states in the HMM. Index `i`,\n", " `j` gives the probability of transitioning from latent state `i` to\n", " latent state `j`.\n", " B : :py:class:`ndarray ` of shape `(N, V)`\n", " The emission matrix. Entry `i`, `j` gives the probability of latent\n", " state `i` emitting an observation of type `j`.\n", " N : int\n", " The number of unique latent states\n", " V : int\n", " The number of unique observation types\n", " O : :py:class:`ndarray ` of shape `(I, T)`\n", " The collection of observed training sequences.\n", " I : int\n", " The number of sequences in `O`.\n", " T : int\n", " The number of observations in each sequence in `O`.\n", " \"\"\"\n", " eps = np.finfo(float).eps if eps is None else eps\n", "\n", " # prior probability of each latent state\n", " if pi is not None:\n", " pi[pi == 0] = eps\n", "\n", " # number of latent state types\n", " N = None\n", " if A is not None:\n", " N = A.shape[0]\n", " A[A == 0] = eps\n", "\n", " # number of observation types\n", " V = None\n", " if B is not None:\n", " V = B.shape[1]\n", " B[B == 0] = eps\n", "\n", " self.parameters = {\n", " \"A\": A, # transition matrix\n", " \"B\": B, # emission matrix\n", " \"pi\": pi, # prior probability of each latent state\n", " }\n", "\n", " self.hyperparameters = {\n", " \"eps\": eps, # epsilon\n", " }\n", "\n", " self.derived_variables = {\n", " \"N\": N, # number of latent state types\n", " \"V\": V, # number of observation types\n", " }\n", "\n", " def generate(self, n_steps, latent_state_types, obs_types):\n", " \"\"\"\n", " Sample a sequence from the HMM.\n", "\n", " Parameters\n", " ----------\n", " n_steps : int\n", " The length of the generated sequence\n", " latent_state_types : :py:class:`ndarray ` of shape `(N,)`\n", " A collection of labels for the latent states\n", " obs_types : :py:class:`ndarray ` of shape `(V,)`\n", " A collection of labels for the observations\n", "\n", " Returns\n", " -------\n", " states : :py:class:`ndarray ` of shape `(n_steps,)`\n", " The sampled latent states.\n", " emissions : :py:class:`ndarray ` of shape `(n_steps,)`\n", " The sampled emissions.\n", " \"\"\"\n", " P = self.parameters\n", " A, B, pi = P[\"A\"], P[\"B\"], P[\"pi\"]\n", "\n", " # sample the initial latent state\n", " s = np.random.multinomial(1, pi).argmax()\n", " states = [latent_state_types[s]]\n", "\n", " # generate an emission given latent state\n", " v = np.random.multinomial(1, B[s, :]).argmax()\n", " emissions = [obs_types[v]]\n", "\n", " # sample a latent transition, rinse, and repeat\n", " for i in range(n_steps - 1):\n", " s = np.random.multinomial(1, A[s, :]).argmax()\n", " states.append(latent_state_types[s])\n", "\n", " v = np.random.multinomial(1, B[s, :]).argmax()\n", " emissions.append(obs_types[v])\n", "\n", " return np.array(states), np.array(emissions)\n", "\n", " def log_likelihood(self, O):\n", " r\"\"\"\n", " Given the HMM parameterized by :math:`(A`, B, \\pi)` and an observation\n", " sequence `O`, compute the marginal 
likelihood of `O`,\n", " :math:`P(O \\mid A,B,\\pi)`, by marginalizing over latent states.\n", "\n", " Notes\n", " -----\n", " The log likelihood is computed efficiently via DP using the forward\n", " algorithm, which produces a 2D trellis, ``forward`` (sometimes referred\n", " to as `alpha` in the literature), where entry `i`, `j` represents the\n", " probability under the HMM of being in latent state `i` after seeing the\n", " first `j` observations:\n", "\n", " .. math::\n", "\n", " \\mathtt{forward[i,j]} = P(o_1, \\ldots, o_j, q_j=i \\mid A, B, \\pi)\n", "\n", " Here :math:`q_j = i` indicates that the hidden state at time `j` is of\n", " type `i`.\n", "\n", " The DP step is:\n", "\n", " .. math::\n", "\n", " \\mathtt{forward[i,j]}\n", " &= \\sum_{s'=1}^N \\mathtt{forward[s',j-1]} \\cdot\n", " \\mathtt{A[s',i]} \\cdot \\mathtt{B[i,o_j]} \\\\\n", " &= \\sum_{s'=1}^N P(o_1, \\ldots, o_{j-1}, q_{j-1}=s' \\mid A, B, \\pi)\n", " P(q_j=i \\mid q_{j-1}=s') P(o_j \\mid q_j=i)\n", "\n", " In words, ``forward[i,j]`` is the weighted sum of the values computed on\n", " the previous timestep. The weight on each previous state value is the\n", " product of the probability of transitioning from that state to state `i`\n", " and the probability of emitting observation `j` in state `i`.\n", "\n", " Parameters\n", " ----------\n", " O : :py:class:`ndarray ` of shape `(1, T)`\n", " A single set of observations.\n", "\n", " Returns\n", " -------\n", " likelihood : float\n", " The likelihood of the observations `O` under the HMM.\n", " \"\"\"\n", " if O.ndim == 1:\n", " O = O.reshape(1, -1) # noqa: E741\n", "\n", " I, T = O.shape # noqa: E741\n", "\n", " if I != 1: # noqa: E741\n", " raise ValueError(\"Likelihood only accepts a single sequence\")\n", "\n", " forward = self._forward(O[0])\n", " log_likelihood = logsumexp(forward[:, T - 1])\n", " return log_likelihood\n", "\n", " def decode(self, O):\n", " r\"\"\"\n", " Given the HMM parameterized by :math:`(A, B, \\pi)` and an observation\n", " sequence :math:`O = o_1, \\ldots, o_T`, compute the most probable\n", " sequence of latent states, :math:`Q = q_1, \\ldots, q_T`.\n", "\n", " Notes\n", " -----\n", " HMM decoding is done efficiently via DP using the Viterbi algorithm,\n", " which produces a 2D trellis, ``viterbi``, where entry `i`, `j` represents the\n", " probability under the HMM of being in state `i` at time `j` after having\n", " passed through the *most probable* state sequence :math:`q_1,\\ldots,q_{j-1}`:\n", "\n", " .. math::\n", "\n", " \\mathtt{viterbi[i,j]} =\n", " \\max_{q_1, \\ldots, q_{j-1}}\n", " P(o_1, \\ldots, o_j, q_1, \\ldots, q_{j-1}, q_j=i \\mid A, B, \\pi)\n", "\n", " Here :math:`q_j = i` indicates that the hidden state at time `j` is of\n", " type `i`, and :math:`\\max_{q_1,\\ldots,q_{j-1}}` represents the maximum over\n", " all possible latent state sequences for the first `j-1` observations.\n", "\n", " The DP step is:\n", "\n", " .. math::\n", "\n", " \\mathtt{viterbi[i,j]} &=\n", " \\max_{s'=1}^N \\mathtt{viterbi[s',j-1]} \\cdot\n", " \\mathtt{A[s',i]} \\cdot \\mathtt{B[i,o_j]} \\\\\n", " &= \\max_{s'=1}^N\n", " P(o_1,\\ldots, o_j, q_1, \\ldots, q_{j-1}, q_j=i \\mid A, B, \\pi)\n", " P(q_j=i \\mid q_{j-1}=s') P(o_j \\mid q_j=i)\n", "\n", " In words, ``viterbi[i,j]`` is the weighted sum of the values computed\n", " on the previous timestep. 
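To see that the forward recursion above really computes the marginal probability P(O | A, B, pi), it can be checked against brute-force enumeration over all latent paths on a tiny model. A minimal sketch; the 2-state transition/emission matrices and the observation sequence are assumptions chosen for illustration:

import itertools

import numpy as np

A = np.array([[0.7, 0.3], [0.4, 0.6]])   # transition matrix
B = np.array([[0.9, 0.1], [0.2, 0.8]])   # emission matrix
pi = np.array([0.5, 0.5])                # prior over latent states
O = np.array([0, 1, 0])                  # observation sequence

model = MultinomialHMM(A=A, B=B, pi=pi)
log_p = model.log_likelihood(O)

# brute force: sum P(O, q) over every latent path q_1, ..., q_T
total = 0.0
for q in itertools.product(range(2), repeat=len(O)):
    p = pi[q[0]] * B[q[0], O[0]]
    for t in range(1, len(O)):
        p *= A[q[t - 1], q[t]] * B[q[t], O[t]]
    total += p

np.testing.assert_allclose(np.exp(log_p), total, rtol=1e-6)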
The weight on each value is the product of\n", " the probability of transitioning from that state to state `i` and the\n", " probability of emitting observation `j` in state `i`.\n", "\n", " To compute the most probable state sequence we maintain a second\n", " trellis, ``back_pointer``, whose `i`, `j` entry contains the value of the\n", " latent state at timestep `j-1` that is most likely to lead to latent\n", " state `i` at timestep `j`.\n", "\n", " When we have completed the ``viterbi`` and ``back_pointer`` trellises for\n", " all `T` timseteps/observations, we greedily move backwards through the\n", " ``back_pointer`` trellis to construct the best path for the full\n", " sequence of observations.\n", "\n", " Parameters\n", " ----------\n", " O : :py:class:`ndarray ` of shape `(T,)`\n", " An observation sequence of length `T`.\n", "\n", " Returns\n", " -------\n", " best_path : list of length `T`\n", " The most probable sequence of latent states for observations `O`.\n", " best_path_prob : float\n", " The probability of the latent state sequence in `best_path` under\n", " the HMM.\n", " \"\"\"\n", " P = self.parameters\n", " N = self.derived_variables[\"N\"]\n", " eps = self.hyperparameters[\"eps\"]\n", " A, B, pi = P[\"A\"], P[\"B\"], P[\"pi\"]\n", "\n", " if O.ndim == 1:\n", " O = O.reshape(1, -1) # noqa: E741\n", "\n", " # number of observations in each sequence\n", " T = O.shape[1]\n", "\n", " # number of training sequences\n", " I = O.shape[0] # noqa: E741\n", " if I != 1: # noqa: E741\n", " raise ValueError(\"Can only decode a single sequence (O.shape[0] must be 1)\")\n", "\n", " # initialize the viterbi and back_pointer matrices\n", " viterbi = np.zeros((N, T))\n", " back_pointer = np.zeros((N, T)).astype(int)\n", "\n", " ot = O[0, 0]\n", " for s in range(N):\n", " back_pointer[s, 0] = 0\n", " viterbi[s, 0] = np.log(pi[s] + eps) + np.log(B[s, ot] + eps)\n", "\n", " for t in range(1, T):\n", " ot = O[0, t]\n", " for s in range(N):\n", " seq_probs = [\n", " viterbi[s_, t - 1] + np.log(A[s_, s] + eps) + np.log(B[s, ot] + eps)\n", " for s_ in range(N)\n", " ]\n", "\n", " viterbi[s, t] = np.max(seq_probs)\n", " back_pointer[s, t] = np.argmax(seq_probs)\n", "\n", " best_path_log_prob = viterbi[:, T - 1].max()\n", "\n", " # backtrack through the trellis to get the most likely sequence of\n", " # latent states\n", " pointer = viterbi[:, T - 1].argmax()\n", " best_path = [pointer]\n", " for t in reversed(range(1, T)):\n", " pointer = back_pointer[pointer, t]\n", " best_path.append(pointer)\n", " best_path = best_path[::-1]\n", "\n", " return best_path, best_path_log_prob\n", "\n", " def _forward(self, Obs):\n", " r\"\"\"\n", " Computes the forward probability trellis for an HMM parameterized by\n", " :math:`(A, B, \\pi)`.\n", "\n", " Notes\n", " -----\n", " The forward trellis (sometimes referred to as `alpha` in the HMM\n", " literature), is a 2D array where entry `i`, `j` represents the probability\n", " under the HMM of being in latent state `i` after seeing the first `j`\n", " observations:\n", "\n", " .. math::\n", "\n", " \\mathtt{forward[i,j]} =\n", " P(o_1, \\ldots, o_j, q_j=i \\mid A, B, \\pi)\n", "\n", " Here :math:`q_j = i` indicates that the hidden state at time `j` is of\n", " type `i`.\n", "\n", " The DP step is::\n", "\n", " .. 
math::\n", "\n", " forward[i,j] &=\n", " \\sum_{s'=1}^N forward[s',j-1] \\times A[s',i] \\times B[i,o_j] \\\\\n", " &= \\sum_{s'=1}^N P(o_1, \\ldots, o_{j-1}, q_{j-1}=s' \\mid A, B, \\pi)\n", " \\times P(q_j=i \\mid q_{j-1}=s') \\times P(o_j \\mid q_j=i)\n", "\n", " In words, ``forward[i,j]`` is the weighted sum of the values computed\n", " on the previous timestep. The weight on each previous state value is\n", " the product of the probability of transitioning from that state to\n", " state `i` and the probability of emitting observation `j` in state `i`.\n", "\n", " Parameters\n", " ----------\n", " Obs : :py:class:`ndarray ` of shape `(T,)`\n", " An observation sequence of length `T`.\n", "\n", " Returns\n", " -------\n", " forward : :py:class:`ndarray ` of shape `(N, T)`\n", " The forward trellis.\n", " \"\"\"\n", " P = self.parameters\n", " N = self.derived_variables[\"N\"]\n", " eps = self.hyperparameters[\"eps\"]\n", " A, B, pi = P[\"A\"], P[\"B\"], P[\"pi\"]\n", "\n", " T = Obs.shape[0]\n", "\n", " # initialize the forward probability matrix\n", " forward = np.zeros((N, T))\n", "\n", " ot = Obs[0]\n", " for s in range(N):\n", " forward[s, 0] = np.log(pi[s] + eps) + np.log(B[s, ot] + eps)\n", "\n", " for t in range(1, T):\n", " ot = Obs[t]\n", " for s in range(N):\n", " forward[s, t] = logsumexp(\n", " [\n", " forward[s_, t - 1]\n", " + np.log(A[s_, s] + eps)\n", " + np.log(B[s, ot] + eps)\n", " for s_ in range(N)\n", " ] # noqa: C812\n", " )\n", " return forward\n", "\n", " def _backward(self, Obs):\n", " r\"\"\"\n", " Compute the backward probability trellis for an HMM parameterized by\n", " :math:`(A, B, \\pi)`.\n", "\n", " Notes\n", " -----\n", " The backward trellis (sometimes referred to as `beta` in the HMM\n", " literature), is a 2D array where entry `i`,`j` represents the probability\n", " of seeing the observations from time `j+1` onward given that the HMM is\n", " in state `i` at time `j`\n", "\n", " .. math::\n", "\n", " \\mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},\\ldots,o_T \\mid q_j=i,A,B,\\pi)\n", "\n", " Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`.\n", "\n", " The DP step is::\n", "\n", " backward[i,j] &=\n", " \\sum_{s'=1}^N backward[s',j+1] \\times A[i, s'] \\times B[s',o_{j+1}] \\\\\n", " &= \\sum_{s'=1}^N P(o_{j+1}, o_{j+2}, \\ldots, o_T \\mid q_j=i, A, B, pi)\n", " \\times P(q_{j+1}=s' \\mid q_{j}=i) \\times P(o_{j+1} \\mid q_{j+1}=s')\n", "\n", " In words, ``backward[i,j]`` is the weighted sum of the values computed\n", " on the following timestep. 
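For intuition, the inner double loop of ``_forward`` above can be collapsed into one vectorized update per timestep. A minimal sketch, assuming the `logsumexp` helper accepts an `axis` argument (it is used that way elsewhere in this module):

import numpy as np

from numpy_ml.utils.misc import logsumexp


def forward_step(prev_col, A, B, ot, eps=np.finfo(float).eps):
    """One vectorized step of the forward recursion in log space.

    `prev_col` holds the `(N,)` column of log forward probabilities at time
    `t - 1`; the return value is the corresponding column at time `t`.
    """
    # entry [s', s] is forward[s', t-1] + log A[s', s]
    trans = prev_col[:, None] + np.log(A + eps)
    # marginalize over the previous state s', then add the emission term
    return logsumexp(trans, axis=0) + np.log(B[:, ot] + eps)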
The weight on each state value from the\n", " `j+1`'th timestep is the product of the probability of transitioning from\n", " state i to that state and the probability of emitting observation `j+1`\n", " from that state.\n", "\n", " Parameters\n", " ----------\n", " Obs : :py:class:`ndarray ` of shape `(T,)`\n", " A single observation sequence of length `T`.\n", "\n", " Returns\n", " -------\n", " backward : :py:class:`ndarray ` of shape `(N, T)`\n", " The backward trellis.\n", " \"\"\"\n", " P = self.parameters\n", " A, B = P[\"A\"], P[\"B\"]\n", " N = self.derived_variables[\"N\"]\n", " eps = self.hyperparameters[\"eps\"]\n", "\n", " T = Obs.shape[0]\n", "\n", " # initialize the backward trellis\n", " backward = np.zeros((N, T))\n", "\n", " for s in range(N):\n", " backward[s, T - 1] = 0\n", "\n", " for t in reversed(range(T - 1)):\n", " ot1 = Obs[t + 1]\n", " for s in range(N):\n", " backward[s, t] = logsumexp(\n", " [\n", " np.log(A[s, s_] + eps)\n", " + np.log(B[s_, ot1] + eps)\n", " + backward[s_, t + 1]\n", " for s_ in range(N)\n", " ] # noqa: C812\n", " )\n", " return backward\n", "\n", " def _initialize_parameters(self):\n", " P = self.parameters\n", " A, B, pi = P[\"A\"], P[\"B\"], P[\"pi\"]\n", " N, V = self.derived_variables[\"N\"], self.derived_variables[\"V\"]\n", "\n", " # Uniform initialization of prior over latent states\n", " if pi is None:\n", " pi = np.ones(N)\n", " pi = pi / pi.sum()\n", "\n", " # Uniform initialization of A\n", " if A is None:\n", " A = np.ones((N, N))\n", " A = A / A.sum(axis=1)[:, None]\n", "\n", " # Random initialization of B\n", " if B is None:\n", " B = np.random.rand(N, V)\n", " B = B / B.sum(axis=1)[:, None]\n", "\n", " P[\"A\"], P[\"B\"], P[\"pi\"] = A, B, pi\n", "\n", " def fit(\n", " self,\n", " O,\n", " latent_state_types,\n", " observation_types,\n", " pi=None,\n", " tol=1e-5,\n", " verbose=False,\n", " ):\n", " \"\"\"\n", " Given an observation sequence `O` and the set of possible latent states,\n", " learn the MLE HMM parameters `A` and `B`.\n", "\n", " Notes\n", " -----\n", " Model fitting is done iterativly using the Baum-Welch/Forward-Backward\n", " algorithm, a special case of the EM algorithm.\n", "\n", " We begin with an intial estimate for the transition (`A`) and emission\n", " (`B`) matrices and then use these to derive better and better estimates\n", " by computing the forward probability for an observation and then\n", " dividing that probability mass among all the paths that contributed to\n", " it.\n", "\n", " Parameters\n", " ----------\n", " O : :py:class:`ndarray ` of shape `(I, T)`\n", " The set of `I` training observations, each of length `T`.\n", " latent_state_types : list of length `N`\n", " The collection of valid latent states.\n", " observation_types : list of length `V`\n", " The collection of valid observation states.\n", " pi : :py:class:`ndarray ` of shape `(N,)`\n", " The prior probability of each latent state. If None, assume each\n", " latent state is equally likely a priori. Default is None.\n", " tol : float\n", " The tolerance value. If the difference in log likelihood between\n", " two epochs is less than this value, terminate training. Default is\n", " 1e-5.\n", " verbose : bool\n", " Print training stats after each epoch. 
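The forward and backward trellises offer a useful sanity check: the sequence log likelihood computed from the last forward column should equal the one recovered by combining the first forward and backward columns. A minimal sketch that calls the private helpers purely for illustration; the toy parameters are assumptions:

import numpy as np

from numpy_ml.utils.misc import logsumexp

A = np.array([[0.6, 0.4], [0.3, 0.7]])
B = np.array([[0.5, 0.5], [0.1, 0.9]])
pi = np.array([0.8, 0.2])
Obs = np.array([1, 0, 1, 1])

model = MultinomialHMM(A=A, B=B, pi=pi)
fwd = model._forward(Obs)
bwd = model._backward(Obs)

ll_from_forward = logsumexp(fwd[:, -1])          # log P(O) via the forward trellis alone
ll_from_both = logsumexp(fwd[:, 0] + bwd[:, 0])  # log P(O) via forward * backward at t = 0
np.testing.assert_almost_equal(ll_from_forward, ll_from_both)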
Default is True.\n", "\n", " Returns\n", " -------\n", " A : :py:class:`ndarray ` of shape `(N, N)`\n", " The estimated transition matrix.\n", " B : :py:class:`ndarray ` of shape `(N, V)`\n", " The estimated emission matrix.\n", " pi : :py:class:`ndarray ` of shape `(N,)`\n", " The estimated prior probabilities of each latent state.\n", " \"\"\"\n", " # observations\n", " if O.ndim == 1:\n", " O = O.reshape(1, -1) # noqa: E741\n", "\n", " # number of training examples (I) and their lengths (T)\n", " I, T = O.shape\n", "\n", " # number of types of observation\n", " self.derived_variables[\"V\"] = len(observation_types)\n", "\n", " # number of latent state types\n", " self.derived_variables[\"N\"] = len(latent_state_types)\n", "\n", " self._initialize_parameters()\n", "\n", " P = self.parameters\n", "\n", " # iterate E and M steps until convergence criteria is met\n", " step, delta = 0, np.inf\n", " ll_prev = np.sum([self.log_likelihood(o) for o in O])\n", "\n", " while delta > tol:\n", " gamma, xi, phi = self._E_step(O)\n", " P[\"A\"], P[\"B\"], P[\"pi\"] = self._M_step(O, gamma, xi, phi)\n", " ll = np.sum([self.log_likelihood(o) for o in O])\n", " delta = ll - ll_prev\n", " ll_prev = ll\n", " step += 1\n", "\n", " if verbose:\n", " fstr = \"[Epoch {}] LL: {:.3f} Delta: {:.5f}\"\n", " print(fstr.format(step, ll_prev, delta))\n", "\n", " # return A, B, pi\n", "\n", " def _E_step(self, O):\n", " r\"\"\"\n", " Run a single E-step update for the Baum-Welch/Forward-Backward\n", " algorithm. This step estimates ``xi`` and ``gamma``, the excepted\n", " state-state transition counts and the expected state-occupancy counts,\n", " respectively.\n", "\n", " ``xi[i,j,k]`` gives the probability of being in state `i` at time `k`\n", " and state `j` at time `k+1` given the observed sequence `O` and the\n", " current estimates for transition (`A`) and emission (`B`) matrices::\n", "\n", " .. math::\n", "\n", " xi[i,j,k] &= P(q_k=i,q_{k+1}=j \\mid O,A,B,pi) \\\\\n", " &= \\frac{\n", " P(q_k=i,q_{k+1}=j,O \\mid A,B,pi)\n", " }{P(O \\mid A,B,pi)} \\\\\n", " &= \\frac{\n", " P(o_1,o_2,\\ldots,o_k,q_k=i \\mid A,B,pi) \\times\n", " P(q_{k+1}=j \\mid q_k=i) \\times\n", " P(o_{k+1} \\mid q_{k+1}=j) \\times\n", " P(o_{k+2},o_{k+3},\\ldots,o_T \\mid q_{k+1}=j,A,B,pi)\n", " }{P(O \\mid A,B,pi)} \\\\\n", " &= \\frac{\n", " \\mathtt{fwd[j, k] * self.A[j, i] *\n", " self.B[i, o_{k+1}] * bwd[i, k + 1]}\n", " }{\\mathtt{fwd[:, T].sum()}}\n", "\n", " The expected number of transitions from state `i` to state `j` across the\n", " entire sequence is then the sum over all timesteps: ``xi[i,j,:].sum()``.\n", "\n", " ``gamma[i,j]`` gives the probability of being in state `i` at time `j`\n", "\n", " .. 
math:: \\mathtt{gamma[i,j]} = P(q_j = i \\mid O, A, B, \\pi)\n", "\n", " Parameters\n", " ----------\n", " O : :py:class:`ndarray ` of shape `(I, T)`\n", " The set of `I` training observations, each of length `T`.\n", "\n", " Returns\n", " -------\n", " gamma : :py:class:`ndarray ` of shape `(I, N, T)`\n", " The estimated state-occupancy count matrix.\n", " xi : :py:class:`ndarray ` of shape `(I, N, N, T)`\n", " The estimated state-state transition count matrix.\n", " phi : :py:class:`ndarray ` of shape `(I, N)`\n", " The estimated prior counts for each latent state.\n", " \"\"\"\n", " I, T = O.shape\n", " P = self.parameters\n", " A, B = P[\"A\"], P[\"B\"]\n", " N = self.derived_variables[\"N\"]\n", " eps = self.hyperparameters[\"eps\"]\n", "\n", " phi = np.zeros((I, N))\n", " gamma = np.zeros((I, N, T))\n", " xi = np.zeros((I, N, N, T))\n", "\n", " for i in range(I):\n", " Obs = O[i, :]\n", " fwd = self._forward(Obs)\n", " bwd = self._backward(Obs)\n", " log_likelihood = logsumexp(fwd[:, T - 1])\n", "\n", " t = T - 1\n", " for si in range(N):\n", " gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood\n", " phi[i, si] = fwd[si, 0] + bwd[si, 0] - log_likelihood\n", "\n", " for t in range(T - 1):\n", " ot1 = Obs[t + 1]\n", " for si in range(N):\n", " gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood\n", " for sj in range(N):\n", " xi[i, si, sj, t] = (\n", " fwd[si, t]\n", " + np.log(A[si, sj] + eps)\n", " + np.log(B[sj, ot1] + eps)\n", " + bwd[sj, t + 1]\n", " - log_likelihood\n", " )\n", "\n", " return gamma, xi, phi\n", "\n", " def _M_step(self, O, gamma, xi, phi):\n", " \"\"\"\n", " Run a single M-step update for the Baum-Welch/Forward-Backward\n", " algorithm.\n", "\n", " Parameters\n", " ----------\n", " O : :py:class:`ndarray ` of shape `(I, T)`\n", " The set of `I` training observations, each of length `T`.\n", " gamma : :py:class:`ndarray ` of shape `(I, N, T)`\n", " The estimated state-occupancy count matrix.\n", " xi : :py:class:`ndarray ` of shape `(I, N, N, T)`\n", " The estimated state-state transition count matrix.\n", " phi : :py:class:`ndarray ` of shape `(I, N)`\n", " The estimated starting count matrix for each latent state.\n", "\n", " Returns\n", " -------\n", " A : :py:class:`ndarray ` of shape `(N, N)`\n", " The estimated transition matrix.\n", " B : :py:class:`ndarray ` of shape `(N, V)`\n", " The estimated emission matrix.\n", " pi : :py:class:`ndarray ` of shape `(N,)`\n", " The estimated prior probabilities for each latent state.\n", " \"\"\"\n", " I, T = O.shape\n", " P = self.parameters\n", " DV = self.derived_variables\n", " eps = self.hyperparameters[\"eps\"]\n", "\n", " N, V = DV[\"N\"], DV[\"V\"]\n", " A, B, pi = P[\"A\"], P[\"B\"], P[\"pi\"]\n", "\n", " # initialize the estimated transition (A) and emission (B) matrices\n", " A = np.zeros((N, N))\n", " B = np.zeros((N, V))\n", " pi = np.zeros(N)\n", "\n", " count_gamma = np.zeros((I, N, V))\n", " count_xi = np.zeros((I, N, N))\n", "\n", " for i in range(I):\n", " Obs = O[i, :]\n", " for si in range(N):\n", " for vk in range(V):\n", " if not (Obs == vk).any():\n", " count_gamma[i, si, vk] = np.log(eps)\n", " else:\n", " count_gamma[i, si, vk] = logsumexp(gamma[i, si, Obs == vk])\n", "\n", " for sj in range(N):\n", " count_xi[i, si, sj] = logsumexp(xi[i, si, sj, :])\n", "\n", " pi = logsumexp(phi, axis=0) - np.log(I + eps)\n", " np.testing.assert_almost_equal(np.exp(pi).sum(), 1)\n", "\n", " for si in range(N):\n", " for vk in range(V):\n", " B[si, vk] = logsumexp(count_gamma[:, si, vk]) - 
logsumexp(\n", " count_gamma[:, si, :] # noqa: C812\n", " )\n", "\n", " for sj in range(N):\n", " A[si, sj] = logsumexp(count_xi[:, si, sj]) - logsumexp(\n", " count_xi[:, si, :] # noqa: C812\n", " )\n", "\n", " np.testing.assert_almost_equal(np.exp(A[si, :]).sum(), 1)\n", " np.testing.assert_almost_equal(np.exp(B[si, :]).sum(), 1)\n", " return np.exp(A), np.exp(B), np.exp(pi)\n"]} {"path": "numpy_ml/linear_models/glm.py", "content": ["\"\"\"A module for the generalized linear model.\"\"\"\n", "import numpy as np\n", "\n", "from numpy_ml.linear_models.linear_regression import LinearRegression\n", "\n", "eps = np.finfo(float).eps\n", "\n", "_GLM_LINKS = {\n", " \"logit\": {\n", " \"link\": lambda mu: np.log((mu + eps) / (1 - mu + eps)),\n", " \"inv_link\": lambda eta: 1.0 / (1.0 + np.exp(-eta)),\n", " \"link_prime\": lambda x: (1 / (x + eps)) + (1 / (1 - x + eps)),\n", " \"theta\": lambda mu: np.log((mu + eps) / (1 - mu + eps)),\n", " \"phi\": lambda x: np.ones(x.shape[0]),\n", " \"a\": lambda phi: phi,\n", " \"b\": lambda theta: np.log(1 + np.exp(theta)),\n", " \"p\": 1,\n", " \"b_prime\": lambda theta: np.exp(theta) / (1 + np.exp(theta)),\n", " \"b_prime2\": lambda theta: np.exp(theta) / ((1 + np.exp(theta)) ** 2),\n", " },\n", " \"identity\": {\n", " \"link\": lambda mu: mu,\n", " \"inv_link\": lambda eta: eta,\n", " \"link_prime\": lambda x: np.ones_like(x),\n", " \"theta\": lambda mu: mu,\n", " \"phi\": lambda x: np.var(x, axis=0),\n", " \"a\": lambda phi: phi,\n", " \"b\": lambda theta: 0.5 * theta ** 2,\n", " \"p\": 1,\n", " \"b_prime\": lambda theta: theta,\n", " \"b_prime2\": lambda theta: np.ones_like(theta),\n", " },\n", " \"log\": {\n", " \"link\": lambda mu: np.log(mu + eps),\n", " \"inv_link\": lambda eta: np.exp(eta),\n", " \"link_prime\": lambda x: 1 / (x + eps),\n", " \"theta\": lambda mu: np.log(mu + eps),\n", " \"phi\": lambda x: np.ones(x.shape[0]),\n", " \"a\": lambda phi: phi,\n", " \"p\": 1,\n", " \"b\": lambda theta: np.exp(theta),\n", " \"b_prime\": lambda theta: np.exp(theta),\n", " \"b_prime2\": lambda theta: np.exp(theta),\n", " },\n", "}\n", "\n", "\n", "class GeneralizedLinearModel:\n", " def __init__(self, link, fit_intercept=True, tol=1e-5, max_iter=100):\n", " r\"\"\"\n", " A generalized linear model with maximum likelihood fit via\n", " iteratively reweighted least squares (IRLS).\n", "\n", " Notes\n", " -----\n", " The generalized linear model (GLM) [7]_ [8]_ assumes that each target/dependent\n", " variable :math:`y_i` in target vector :math:`\\mathbf{y} = (y_1, \\ldots,\n", " y_n)`, has been drawn independently from a pre-specified distribution\n", " in the exponential family [11]_ with unknown mean :math:`\\mu_i`. The GLM\n", " models a (one-to-one, continuous, differentiable) function, *g*, of\n", " this mean value as a linear combination of the model parameters\n", " :math:`\\mathbf{b}` and observed covariates, :math:`\\mathbf{x}_i`:\n", "\n", " .. math::\n", "\n", " g(\\mathbb{E}[y_i \\mid \\mathbf{x}_i]) =\n", " g(\\mu_i) = \\mathbf{b}^\\top \\mathbf{x}_i\n", "\n", " where *g* is known as the \"link function\" associated with the GLM. The\n", " choice of link function is informed by the instance of the exponential\n", " family the target is drawn from. Common examples:\n", "\n", " .. 
csv-table::\n", " :header: \"Distribution\", \"Link\", \"Formula\"\n", " :widths: 25, 20, 30\n", "\n", " \"Normal\", \"Identity\", \":math:`g(x) = x`\"\n", " \"Bernoulli\", \"Logit\", \":math:`g(x) = \\log(x) - \\log(1 - x)`\"\n", " \"Binomial\", \"Logit\", \":math:`g(x) = \\log(x) - \\log(n - x)`\"\n", " \"Poisson\", \"Log\", \":math:`g(x) = \\log(x)`\"\n", "\n", " An iteratively re-weighted least squares (IRLS) algorithm [9]_ can be\n", " employed to find the maximum likelihood estimate for the model\n", " parameters :math:`\\beta` in any instance of the generalized linear\n", " model. IRLS is equivalent to Fisher scoring [10]_, which itself is\n", " a slight modification of classic Newton-Raphson for finding the zeros\n", " of the first derivative of the model log-likelihood.\n", "\n", " References\n", " ----------\n", " .. [7] Nelder, J., & Wedderburn, R. (1972). Generalized linear\n", " models. *Journal of the Royal Statistical Society, Series A\n", " (General), 135(3)*: 370\u2013384.\n", " .. [8] https://en.wikipedia.org/wiki/Generalized_linear_model\n", " .. [9] https://en.wikipedia.org/wiki/Iteratively_reweighted_least_squares\n", " .. [10] https://en.wikipedia.org/wiki/Scoring_algorithm\n", " .. [11] https://en.wikipedia.org/wiki/Exponential_family\n", "\n", " Parameters\n", " ----------\n", " link: {'identity', 'logit', 'log'}\n", " The link function to use during modeling.\n", " fit_intercept: bool\n", " Whether to fit an intercept term in addition to the model\n", " coefficients. Default is True.\n", " tol : float\n", " The minimum difference between successive iterations of IRLS\n", " Default is 1e-5.\n", " max_iter: int\n", " The maximum number of iteratively reweighted least squares\n", " iterations to run during fitting. Default is 100.\n", "\n", " Attributes\n", " ----------\n", " beta : :py:class:`ndarray ` of shape `(M, 1)` or None\n", " Fitted model coefficients.\n", " \"\"\"\n", " err_str = f\"Valid link functions are {list(_GLM_LINKS.keys())} but got {link}\"\n", " assert link in _GLM_LINKS, err_str\n", "\n", " self._is_fit = False\n", "\n", " self.tol = tol\n", " self.link = link\n", " self.beta = None\n", " self.max_iter = max_iter\n", " self.fit_intercept = fit_intercept\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Find the maximum likelihood GLM coefficients via IRLS.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`.\n", " y : :py:class:`ndarray ` of shape `(N,)`\n", " The targets for each of the `N` examples in `X`.\n", "\n", " Returns\n", " -------\n", " self : :class:`GeneralizedLinearModel ` instance\n", " \"\"\" # noqa: E501\n", " y = np.squeeze(y)\n", " assert y.ndim == 1\n", "\n", " N, M = X.shape\n", " L = _GLM_LINKS[self.link]\n", "\n", " # starting values for parameters\n", " mu = np.ones_like(y) * np.mean(y)\n", " eta = L[\"link\"](mu)\n", " theta = L[\"theta\"](mu)\n", "\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(N), X]\n", "\n", " # IRLS for GLM\n", " i = 0\n", " diff, beta = np.inf, np.inf\n", " while diff > (self.tol * M):\n", " if i > self.max_iter:\n", " print(\"Warning: Model did not converge\")\n", " break\n", "\n", " # compute first-order Taylor approx.\n", " z = eta + (y - mu) * L[\"link_prime\"](mu)\n", " w = L[\"p\"] / (L[\"b_prime2\"](theta) * L[\"link_prime\"](mu) ** 2)\n", "\n", " # perform weighted least-squares on z\n", " wlr = 
LinearRegression(fit_intercept=False)\n", " beta_new = wlr.fit(X, z, weights=w).beta.ravel()\n", "\n", " eta = X @ beta_new\n", " mu = L[\"inv_link\"](eta)\n", " theta = L[\"theta\"](mu)\n", "\n", " diff = np.linalg.norm(beta - beta_new, ord=1)\n", " beta = beta_new\n", " i += 1\n", "\n", " self.beta = beta\n", " self._is_fit = True\n", " return self\n", "\n", " def predict(self, X):\n", " r\"\"\"\n", " Use the trained model to generate predictions for the distribution\n", " means, :math:`\\mu`, associated with the collection of data points in\n", " **X**.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(Z, M)`\n", " A dataset consisting of `Z` new examples, each of dimension `M`.\n", "\n", " Returns\n", " -------\n", " mu_pred : :py:class:`ndarray ` of shape `(Z,)`\n", " The model predictions for the expected value of the target\n", " associated with each item in `X`.\n", " \"\"\"\n", " assert self._is_fit, \"Must call `fit` before generating predictions\"\n", " L = _GLM_LINKS[self.link]\n", "\n", " # convert X to a design matrix if we're using an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " mu_pred = L[\"inv_link\"](X @ self.beta)\n", " return mu_pred.ravel()\n"]} {"path": "numpy_ml/linear_models/logistic.py", "content": ["\"\"\"Logistic regression module\"\"\"\n", "import numpy as np\n", "\n", "\n", "class LogisticRegression:\n", " def __init__(self, penalty=\"l2\", gamma=0, fit_intercept=True):\n", " r\"\"\"\n", " A simple binary logistic regression model fit via gradient descent on\n", " the penalized negative log likelihood.\n", "\n", " Notes\n", " -----\n", " In simple binary logistic regression, the entries in a binary target\n", " vector :math:`\\mathbf{y} = (y_1, \\ldots, y_N)` are assumed to have been\n", " drawn from a series of independent Bernoulli random variables with\n", " expected values :math:`p_1, \\ldots, p_N`. The binary logistic regession\n", " model models the logit of these unknown mean parameters as a linear\n", " function of the model coefficients, :math:`\\mathbf{b}`, and the\n", " covariates for the corresponding example, :math:`\\mathbf{x}_i`:\n", "\n", " .. math::\n", "\n", " \\text{Logit}(p_i) =\n", " \\log \\left( \\frac{p_i}{1 - p_i} \\right) = \\mathbf{b}^\\top\\mathbf{x}_i\n", "\n", " The model predictions :math:`\\hat{\\mathbf{y}}` are the expected values\n", " of the Bernoulli parameters for each example:\n", "\n", " .. math::\n", "\n", " \\hat{y}_i =\n", " \\mathbb{E}[y_i \\mid \\mathbf{x}_i] = \\sigma(\\mathbf{b}^\\top \\mathbf{x}_i)\n", "\n", " where :math:`\\sigma` is the logistic sigmoid function :math:`\\sigma(x)\n", " = \\frac{1}{1 + e^{-x}}`. Under this model, the (penalized) negative log\n", " likelihood of the targets **y** is\n", "\n", " .. math::\n", "\n", " - \\log \\mathcal{L}(\\mathbf{b}, \\mathbf{y}) = -\\frac{1}{N} \\left[\n", " \\left(\n", " \\sum_{i=0}^N y_i \\log(\\hat{y}_i) +\n", " (1-y_i) \\log(1-\\hat{y}_i)\n", " \\right) - R(\\mathbf{b}, \\gamma)\n", " \\right]\n", "\n", " where\n", "\n", " .. 
math::\n", "\n", " R(\\mathbf{b}, \\gamma) = \\left\\{\n", " \\begin{array}{lr}\n", " \\frac{\\gamma}{2} ||\\mathbf{b}||_2^2 & :\\texttt{ penalty = 'l2'}\\\\\n", " \\gamma ||\\mathbf{b}||_1 & :\\texttt{ penalty = 'l1'}\n", " \\end{array}\n", " \\right.\n", "\n", " is a regularization penalty, :math:`\\gamma` is a regularization weight,\n", " `N` is the number of examples in **y**, :math:`\\hat{y}_i` is the model\n", " prediction on example *i*, and **b** is the vector of model\n", " coefficients.\n", "\n", " Parameters\n", " ----------\n", " penalty : {'l1', 'l2'}\n", " The type of regularization penalty to apply on the coefficients\n", " `beta`. Default is 'l2'.\n", " gamma : float\n", " The regularization weight. Larger values correspond to larger\n", " regularization penalties, and a value of 0 indicates no penalty.\n", " Default is 0.\n", " fit_intercept : bool\n", " Whether to fit an intercept term in addition to the coefficients in\n", " b. If True, the estimates for `beta` will have `M + 1` dimensions,\n", " where the first dimension corresponds to the intercept. Default is\n", " True.\n", "\n", " Attributes\n", " ----------\n", " beta : :py:class:`ndarray ` of shape `(M, 1)` or None\n", " Fitted model coefficients.\n", " \"\"\"\n", " err_msg = \"penalty must be 'l1' or 'l2', but got: {}\".format(penalty)\n", " assert penalty in [\"l2\", \"l1\"], err_msg\n", " self.beta = None\n", " self.gamma = gamma\n", " self.penalty = penalty\n", " self.fit_intercept = fit_intercept\n", "\n", " def fit(self, X, y, lr=0.01, tol=1e-7, max_iter=1e7):\n", " \"\"\"\n", " Fit the regression coefficients via gradient descent on the negative\n", " log likelihood.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`.\n", " y : :py:class:`ndarray ` of shape `(N,)`\n", " The binary targets for each of the `N` examples in `X`.\n", " lr : float\n", " The gradient descent learning rate. Default is 1e-7.\n", " max_iter : float\n", " The maximum number of iterations to run the gradient descent\n", " solver. Default is 1e7.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " l_prev = np.inf\n", " self.beta = np.random.rand(X.shape[1])\n", " for _ in range(int(max_iter)):\n", " y_pred = _sigmoid(X @ self.beta)\n", " loss = self._NLL(X, y, y_pred)\n", " if l_prev - loss < tol:\n", " return\n", " l_prev = loss\n", " self.beta -= lr * self._NLL_grad(X, y, y_pred)\n", "\n", " def _NLL(self, X, y, y_pred):\n", " r\"\"\"\n", " Penalized negative log likelihood of the targets under the current\n", " model.\n", "\n", " .. 
math::\n", "\n", " \\text{NLL} = -\\frac{1}{N} \\left[\n", " \\left(\n", " \\sum_{i=0}^N y_i \\log(\\hat{y}_i) + (1-y_i) \\log(1-\\hat{y}_i)\n", " \\right) - R(\\mathbf{b}, \\gamma)\n", " \\right]\n", " \"\"\"\n", " N, M = X.shape\n", " beta, gamma = self.beta, self.gamma\n", " order = 2 if self.penalty == \"l2\" else 1\n", " norm_beta = np.linalg.norm(beta, ord=order)\n", "\n", " nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()\n", " penalty = (gamma / 2) * norm_beta ** 2 if order == 2 else gamma * norm_beta\n", " return (penalty + nll) / N\n", "\n", " def _NLL_grad(self, X, y, y_pred):\n", " \"\"\"Gradient of the penalized negative log likelihood wrt beta\"\"\"\n", " N, M = X.shape\n", " p, beta, gamma = self.penalty, self.beta, self.gamma\n", " d_penalty = gamma * beta if p == \"l2\" else gamma * np.sign(beta)\n", " return -((y - y_pred) @ X + d_penalty) / N\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Use the trained model to generate prediction probabilities on a new\n", " collection of data points.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(Z, M)`\n", " A dataset consisting of `Z` new examples, each of dimension `M`.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(Z,)`\n", " The model prediction probabilities for the items in `X`.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", " return _sigmoid(X @ self.beta)\n", "\n", "\n", "def _sigmoid(x):\n", " \"\"\"The logistic sigmoid function\"\"\"\n", " return 1 / (1 + np.exp(-x))\n"]} {"path": "numpy_ml/linear_models/__init__.py", "content": ["\"\"\"A module containing assorted linear models.\"\"\"\n", "\n", "from .ridge import RidgeRegression\n", "from .glm import GeneralizedLinearModel\n", "from .logistic import LogisticRegression\n", "from .bayesian_regression import (\n", " BayesianLinearRegressionKnownVariance,\n", " BayesianLinearRegressionUnknownVariance,\n", ")\n", "from .naive_bayes import GaussianNBClassifier\n", "from .linear_regression import LinearRegression\n"]} {"path": "numpy_ml/linear_models/naive_bayes.py", "content": ["\"\"\"A module for naive Bayes classifiers\"\"\"\n", "import numpy as np\n", "\n", "\n", "class GaussianNBClassifier:\n", " def __init__(self, eps=1e-6):\n", " r\"\"\"\n", " A naive Bayes classifier for real-valued data.\n", "\n", " Notes\n", " -----\n", " The naive Bayes model assumes the features of each training example\n", " :math:`\\mathbf{x}` are mutually independent given the example label\n", " *y*:\n", "\n", " .. math::\n", "\n", " P(\\mathbf{x}_i \\mid y_i) = \\prod_{j=1}^M P(x_{i,j} \\mid y_i)\n", "\n", " where :math:`M` is the rank of the :math:`i^{th}` example\n", " :math:`\\mathbf{x}_i` and :math:`y_i` is the label associated with the\n", " :math:`i^{th}` example.\n", "\n", " Combining the conditional independence assumption with a simple\n", " application of Bayes' theorem gives the naive Bayes classification\n", " rule:\n", "\n", " .. 
math::\n", "\n", " \\hat{y} &= \\arg \\max_y P(y \\mid \\mathbf{x}) \\\\\n", " &= \\arg \\max_y P(y) P(\\mathbf{x} \\mid y) \\\\\n", " &= \\arg \\max_y P(y) \\prod_{j=1}^M P(x_j \\mid y)\n", "\n", " In the final expression, the prior class probability :math:`P(y)` can\n", " be specified in advance or estimated empirically from the training\n", " data.\n", "\n", " In the Gaussian version of the naive Bayes model, the feature\n", " likelihood is assumed to be normally distributed for each class:\n", "\n", " .. math::\n", "\n", " \\mathbf{x}_i \\mid y_i = c, \\theta \\sim \\mathcal{N}(\\mu_c, \\Sigma_c)\n", "\n", " where :math:`\\theta` is the set of model parameters: :math:`\\{\\mu_1,\n", " \\Sigma_1, \\ldots, \\mu_K, \\Sigma_K\\}`, :math:`K` is the total number of\n", " unique classes present in the data, and the parameters for the Gaussian\n", " associated with class :math:`c`, :math:`\\mu_c` and :math:`\\Sigma_c`\n", " (where :math:`1 \\leq c \\leq K`), are estimated via MLE from the set of\n", " training examples with label :math:`c`.\n", "\n", " Parameters\n", " ----------\n", " eps : float\n", " A value added to the variance to prevent numerical error. Default\n", " is 1e-6.\n", "\n", " Attributes\n", " ----------\n", " parameters : dict\n", " Dictionary of model parameters: \"mean\", the `(K, M)` array of\n", " feature means under each class, \"sigma\", the `(K, M)` array of\n", " feature variances under each class, and \"prior\", the `(K,)` array of\n", " empirical prior probabilities for each class label.\n", " hyperparameters : dict\n", " Dictionary of model hyperparameters\n", " labels : :py:class:`ndarray ` of shape `(K,)`\n", " An array containing the unique class labels for the training\n", " examples.\n", " \"\"\"\n", " self.labels = None\n", " self.hyperparameters = {\"eps\": eps}\n", " self.parameters = {\n", " \"mean\": None, # shape: (K, M)\n", " \"sigma\": None, # shape: (K, M)\n", " \"prior\": None, # shape: (K,)\n", " }\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Fit the model parameters via maximum likelihood.\n", "\n", " Notes\n", " -----\n", " The model parameters are stored in the :py:attr:`parameters\n", " ` attribute.\n", " The following keys are present:\n", "\n", " \"mean\": :py:class:`ndarray ` of shape `(K, M)`\n", " Feature means for each of the `K` label classes\n", " \"sigma\": :py:class:`ndarray ` of shape `(K, M)`\n", " Feature variances for each of the `K` label classes\n", " \"prior\": :py:class:`ndarray ` of shape `(K,)`\n", " Prior probability of each of the `K` label classes, estimated\n", " empirically from the training data\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`\n", " y: :py:class:`ndarray ` of shape `(N,)`\n", " The class label for each of the `N` examples in `X`\n", "\n", " Returns\n", " -------\n", " self : :class:`GaussianNBClassifier ` instance\n", " \"\"\" # noqa: E501\n", " P = self.parameters\n", " H = self.hyperparameters\n", "\n", " self.labels = np.unique(y)\n", "\n", " K = len(self.labels)\n", " N, M = X.shape\n", "\n", " P[\"mean\"] = np.zeros((K, M))\n", " P[\"sigma\"] = np.zeros((K, M))\n", " P[\"prior\"] = np.zeros((K,))\n", "\n", " for i, c in enumerate(self.labels):\n", " X_c = X[y == c, :]\n", "\n", " P[\"mean\"][i, :] = np.mean(X_c, axis=0)\n", " P[\"sigma\"][i, :] = np.var(X_c, axis=0) + H[\"eps\"]\n", " P[\"prior\"][i] = X_c.shape[0] / N\n", " return self\n", "\n", " def predict(self, X):\n", " \"\"\"\n", 
" Use the trained classifier to predict the class label for each example\n", " in **X**.\n", "\n", " Parameters\n", " ----------\n", " X: :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset of `N` examples, each of dimension `M`\n", "\n", " Returns\n", " -------\n", " labels : :py:class:`ndarray ` of shape `(N)`\n", " The predicted class labels for each example in `X`\n", " \"\"\"\n", " return self.labels[self._log_posterior(X).argmax(axis=1)]\n", "\n", " def _log_posterior(self, X):\n", " r\"\"\"\n", " Compute the (unnormalized) log posterior for each class.\n", "\n", " Parameters\n", " ----------\n", " X: :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset of `N` examples, each of dimension `M`\n", "\n", " Returns\n", " -------\n", " log_posterior : :py:class:`ndarray ` of shape `(N, K)`\n", " Unnormalized log posterior probability of each class for each\n", " example in `X`\n", " \"\"\"\n", " K = len(self.labels)\n", " log_posterior = np.zeros((X.shape[0], K))\n", " for i in range(K):\n", " log_posterior[:, i] = self._log_class_posterior(X, i)\n", " return log_posterior\n", "\n", " def _log_class_posterior(self, X, class_idx):\n", " r\"\"\"\n", " Compute the (unnormalized) log posterior for the label at index\n", " `class_idx` in :py:attr:`labels `.\n", "\n", " Notes\n", " -----\n", " Unnormalized log posterior for example :math:`\\mathbf{x}_i` and class\n", " :math:`c` is::\n", "\n", " .. math::\n", "\n", " \\log P(y_i = c \\mid \\mathbf{x}_i, \\theta)\n", " &\\propto \\log P(y=c \\mid \\theta) +\n", " \\log P(\\mathbf{x}_i \\mid y_i = c, \\theta) \\\\\n", " &\\propto \\log P(y=c \\mid \\theta)\n", " \\sum{j=1}^M \\log P(x_j \\mid y_i = c, \\theta)\n", "\n", " In the Gaussian naive Bayes model, the feature likelihood for class\n", " :math:`c`, :math:`P(\\mathbf{x}_i \\mid y_i = c, \\theta)` is assumed to\n", " be normally distributed\n", "\n", " .. math::\n", "\n", " \\mathbf{x}_i \\mid y_i = c, \\theta \\sim \\mathcal{N}(\\mu_c, \\Sigma_c)\n", "\n", " Parameters\n", " ----------\n", " X: :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset of `N` examples, each of dimension `M`\n", " class_idx : int\n", " The index of the current class in :py:attr:`labels`\n", "\n", " Returns\n", " -------\n", " log_class_posterior : :py:class:`ndarray ` of shape `(N,)`\n", " Unnormalized log probability of the label at index `class_idx`\n", " in :py:attr:`labels `\n", " for each example in `X`\n", " \"\"\" # noqa: E501\n", " P = self.parameters\n", " mu = P[\"mean\"][class_idx]\n", " prior = P[\"prior\"][class_idx]\n", " sigsq = P[\"sigma\"][class_idx]\n", "\n", " # log likelihood = log X | N(mu, sigsq)\n", " log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))\n", " log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)\n", " return log_likelihood + np.log(prior)\n"]} {"path": "numpy_ml/linear_models/bayesian_regression.py", "content": ["\"\"\"A module of Bayesian linear regression models.\"\"\"\n", "import numpy as np\n", "import scipy.stats as stats\n", "\n", "from numpy_ml.utils.testing import is_number, is_symmetric_positive_definite\n", "\n", "\n", "class BayesianLinearRegressionUnknownVariance:\n", " def __init__(self, alpha=1, beta=2, mu=0, V=None, fit_intercept=True):\n", " r\"\"\"\n", " Bayesian linear regression model with unknown variance. 
Assumes a\n", " conjugate normal-inverse-gamma joint prior on the model parameters and\n", " error variance.\n", "\n", " Notes\n", " -----\n", " The current model uses a conjugate normal-inverse-gamma joint prior on\n", " model parameters **b** and error variance :math:`\\sigma^2`. The joint\n", " and marginal posteriors over each are:\n", "\n", " .. math::\n", "\n", " \\mathbf{b}, \\sigma^2 &\\sim\n", " \\text{N-\\Gamma^{-1}}(\\mu, \\mathbf{V}^{-1}, \\alpha, \\beta) \\\\\n", " \\sigma^2 &\\sim \\text{InverseGamma}(\\alpha, \\beta) \\\\\n", " \\mathbf{b} \\mid \\sigma^2 &\\sim \\mathcal{N}(\\mu, \\sigma^2 \\mathbf{V})\n", "\n", " Parameters\n", " ----------\n", " alpha : float\n", " The shape parameter for the Inverse-Gamma prior on\n", " :math:`\\sigma^2`. Must be strictly greater than 0. Default is 1.\n", " beta : float\n", " The scale parameter for the Inverse-Gamma prior on\n", " :math:`\\sigma^2`. Must be strictly greater than 0. Default is 1.\n", " mu : :py:class:`ndarray ` of shape `(M,)` or float\n", " The mean of the Gaussian prior on `b`. If a float, assume `mu`\n", " is ``np.ones(M) * mu``. Default is 0.\n", " V : :py:class:`ndarray ` of shape `(N, N)` or `(N,)` or None\n", " A symmetric positive definite matrix that when multiplied\n", " element-wise by :math:`\\sigma^2` gives the covariance matrix for\n", " the Gaussian prior on `b`. If a list, assume ``V = diag(V)``. If\n", " None, assume `V` is the identity matrix. Default is None.\n", " fit_intercept : bool\n", " Whether to fit an intercept term in addition to the coefficients in\n", " b. If True, the estimates for b will have `M + 1` dimensions, where\n", " the first dimension corresponds to the intercept. Default is True.\n", "\n", " Attributes\n", " ----------\n", " posterior : dict or None\n", " Frozen random variables for the posterior distributions\n", " :math:`P(\\sigma^2 \\mid X)` and :math:`P(b \\mid X, \\sigma^2)`.\n", " posterior_predictive : dict or None\n", " Frozen random variable for the posterior predictive distribution,\n", " :math:`P(y \\mid X)`. 
This value is only set following a call to\n", " :meth:`predict `.\n", " \"\"\" # noqa: E501\n", " # this is a placeholder until we know the dimensions of X\n", " V = 1.0 if V is None else V\n", "\n", " if isinstance(V, list):\n", " V = np.array(V)\n", "\n", " if isinstance(V, np.ndarray):\n", " if V.ndim == 1:\n", " V = np.diag(V)\n", " elif V.ndim == 2:\n", " fstr = \"V must be symmetric positive definite\"\n", " assert is_symmetric_positive_definite(V), fstr\n", "\n", " self.V = V\n", " self.mu = mu\n", " self.beta = beta\n", " self.alpha = alpha\n", " self.fit_intercept = fit_intercept\n", "\n", " self.posterior = None\n", " self.posterior_predictive = None\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Compute the posterior over model parameters using the data in `X` and\n", " `y`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`.\n", " y : :py:class:`ndarray ` of shape `(N, K)`\n", " The targets for each of the `N` examples in `X`, where each target\n", " has dimension `K`.\n", "\n", " Returns\n", " -------\n", " self : :class:`BayesianLinearRegressionUnknownVariance` instance\n", " \"\"\" # noqa: E501\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " N, M = X.shape\n", " alpha, beta, V, mu = self.alpha, self.beta, self.V, self.mu\n", "\n", " if is_number(V):\n", " V *= np.eye(M)\n", "\n", " if is_number(mu):\n", " mu *= np.ones(M)\n", "\n", " # sigma\n", " I = np.eye(N) # noqa: E741\n", " a = y - (X @ mu)\n", " b = np.linalg.inv(X @ V @ X.T + I)\n", " c = y - (X @ mu)\n", "\n", " shape = N + alpha\n", " sigma = (1 / shape) * (alpha * beta ** 2 + a @ b @ c)\n", " scale = sigma ** 2\n", "\n", " # sigma is the mode of the inverse gamma prior on sigma^2\n", " sigma = scale / (shape - 1)\n", "\n", " # mean\n", " V_inv = np.linalg.inv(V)\n", " L = np.linalg.inv(V_inv + X.T @ X)\n", " R = V_inv @ mu + X.T @ y\n", "\n", " mu = L @ R\n", " cov = L * sigma\n", "\n", " # posterior distribution for sigma^2 and b\n", " self.posterior = {\n", " \"sigma**2\": stats.distributions.invgamma(a=shape, scale=scale),\n", " \"b | sigma**2\": stats.multivariate_normal(mean=mu, cov=cov),\n", " }\n", " return self\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Return the MAP prediction for the targets associated with `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(Z, M)`\n", " A dataset consisting of `Z` new examples, each of dimension `M`.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(Z, K)`\n", " The model predictions for the items in `X`.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " I = np.eye(X.shape[0]) # noqa: E741\n", " mu = X @ self.posterior[\"b | sigma**2\"].mean\n", " cov = X @ self.posterior[\"b | sigma**2\"].cov @ X.T + I\n", "\n", " # MAP estimate for y corresponds to the mean of the posterior\n", " # predictive\n", " self.posterior_predictive = stats.multivariate_normal(mu, cov)\n", " return mu\n", "\n", "\n", "class BayesianLinearRegressionKnownVariance:\n", " def __init__(self, mu=0, sigma=1, V=None, fit_intercept=True):\n", " r\"\"\"\n", " Bayesian linear regression model with known error variance and\n", " conjugate Gaussian prior on model parameters.\n", "\n", " Notes\n", " -----\n", " Uses 
a conjugate Gaussian prior on the model coefficients **b**. The\n", " posterior over model coefficients is then\n", "\n", " .. math::\n", "\n", " \\mathbf{b} \\mid \\mu, \\sigma^2, \\mathbf{V}\n", " \\sim \\mathcal{N}(\\mu, \\sigma^2 \\mathbf{V})\n", "\n", " Ridge regression is a special case of this model where :math:`\\mu =\n", " \\mathbf{0}`, :math:`\\sigma = 1` and :math:`\\mathbf{V} = \\mathbf{I}`\n", " (ie., the prior on the model coefficients **b** is a zero-mean, unit\n", " covariance Gaussian).\n", "\n", " Parameters\n", " ----------\n", " mu : :py:class:`ndarray ` of shape `(M,)` or float\n", " The mean of the Gaussian prior on `b`. If a float, assume `mu` is\n", " ``np.ones(M) * mu``. Default is 0.\n", " sigma : float\n", " The square root of the scaling term for covariance of the Gaussian\n", " prior on `b`. Default is 1.\n", " V : :py:class:`ndarray ` of shape `(N,N)` or `(N,)` or None\n", " A symmetric positive definite matrix that when multiplied\n", " element-wise by ``sigma ** 2`` gives the covariance matrix for the\n", " Gaussian prior on `b`. If a list, assume ``V = diag(V)``. If None,\n", " assume `V` is the identity matrix. Default is None.\n", " fit_intercept : bool\n", " Whether to fit an intercept term in addition to the coefficients in\n", " `b`. If True, the estimates for `b` will have `M + 1` dimensions, where\n", " the first dimension corresponds to the intercept. Default is True.\n", "\n", " Attributes\n", " ----------\n", " posterior : dict or None\n", " Frozen random variable for the posterior distribution :math:`P(b\n", " \\mid X, \\sigma^2)`.\n", " posterior_predictive : dict or None\n", " Frozen random variable for the posterior predictive distribution,\n", " :math:`P(y \\mid X)`. This value is only set following a call to\n", " :meth:`predict `.\n", " \"\"\" # noqa: E501\n", " # this is a placeholder until we know the dimensions of X\n", " V = 1.0 if V is None else V\n", "\n", " if isinstance(V, list):\n", " V = np.array(V)\n", "\n", " if isinstance(V, np.ndarray):\n", " if V.ndim == 1:\n", " V = np.diag(V)\n", " elif V.ndim == 2:\n", " fstr = \"V must be symmetric positive definite\"\n", " assert is_symmetric_positive_definite(V), fstr\n", "\n", " self.posterior = {}\n", " self.posterior_predictive = {}\n", "\n", " self.V = V\n", " self.mu = mu\n", " self.sigma = sigma\n", " self.fit_intercept = fit_intercept\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Compute the posterior over model parameters using the data in `X` and\n", " `y`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`.\n", " y : :py:class:`ndarray ` of shape `(N, K)`\n", " The targets for each of the `N` examples in `X`, where each target\n", " has dimension `K`.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " N, M = X.shape\n", "\n", " if is_number(self.V):\n", " self.V *= np.eye(M)\n", "\n", " if is_number(self.mu):\n", " self.mu *= np.ones(M)\n", "\n", " V = self.V\n", " mu = self.mu\n", " sigma = self.sigma\n", "\n", " V_inv = np.linalg.inv(V)\n", " L = np.linalg.inv(V_inv + X.T @ X)\n", " R = V_inv @ mu + X.T @ y\n", "\n", " mu = L @ R\n", " cov = L * sigma ** 2\n", "\n", " # posterior distribution over b conditioned on sigma\n", " self.posterior[\"b\"] = stats.multivariate_normal(mu, cov)\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Return the MAP 
prediction for the targets associated with `X`.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(Z, M)`\n", " A dataset consisting of `Z` new examples, each of dimension `M`.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(Z, K)`\n", " The MAP predictions for the targets associated with the items in\n", " `X`.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " I = np.eye(X.shape[0]) # noqa: E741\n", " mu = X @ self.posterior[\"b\"].mean\n", " cov = X @ self.posterior[\"b\"].cov @ X.T + I\n", "\n", " # MAP estimate for y corresponds to the mean/mode of the gaussian\n", " # posterior predictive distribution\n", " self.posterior_predictive = stats.multivariate_normal(mu, cov)\n", " return mu\n"]} {"path": "numpy_ml/linear_models/linear_regression.py", "content": ["\"\"\"Linear regression module.\"\"\"\n", "\n", "import numpy as np\n", "\n", "\n", "class LinearRegression:\n", " def __init__(self, fit_intercept=True):\n", " r\"\"\"\n", " A weighted linear least-squares regression model.\n", "\n", " Notes\n", " -----\n", " In weighted linear least-squares regression [1]_, a real-valued target\n", " vector, **y**, is modeled as a linear combination of covariates, **X**,\n", " and model coefficients, :math:`\\beta`:\n", "\n", " .. math::\n", "\n", " y_i = \\beta^\\top \\mathbf{x}_i + \\epsilon_i\n", "\n", " In this equation :math:`\\epsilon_i \\sim \\mathcal{N}(0, \\sigma^2_i)` is\n", " the error term associated with example :math:`i`, and\n", " :math:`\\sigma^2_i` is the variance of the corresponding example.\n", "\n", " Under this model, the maximum-likelihood estimate for the regression\n", " coefficients, :math:`\\beta`, is:\n", "\n", " .. math::\n", "\n", " \\hat{\\beta} = \\Sigma^{-1} \\mathbf{X}^\\top \\mathbf{Wy}\n", "\n", " where :math:`\\Sigma^{-1} = (\\mathbf{X}^\\top \\mathbf{WX})^{-1}` and\n", " **W** is a diagonal matrix of weights, with each entry inversely\n", " proportional to the variance of the corresponding measurement. When\n", " **W** is the identity matrix the examples are weighted equally and the\n", " model reduces to standard linear least squares [2]_.\n", "\n", " References\n", " ----------\n", " .. [1] https://en.wikipedia.org/wiki/Weighted_least_squares\n", " .. [2] https://en.wikipedia.org/wiki/General_linear_model\n", "\n", " Parameters\n", " ----------\n", " fit_intercept : bool\n", " Whether to fit an intercept term in addition to the model\n", " coefficients. Default is True.\n", "\n", " Attributes\n", " ----------\n", " beta : :py:class:`ndarray ` of shape `(M, K)` or None\n", " Fitted model coefficients.\n", " sigma_inv : :py:class:`ndarray ` of shape `(N, N)` or None\n", " Inverse of the data covariance matrix.\n", " \"\"\"\n", " self.beta = None\n", " self.sigma_inv = None\n", " self.fit_intercept = fit_intercept\n", "\n", " self._is_fit = False\n", "\n", " def update(self, X, y, weights=None):\n", " r\"\"\"\n", " Incrementally update the linear least-squares coefficients for a set of\n", " new examples.\n", "\n", " Notes\n", " -----\n", " The recursive least-squares algorithm [3]_ [4]_ is used to efficiently\n", " update the regression parameters as new examples become available. For\n", " a single new example :math:`(\\mathbf{x}_{t+1}, \\mathbf{y}_{t+1})`, the\n", " parameter updates are\n", "\n", " .. 
math::\n", "\n", " \\beta_{t+1} = \\left(\n", " \\mathbf{X}_{1:t}^\\top \\mathbf{X}_{1:t} +\n", " \\mathbf{x}_{t+1}\\mathbf{x}_{t+1}^\\top \\right)^{-1}\n", " \\mathbf{X}_{1:t}^\\top \\mathbf{Y}_{1:t} +\n", " \\mathbf{x}_{t+1}^\\top \\mathbf{y}_{t+1}\n", "\n", " where :math:`\\beta_{t+1}` are the updated regression coefficients,\n", " :math:`\\mathbf{X}_{1:t}` and :math:`\\mathbf{Y}_{1:t}` are the set of\n", " examples observed from timestep 1 to *t*.\n", "\n", " In the single-example case, the RLS algorithm uses the Sherman-Morrison\n", " formula [5]_ to avoid re-inverting the covariance matrix on each new\n", " update. In the multi-example case (i.e., where :math:`\\mathbf{X}_{t+1}`\n", " and :math:`\\mathbf{y}_{t+1}` are matrices of `N` examples each), we use\n", " the generalized Woodbury matrix identity [6]_ to update the inverse\n", " covariance. This comes at a performance cost, but is still more\n", " performant than doing multiple single-example updates if *N* is large.\n", "\n", " References\n", " ----------\n", " .. [3] Gauss, C. F. (1821) *Theoria combinationis observationum\n", " erroribus minimis obnoxiae*, Werke, 4. Gottinge\n", " .. [4] https://en.wikipedia.org/wiki/Recursive_least_squares_filter\n", " .. [5] https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula\n", " .. [6] https://en.wikipedia.org/wiki/Woodbury_matrix_identity\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`\n", " y : :py:class:`ndarray ` of shape `(N, K)`\n", " The targets for each of the `N` examples in `X`, where each target\n", " has dimension `K`\n", " weights : :py:class:`ndarray ` of shape `(N,)` or None\n", " Weights associated with the examples in `X`. Examples\n", " with larger weights exert greater influence on model fit. When\n", " `y` is a vector (i.e., `K = 1`), weights should be set to the\n", " reciporical of the variance for each measurement (i.e., :math:`w_i\n", " = 1/\\sigma^2_i`). When `K > 1`, it is assumed that all columns of\n", " `y` share the same weight :math:`w_i`. If None, examples are\n", " weighted equally, resulting in the standard linear least squares\n", " update. 
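# ---------------------------------------------------------------------------
# Hedged sketch: a quick numerical check of the Sherman-Morrison identity that
# the single-example update relies on,
#   (A + x x^T)^{-1} = A^{-1} - (A^{-1} x x^T A^{-1}) / (1 + x^T A^{-1} x).
# Everything here is synthetic and independent of the class above.
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(2)
M = 4
A = rng.randn(M, M)
A = A @ A.T + M * np.eye(M)            # make A symmetric positive definite
x = rng.randn(M)

A_inv = np.linalg.inv(A)
rank1_update = np.outer(A_inv @ x, x @ A_inv) / (1.0 + x @ A_inv @ x)
lhs = np.linalg.inv(A + np.outer(x, x))
rhs = A_inv - rank1_update

assert np.allclose(lhs, rhs)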
Default is None.\n", "\n", " Returns\n", " -------\n", " self : :class:`LinearRegression ` instance\n", " \"\"\" # noqa: E501\n", " if not self._is_fit:\n", " raise RuntimeError(\"You must call the `fit` method before calling `update`\")\n", "\n", " X, y = np.atleast_2d(X), np.atleast_2d(y)\n", "\n", " X1, Y1 = X.shape[0], y.shape[0]\n", " weights = np.ones(X1) if weights is None else np.atleast_1d(weights)\n", " weights = np.squeeze(weights) if weights.size > 1 else weights\n", "\n", " err_str = f\"weights must have shape ({X1},) but got {weights.shape}\"\n", " assert weights.shape == (X1,), err_str\n", "\n", " # scale X and y by the weight associated with each example\n", " W = np.diag(np.sqrt(weights))\n", " X, y = W @ X, W @ y\n", "\n", " self._update1D(X, y, W) if X1 == Y1 == 1 else self._update2D(X, y, W)\n", " return self\n", "\n", " def _update1D(self, x, y, w):\n", " \"\"\"Sherman-Morrison update for a single example\"\"\"\n", " beta, S_inv = self.beta, self.sigma_inv\n", "\n", " # convert x to a design vector if we're fitting an intercept\n", " if self.fit_intercept:\n", " x = np.c_[np.diag(w), x]\n", "\n", " # update the inverse of the covariance matrix via Sherman-Morrison\n", " S_inv -= (S_inv @ x.T @ x @ S_inv) / (1 + x @ S_inv @ x.T)\n", "\n", " # update the model coefficients\n", " beta += S_inv @ x.T @ (y - x @ beta)\n", "\n", " def _update2D(self, X, y, W):\n", " \"\"\"Woodbury update for multiple examples\"\"\"\n", " beta, S_inv = self.beta, self.sigma_inv\n", "\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.diag(W), X]\n", "\n", " I = np.eye(X.shape[0]) # noqa: E741\n", "\n", " # update the inverse of the covariance matrix via Woodbury identity\n", " S_inv -= S_inv @ X.T @ np.linalg.pinv(I + X @ S_inv @ X.T) @ X @ S_inv\n", "\n", " # update the model coefficients\n", " beta += S_inv @ X.T @ (y - X @ beta)\n", "\n", " def fit(self, X, y, weights=None):\n", " r\"\"\"\n", " Fit regression coefficients via maximum likelihood.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`.\n", " y : :py:class:`ndarray ` of shape `(N, K)`\n", " The targets for each of the `N` examples in `X`, where each target\n", " has dimension `K`.\n", " weights : :py:class:`ndarray ` of shape `(N,)` or None\n", " Weights associated with the examples in `X`. Examples\n", " with larger weights exert greater influence on model fit. When\n", " `y` is a vector (i.e., `K = 1`), weights should be set to the\n", " reciporical of the variance for each measurement (i.e., :math:`w_i\n", " = 1/\\sigma^2_i`). When `K > 1`, it is assumed that all columns of\n", " `y` share the same weight :math:`w_i`. If None, examples are\n", " weighted equally, resulting in the standard linear least squares\n", " update. 
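# ---------------------------------------------------------------------------
# Hedged sketch: a numerical check of the Woodbury-style update used in
# `_update2D` above,
#   (A + X^T X)^{-1} = A^{-1} - A^{-1} X^T (I + X A^{-1} X^T)^{-1} X A^{-1}.
# Shapes and data are made up for illustration.
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(3)
N, M = 5, 3                             # N new examples, M features
A = rng.randn(M, M)
A = A @ A.T + M * np.eye(M)             # symmetric positive definite
X = rng.randn(N, M)

A_inv = np.linalg.inv(A)
middle = np.linalg.inv(np.eye(N) + X @ A_inv @ X.T)
woodbury = A_inv - A_inv @ X.T @ middle @ X @ A_inv
direct = np.linalg.inv(A + X.T @ X)

assert np.allclose(woodbury, direct)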
Default is None.\n", "\n", " Returns\n", " -------\n", " self : :class:`LinearRegression ` instance\n", " \"\"\" # noqa: E501\n", " N = X.shape[0]\n", "\n", " weights = np.ones(N) if weights is None else np.atleast_1d(weights)\n", " weights = np.squeeze(weights) if weights.size > 1 else weights\n", " err_str = f\"weights must have shape ({N},) but got {weights.shape}\"\n", " assert weights.shape == (N,), err_str\n", "\n", " # scale X and y by the weight associated with each example\n", " W = np.diag(np.sqrt(weights))\n", " X, y = W @ X, W @ y\n", "\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.sqrt(weights), X]\n", "\n", " self.sigma_inv = np.linalg.pinv(X.T @ X)\n", " self.beta = np.atleast_2d(self.sigma_inv @ X.T @ y)\n", "\n", " self._is_fit = True\n", " return self\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Use the trained model to generate predictions on a new collection of\n", " data points.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(Z, M)`\n", " A dataset consisting of `Z` new examples, each of dimension `M`.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(Z, K)`\n", " The model predictions for the items in `X`.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", " return X @ self.beta\n"]} {"path": "numpy_ml/linear_models/ridge.py", "content": ["\"\"\"Ridge regression module\"\"\"\n", "\n", "import numpy as np\n", "\n", "\n", "class RidgeRegression:\n", " def __init__(self, alpha=1, fit_intercept=True):\n", " r\"\"\"\n", " A ridge regression model with maximum likelihood fit via the normal\n", " equations.\n", "\n", " Notes\n", " -----\n", " Ridge regression is a biased estimator for linear models which adds an\n", " additional penalty proportional to the L2-norm of the model\n", " coefficients to the standard mean-squared-error loss:\n", "\n", " .. math::\n", "\n", " \\mathcal{L}_{Ridge} = (\\mathbf{y} - \\mathbf{X} \\beta)^\\top\n", " (\\mathbf{y} - \\mathbf{X} \\beta) + \\alpha ||\\beta||_2^2\n", "\n", " where :math:`\\alpha` is a weight controlling the severity of the\n", " penalty.\n", "\n", " Given data matrix **X** and target vector **y**, the maximum-likelihood\n", " estimate for ridge coefficients, :math:`\\beta`, is:\n", "\n", " .. math::\n", "\n", " \\hat{\\beta} =\n", " \\left(\\mathbf{X}^\\top \\mathbf{X} + \\alpha \\mathbf{I} \\right)^{-1}\n", " \\mathbf{X}^\\top \\mathbf{y}\n", "\n", " It turns out that this estimate for :math:`\\beta` also corresponds to\n", " the MAP estimate if we assume a multivariate Gaussian prior on the\n", " model coefficients, assuming that the data matrix **X** has been\n", " standardized and the target values **y** centered at 0:\n", "\n", " .. math::\n", "\n", " \\beta \\sim \\mathcal{N}\\left(\\mathbf{0}, \\frac{1}{2M} \\mathbf{I}\\right)\n", "\n", " Parameters\n", " ----------\n", " alpha : float\n", " L2 regularization coefficient. Larger values correspond to larger\n", " penalty on the L2 norm of the model coefficients. Default is 1.\n", " fit_intercept : bool\n", " Whether to fit an additional intercept term. 
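# ---------------------------------------------------------------------------
# Hedged usage sketch (assuming the `LinearRegression` API defined above and
# the `numpy_ml.linear_models` import path used elsewhere in this repo): fit
# on an initial batch, then fold in new examples with `update`. If the
# recursive update is exact, the coefficients should match a full refit up to
# numerical error. Data below is synthetic.
# ---------------------------------------------------------------------------
import numpy as np
from numpy_ml.linear_models import LinearRegression

rng = np.random.RandomState(4)
X = rng.randn(100, 3)
y = (X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(100)).reshape(-1, 1)

incremental = LinearRegression(fit_intercept=True)
incremental.fit(X[:80], y[:80])          # initial batch
incremental.update(X[80:], y[80:])       # fold in the remaining examples

full = LinearRegression(fit_intercept=True).fit(X, y)
print(np.allclose(incremental.beta, full.beta, atol=1e-6))  # expected: True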
Default is True.\n", "\n", " Attributes\n", " ----------\n", " beta : :py:class:`ndarray ` of shape `(M, K)` or None\n", " Fitted model coefficients.\n", " \"\"\"\n", " self.beta = None\n", " self.alpha = alpha\n", " self.fit_intercept = fit_intercept\n", "\n", " def fit(self, X, y):\n", " \"\"\"\n", " Fit the regression coefficients via maximum likelihood.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(N, M)`\n", " A dataset consisting of `N` examples, each of dimension `M`.\n", " y : :py:class:`ndarray ` of shape `(N, K)`\n", " The targets for each of the `N` examples in `X`, where each target\n", " has dimension `K`.\n", "\n", " Returns\n", " -------\n", " self : :class:`RidgeRegression ` instance\n", " \"\"\" # noqa: E501\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", "\n", " A = self.alpha * np.eye(X.shape[1])\n", " pseudo_inverse = np.linalg.inv(X.T @ X + A) @ X.T\n", " self.beta = pseudo_inverse @ y\n", " return self\n", "\n", " def predict(self, X):\n", " \"\"\"\n", " Use the trained model to generate predictions on a new collection of\n", " data points.\n", "\n", " Parameters\n", " ----------\n", " X : :py:class:`ndarray ` of shape `(Z, M)`\n", " A dataset consisting of `Z` new examples, each of dimension `M`.\n", "\n", " Returns\n", " -------\n", " y_pred : :py:class:`ndarray ` of shape `(Z, K)`\n", " The model predictions for the items in `X`.\n", " \"\"\"\n", " # convert X to a design matrix if we're fitting an intercept\n", " if self.fit_intercept:\n", " X = np.c_[np.ones(X.shape[0]), X]\n", " return np.dot(X, self.beta)\n"]} {"path": "numpy_ml/plots/lda_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"paper\", font_scale=1)\n", "\n", "np.random.seed(12345)\n", "\n", "from numpy_ml.lda import LDA\n", "\n", "\n", "def generate_corpus():\n", " # Generate some fake data\n", " D = 300\n", " T = 10\n", " V = 30\n", " N = np.random.randint(150, 200, size=D)\n", "\n", " # Create a document-topic distribution for 3 different types of documents\n", " alpha1 = np.array((20, 15, 10, 1, 1, 1, 1, 1, 1, 1))\n", " alpha2 = np.array((1, 1, 1, 10, 15, 20, 1, 1, 1, 1))\n", " alpha3 = np.array((1, 1, 1, 1, 1, 1, 10, 12, 15, 18))\n", "\n", " # Arbitrarily choose each topic to have 3 very common, diagnostic words\n", " # These words are barely shared with any other topic\n", " beta_probs = (\n", " np.ones((V, T)) + np.array([np.arange(V) % T == t for t in range(T)]).T * 19\n", " )\n", " beta_gen = np.array(list(map(lambda x: np.random.dirichlet(x), beta_probs.T))).T\n", "\n", " corpus = []\n", " theta = np.empty((D, T))\n", "\n", " # Generate each document from the LDA model\n", " for d in range(D):\n", "\n", " # Draw topic distribution for the document\n", " if d < (D / 3):\n", " theta[d, :] = np.random.dirichlet(alpha1, 1)[0]\n", " elif d < 2 * (D / 3):\n", " theta[d, :] = np.random.dirichlet(alpha2, 1)[0]\n", " else:\n", " theta[d, :] = np.random.dirichlet(alpha3, 1)[0]\n", "\n", " doc = np.array([])\n", " for n in range(N[d]):\n", " # Draw a topic according to the document's topic distribution\n", " z_n = np.random.choice(np.arange(T), p=theta[d, :])\n", "\n", " # Draw a word according to the 
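# ---------------------------------------------------------------------------
# Hedged sketch (synthetic data, illustration only): the ridge normal
# equations from `fit` above written out in plain NumPy. Note that, as coded
# above, the penalty `alpha * I` spans the full design matrix, so the
# intercept column is penalized as well.
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(5)
N, M, alpha = 60, 3, 1.0
X = rng.randn(N, M)
y = X @ np.array([0.5, 2.0, -1.0]) + rng.randn(N)

X_design = np.c_[np.ones(N), X]                       # prepend intercept column
A = alpha * np.eye(X_design.shape[1])
beta = np.linalg.solve(X_design.T @ X_design + A, X_design.T @ y)

y_pred = X_design @ beta                              # in-sample predictions
print(beta.shape, y_pred.shape)                       # (4,), (60,)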
topic-word distribution\n", " w_n = np.random.choice(np.arange(V), p=beta_gen[:, z_n])\n", " doc = np.append(doc, w_n)\n", "\n", " corpus.append(doc)\n", " return corpus, T\n", "\n", "\n", "def plot_unsmoothed():\n", " corpus, T = generate_corpus()\n", " L = LDA(T)\n", " L.train(corpus, verbose=False)\n", "\n", " fig, axes = plt.subplots(1, 2)\n", " ax1 = sns.heatmap(L.beta, xticklabels=[], yticklabels=[], ax=axes[0])\n", " ax1.set_xlabel(\"Topics\")\n", " ax1.set_ylabel(\"Words\")\n", " ax1.set_title(\"Recovered topic-word distribution\")\n", "\n", " ax2 = sns.heatmap(L.gamma, xticklabels=[], yticklabels=[], ax=axes[1])\n", " ax2.set_xlabel(\"Topics\")\n", " ax2.set_ylabel(\"Documents\")\n", " ax2.set_title(\"Recovered document-topic distribution\")\n", "\n", " plt.savefig(\"img/plot_unsmoothed.png\", dpi=300)\n", " plt.close(\"all\")\n"]} {"path": "numpy_ml/plots/gmm_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "from sklearn.datasets.samples_generator import make_blobs\n", "\n", "from scipy.stats import multivariate_normal\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"paper\", font_scale=1)\n", "\n", "from numpy_ml.gmm import GMM\n", "\n", "from matplotlib.colors import ListedColormap\n", "\n", "\n", "def plot_countour(X, x, y, z, ax, xlim, ylim):\n", " def fixed_aspect_ratio(ratio, ax):\n", " \"\"\"\n", " Set a fixed aspect ratio on matplotlib plots\n", " regardless of axis units\n", " \"\"\"\n", " xvals, yvals = ax.get_xlim(), ax.get_ylim()\n", "\n", " xrange = xvals[1] - xvals[0]\n", " yrange = yvals[1] - yvals[0]\n", " ax.set_aspect(ratio * (xrange / yrange), adjustable=\"box\")\n", "\n", " # contour the gridded data, plotting dots at the randomly spaced data points.\n", " ax.contour(x, y, z, 6, linewidths=0.5, colors=\"k\")\n", "\n", " ax.set_xlim(*xlim)\n", " ax.set_ylim(*ylim)\n", " fixed_aspect_ratio(1, ax)\n", " return ax\n", "\n", "\n", "def plot_clusters(model, X, ax):\n", " C = model.C\n", "\n", " xmin = min(X[:, 0]) - 0.1 * (max(X[:, 0]) - min(X[:, 0]))\n", " xmax = max(X[:, 0]) + 0.1 * (max(X[:, 0]) - min(X[:, 0]))\n", " ymin = min(X[:, 1]) - 0.1 * (max(X[:, 1]) - min(X[:, 1]))\n", " ymax = max(X[:, 1]) + 0.1 * (max(X[:, 1]) - min(X[:, 1]))\n", "\n", " for c in range(C):\n", " rv = multivariate_normal(model.mu[c], model.sigma[c], allow_singular=True)\n", "\n", " x = np.linspace(xmin, xmax, 500)\n", " y = np.linspace(ymin, ymax, 500)\n", "\n", " X1, Y1 = np.meshgrid(x, y)\n", " xy = np.column_stack([X1.flat, Y1.flat])\n", "\n", " # density values at the grid points\n", " Z = rv.pdf(xy).reshape(X1.shape)\n", " ax = plot_countour(X, X1, Y1, Z, ax=ax, xlim=(xmin, xmax), ylim=(ymin, ymax))\n", " ax.plot(model.mu[c, 0], model.mu[c, 1], \"ro\")\n", "\n", " # plot data points\n", " cm = ListedColormap(sns.color_palette().as_hex())\n", " labels = model.Q.argmax(1)\n", " uniq = set(labels)\n", " for i in uniq:\n", " ax.scatter(X[labels == i, 0], X[labels == i, 1], c=cm.colors[i - 1], s=30)\n", " return ax\n", "\n", "\n", "def plot():\n", " fig, axes = plt.subplots(4, 4)\n", " fig.set_size_inches(10, 10)\n", " for i, ax in enumerate(axes.flatten()):\n", " n_ex = 150\n", " n_in = 2\n", " n_classes = np.random.randint(2, 4)\n", " X, y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=i\n", " )\n", " X -= 
X.mean(axis=0)\n", "\n", " # take best fit over 10 runs\n", " best_elbo = -np.inf\n", " for k in range(10):\n", " _G = GMM(C=n_classes, seed=k * 3)\n", " ret = _G.fit(X, max_iter=100, verbose=False)\n", " while ret != 0:\n", " print(\"Components collapsed; Refitting\")\n", " ret = _G.fit(X, max_iter=100, verbose=False)\n", "\n", " if _G.best_elbo > best_elbo:\n", " best_elbo = _G.best_elbo\n", " G = _G\n", "\n", " ax = plot_clusters(G, X, ax)\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", " ax.set_title(\"# Classes: {}; Final VLB: {:.2f}\".format(n_classes, G.best_elbo))\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"img/plot.png\", dpi=300)\n", " plt.close(\"all\")\n"]} {"path": "numpy_ml/plots/nn_activations_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"notebook\", font_scale=0.7)\n", "\n", "from numpy_ml.neural_nets.activations import (\n", " Affine,\n", " ReLU,\n", " LeakyReLU,\n", " Tanh,\n", " Sigmoid,\n", " ELU,\n", " Exponential,\n", " SELU,\n", " HardSigmoid,\n", " SoftPlus,\n", ")\n", "\n", "\n", "def plot_activations():\n", " fig, axes = plt.subplots(2, 5, sharex=True, sharey=True)\n", " fns = [\n", " Affine(),\n", " Tanh(),\n", " Sigmoid(),\n", " ReLU(),\n", " LeakyReLU(),\n", " ELU(),\n", " Exponential(),\n", " SELU(),\n", " HardSigmoid(),\n", " SoftPlus(),\n", " ]\n", "\n", " for ax, fn in zip(axes.flatten(), fns):\n", " X = np.linspace(-3, 3, 100).astype(float).reshape(100, 1)\n", " ax.plot(X, fn(X), label=r\"$y$\", alpha=1.0)\n", " ax.plot(X, fn.grad(X), label=r\"$\\frac{dy}{dx}$\", alpha=1.0)\n", " ax.plot(X, fn.grad2(X), label=r\"$\\frac{d^2 y}{dx^2}$\", alpha=1.0)\n", " ax.hlines(0, -3, 3, lw=1, linestyles=\"dashed\", color=\"k\")\n", " ax.vlines(0, -1.2, 1.2, lw=1, linestyles=\"dashed\", color=\"k\")\n", " ax.set_ylim(-1.1, 1.1)\n", " ax.set_xlim(-3, 3)\n", " ax.set_xticks([])\n", " ax.set_yticks([-1, 0, 1])\n", " ax.xaxis.set_visible(False)\n", " # ax.yaxis.set_visible(False)\n", " ax.set_title(\"{}\".format(fn))\n", " ax.legend(frameon=False)\n", " sns.despine(left=True, bottom=True)\n", "\n", " fig.set_size_inches(10, 5)\n", " plt.tight_layout()\n", " plt.savefig(\"img/plot.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "if __name__ == \"__main__\":\n", " plot_activations()\n"]} {"path": "numpy_ml/plots/nn_schedulers_plots.py", "content": ["# flake8: noqa\n", "\n", "import time\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"notebook\", font_scale=0.7)\n", "\n", "from numpy_ml.neural_nets.schedulers import (\n", " ConstantScheduler,\n", " ExponentialScheduler,\n", " NoamScheduler,\n", " KingScheduler,\n", ")\n", "\n", "\n", "def king_loss_fn(x):\n", " if x <= 250:\n", " return -0.25 * x + 82.50372665317208\n", " elif 250 < x <= 600:\n", " return 20.00372665317208\n", " elif 600 < x <= 700:\n", " return -0.2 * x + 140.00372665317207\n", " else:\n", " return 0.003726653172066108\n", "\n", "\n", "def plot_schedulers():\n", " fig, axes = plt.subplots(2, 2)\n", " schedulers = [\n", " (\n", " [ConstantScheduler(lr=0.01), 
\"lr=1e-2\"],\n", " [ConstantScheduler(lr=0.008), \"lr=8e-3\"],\n", " [ConstantScheduler(lr=0.006), \"lr=6e-3\"],\n", " [ConstantScheduler(lr=0.004), \"lr=4e-3\"],\n", " [ConstantScheduler(lr=0.002), \"lr=2e-3\"],\n", " ),\n", " (\n", " [\n", " ExponentialScheduler(\n", " lr=0.01, stage_length=250, staircase=False, decay=0.4\n", " ),\n", " \"lr=0.01, stage=250, stair=False, decay=0.4\",\n", " ],\n", " [\n", " ExponentialScheduler(\n", " lr=0.01, stage_length=250, staircase=True, decay=0.4\n", " ),\n", " \"lr=0.01, stage=250, stair=True, decay=0.4\",\n", " ],\n", " [\n", " ExponentialScheduler(\n", " lr=0.01, stage_length=125, staircase=True, decay=0.1\n", " ),\n", " \"lr=0.01, stage=125, stair=True, decay=0.1\",\n", " ],\n", " [\n", " ExponentialScheduler(\n", " lr=0.001, stage_length=250, staircase=False, decay=0.1\n", " ),\n", " \"lr=0.001, stage=250, stair=False, decay=0.1\",\n", " ],\n", " [\n", " ExponentialScheduler(\n", " lr=0.001, stage_length=125, staircase=False, decay=0.8\n", " ),\n", " \"lr=0.001, stage=125, stair=False, decay=0.8\",\n", " ],\n", " [\n", " ExponentialScheduler(\n", " lr=0.01, stage_length=250, staircase=False, decay=0.01\n", " ),\n", " \"lr=0.01, stage=250, stair=False, decay=0.01\",\n", " ],\n", " ),\n", " (\n", " [\n", " NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=250),\n", " \"dim=512, scale=1, warmup=250\",\n", " ],\n", " [\n", " NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=250),\n", " \"dim=256, scale=1, warmup=250\",\n", " ],\n", " [\n", " NoamScheduler(model_dim=512, scale_factor=1, warmup_steps=500),\n", " \"dim=512, scale=1, warmup=500\",\n", " ],\n", " [\n", " NoamScheduler(model_dim=256, scale_factor=1, warmup_steps=500),\n", " \"dim=512, scale=1, warmup=500\",\n", " ],\n", " [\n", " NoamScheduler(model_dim=512, scale_factor=2, warmup_steps=500),\n", " \"dim=512, scale=2, warmup=500\",\n", " ],\n", " [\n", " NoamScheduler(model_dim=512, scale_factor=0.5, warmup_steps=500),\n", " \"dim=512, scale=0.5, warmup=500\",\n", " ],\n", " ),\n", " (\n", " # [\n", " # KingScheduler(initial_lr=0.01, patience=100, decay=0.1),\n", " # \"lr=0.01, patience=100, decay=0.8\",\n", " # ],\n", " # [\n", " # KingScheduler(initial_lr=0.01, patience=300, decay=0.999),\n", " # \"lr=0.01, patience=300, decay=0.999\",\n", " # ],\n", " [\n", " KingScheduler(initial_lr=0.009, patience=150, decay=0.995),\n", " \"lr=0.009, patience=150, decay=0.9999\",\n", " ],\n", " [\n", " KingScheduler(initial_lr=0.008, patience=100, decay=0.995),\n", " \"lr=0.008, patience=100, decay=0.995\",\n", " ],\n", " [\n", " KingScheduler(initial_lr=0.007, patience=50, decay=0.995),\n", " \"lr=0.007, patience=50, decay=0.995\",\n", " ],\n", " [\n", " KingScheduler(initial_lr=0.005, patience=25, decay=0.9),\n", " \"lr=0.005, patience=25, decay=0.99\",\n", " ],\n", " ),\n", " ]\n", "\n", " for ax, schs, title in zip(\n", " axes.flatten(), schedulers, [\"Constant\", \"Exponential\", \"Noam\", \"King\"]\n", " ):\n", " t0 = time.time()\n", " print(\"Running {} scheduler\".format(title))\n", " X = np.arange(1, 1000)\n", " loss = np.array([king_loss_fn(x) for x in X])\n", "\n", " # scale loss to fit on same axis as lr\n", " scale = 0.01 / loss[0]\n", " loss *= scale\n", "\n", " if title == \"King\":\n", " ax.plot(X, loss, ls=\":\", label=\"Loss\")\n", "\n", " for sc, lg in schs:\n", " Y = np.array([sc(x, ll) for x, ll in zip(X, loss)])\n", " ax.plot(X, Y, label=lg, alpha=0.6)\n", "\n", " ax.legend(fontsize=5)\n", " ax.set_xlabel(\"Steps\")\n", " ax.set_ylabel(\"Learning rate\")\n", " 
ax.set_title(\"{} scheduler\".format(title))\n", " print(\n", " \"Finished plotting {} runs of {} in {:.2f}s\".format(\n", " len(schs), title, time.time() - t0\n", " )\n", " )\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"plot.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "if __name__ == \"__main__\":\n", " plot_schedulers()\n"]} {"path": "numpy_ml/plots/hmm_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "from matplotlib import pyplot as plt\n", "\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"notebook\", font_scale=0.8)\n", "\n", "from hmmlearn.hmm import MultinomialHMM as MHMM\n", "from numpy_ml.hmm import MultinomialHMM\n", "\n", "\n", "def generate_training_data(params, n_steps=500, n_examples=15):\n", " hmm = MultinomialHMM(A=params[\"A\"], B=params[\"B\"], pi=params[\"pi\"])\n", "\n", " # generate a new sequence\n", " observations = []\n", " for i in range(n_examples):\n", " latent, obs = hmm.generate(\n", " n_steps, params[\"latent_states\"], params[\"obs_types\"]\n", " )\n", " assert len(latent) == len(obs) == n_steps\n", " observations.append(obs)\n", "\n", " observations = np.array(observations)\n", " return observations\n", "\n", "\n", "def default_hmm():\n", " obs_types = [0, 1, 2, 3]\n", " latent_states = [\"H\", \"C\"]\n", "\n", " # derived variables\n", " V = len(obs_types)\n", " N = len(latent_states)\n", "\n", " # define a very simple HMM with T=3 observations\n", " O = np.array([1, 3, 1]).reshape(1, -1)\n", " A = np.array([[0.9, 0.1], [0.5, 0.5]])\n", " B = np.array([[0.2, 0.7, 0.09, 0.01], [0.1, 0.0, 0.8, 0.1]])\n", " pi = np.array([0.75, 0.25])\n", "\n", " return {\n", " \"latent_states\": latent_states,\n", " \"obs_types\": obs_types,\n", " \"V\": V,\n", " \"N\": N,\n", " \"O\": O,\n", " \"A\": A,\n", " \"B\": B,\n", " \"pi\": pi,\n", " }\n", "\n", "\n", "def plot_matrices(params, best, best_theirs):\n", " cmap = \"copper\"\n", " ll_mine, best = best\n", " ll_theirs, best_theirs = best_theirs\n", "\n", " fig, axes = plt.subplots(3, 3)\n", " axes = {\n", " \"A\": [axes[0, 0], axes[0, 1], axes[0, 2]],\n", " \"B\": [axes[1, 0], axes[1, 1], axes[1, 2]],\n", " \"pi\": [axes[2, 0], axes[2, 1], axes[2, 2]],\n", " }\n", "\n", " for k, tt in [(\"A\", \"Transition\"), (\"B\", \"Emission\"), (\"pi\", \"Prior\")]:\n", " true_ax, est_ax, est_theirs_ax = axes[k]\n", " true, est, est_theirs = params[k], best[k], best_theirs[k]\n", "\n", " if k == \"pi\":\n", " true = true.reshape(-1, 1)\n", " est = est.reshape(-1, 1)\n", " est_theirs = est_theirs.reshape(-1, 1)\n", "\n", " true_ax = sns.heatmap(\n", " true,\n", " vmin=0.0,\n", " vmax=1.0,\n", " fmt=\".2f\",\n", " cmap=cmap,\n", " cbar=False,\n", " annot=True,\n", " ax=true_ax,\n", " xticklabels=[],\n", " yticklabels=[],\n", " linewidths=0.25,\n", " )\n", "\n", " est_ax = sns.heatmap(\n", " est,\n", " vmin=0.0,\n", " vmax=1.0,\n", " fmt=\".2f\",\n", " ax=est_ax,\n", " cmap=cmap,\n", " annot=True,\n", " cbar=False,\n", " xticklabels=[],\n", " yticklabels=[],\n", " linewidths=0.25,\n", " )\n", "\n", " est_theirs_ax = sns.heatmap(\n", " est_theirs,\n", " vmin=0.0,\n", " vmax=1.0,\n", " fmt=\".2f\",\n", " cmap=cmap,\n", " annot=True,\n", " cbar=False,\n", " xticklabels=[],\n", " yticklabels=[],\n", " linewidths=0.25,\n", " ax=est_theirs_ax,\n", " )\n", "\n", " true_ax.set_title(\"{} (True)\".format(tt))\n", " est_ax.set_title(\"{} 
(Mine)\".format(tt))\n", " est_theirs_ax.set_title(\"{} (hmmlearn)\".format(tt))\n", " fig.suptitle(\"LL (mine): {:.2f}, LL (hmmlearn): {:.2f}\".format(ll_mine, ll_theirs))\n", " plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n", " plt.savefig(\"img/plot.png\", dpi=300)\n", " plt.close()\n", "\n", "\n", "def test_HMM():\n", " np.random.seed(12345)\n", " np.set_printoptions(precision=5, suppress=True)\n", "\n", " P = default_hmm()\n", " ls, obs = P[\"latent_states\"], P[\"obs_types\"]\n", "\n", " # generate a new sequence\n", " O = generate_training_data(P, n_steps=30, n_examples=25)\n", "\n", " tol = 1e-5\n", " n_runs = 5\n", " best, best_theirs = (-np.inf, []), (-np.inf, [])\n", " for _ in range(n_runs):\n", " hmm = MultinomialHMM()\n", " A_, B_, pi_ = hmm.fit(O, ls, obs, tol=tol, verbose=True)\n", "\n", " theirs = MHMM(\n", " tol=tol,\n", " verbose=True,\n", " n_iter=int(1e9),\n", " transmat_prior=1,\n", " startprob_prior=1,\n", " algorithm=\"viterbi\",\n", " n_components=len(ls),\n", " )\n", "\n", " O_flat = O.reshape(1, -1).flatten().reshape(-1, 1)\n", " theirs = theirs.fit(O_flat, lengths=[O.shape[1]] * O.shape[0])\n", "\n", " hmm2 = MultinomialHMM(A=A_, B=B_, pi=pi_)\n", " like = np.sum([hmm2.log_likelihood(obs) for obs in O])\n", " like_theirs = theirs.score(O_flat, lengths=[O.shape[1]] * O.shape[0])\n", "\n", " if like > best[0]:\n", " best = (like, {\"A\": A_, \"B\": B_, \"pi\": pi_})\n", "\n", " if like_theirs > best_theirs[0]:\n", " best_theirs = (\n", " like_theirs,\n", " {\n", " \"A\": theirs.transmat_,\n", " \"B\": theirs.emissionprob_,\n", " \"pi\": theirs.startprob_,\n", " },\n", " )\n", " print(\"Final log likelihood of sequence: {:.5f}\".format(best[0]))\n", " print(\"Final log likelihood of sequence (theirs): {:.5f}\".format(best_theirs[0]))\n", " plot_matrices(P, best, best_theirs)\n"]} {"path": "numpy_ml/plots/trees_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn.datasets import make_blobs, make_regression\n", "from sklearn.model_selection import train_test_split\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "import seaborn as sns\n", "\n", "sns.set_style(\"white\")\n", "sns.set_context(\"paper\", font_scale=0.9)\n", "\n", "from numpy_ml.trees import GradientBoostedDecisionTree, DecisionTree, RandomForest\n", "\n", "\n", "def plot():\n", " fig, axes = plt.subplots(4, 4)\n", " fig.set_size_inches(10, 10)\n", " for ax in axes.flatten():\n", " n_ex = 100\n", " n_trees = 50\n", " n_feats = np.random.randint(2, 100)\n", " max_depth_d = np.random.randint(1, 100)\n", " max_depth_r = np.random.randint(1, 10)\n", "\n", " classifier = np.random.choice([True, False])\n", " if classifier:\n", " # create classification problem\n", " n_classes = np.random.randint(2, 10)\n", " X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)\n", " n_feats = min(n_feats, X.shape[1])\n", "\n", " # initialize model\n", " def loss(yp, y):\n", " return accuracy_score(yp, y)\n", "\n", " # initialize model\n", " criterion = np.random.choice([\"entropy\", \"gini\"])\n", " mine = RandomForest(\n", " classifier=classifier,\n", " n_feats=n_feats,\n", " n_trees=n_trees,\n", " criterion=criterion,\n", " max_depth=max_depth_r,\n", " )\n", " mine_d = DecisionTree(\n", " criterion=criterion, 
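# ---------------------------------------------------------------------------
# Hedged sketch (standalone toy, not the library implementation): a minimal
# scaled forward algorithm for a discrete HMM, illustrating the kind of
# log-likelihood `MultinomialHMM.log_likelihood` is presumably computing for a
# single observation sequence. Parameters mirror `default_hmm` above.
# ---------------------------------------------------------------------------
import numpy as np

def forward_log_likelihood(obs, A, B, pi):
    """log P(obs) for a discrete HMM via the scaled forward recursion."""
    alpha = pi * B[:, obs[0]]                  # initial forward probabilities
    log_like = 0.0
    for t in range(1, len(obs)):
        scale = alpha.sum()                    # P(o_t | o_{1:t-1}) factor
        log_like += np.log(scale)
        alpha = (alpha / scale) @ A * B[:, obs[t]]
    return log_like + np.log(alpha.sum())

A = np.array([[0.9, 0.1], [0.5, 0.5]])
B = np.array([[0.2, 0.7, 0.09, 0.01], [0.1, 0.0, 0.8, 0.1]])
pi = np.array([0.75, 0.25])
print(forward_log_likelihood([1, 3, 1], A, B, pi))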
max_depth=max_depth_d, classifier=classifier\n", " )\n", " mine_g = GradientBoostedDecisionTree(\n", " n_trees=n_trees,\n", " max_depth=max_depth_d,\n", " classifier=classifier,\n", " learning_rate=1,\n", " loss=\"crossentropy\",\n", " step_size=\"constant\",\n", " split_criterion=criterion,\n", " )\n", "\n", " else:\n", " # create regeression problem\n", " X, Y = make_regression(n_samples=n_ex, n_features=1)\n", " X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)\n", " n_feats = min(n_feats, X.shape[1])\n", "\n", " # initialize model\n", " criterion = \"mse\"\n", " loss = mean_squared_error\n", " mine = RandomForest(\n", " criterion=criterion,\n", " n_feats=n_feats,\n", " n_trees=n_trees,\n", " max_depth=max_depth_r,\n", " classifier=classifier,\n", " )\n", " mine_d = DecisionTree(\n", " criterion=criterion, max_depth=max_depth_d, classifier=classifier\n", " )\n", " mine_g = GradientBoostedDecisionTree(\n", " n_trees=n_trees,\n", " max_depth=max_depth_d,\n", " classifier=classifier,\n", " learning_rate=1,\n", " loss=\"mse\",\n", " step_size=\"adaptive\",\n", " split_criterion=criterion,\n", " )\n", "\n", " # fit 'em\n", " mine.fit(X, Y)\n", " mine_d.fit(X, Y)\n", " mine_g.fit(X, Y)\n", "\n", " # get preds on test set\n", " y_pred_mine_test = mine.predict(X_test)\n", " y_pred_mine_test_d = mine_d.predict(X_test)\n", " y_pred_mine_test_g = mine_g.predict(X_test)\n", "\n", " loss_mine_test = loss(y_pred_mine_test, Y_test)\n", " loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)\n", " loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)\n", "\n", " if classifier:\n", " entries = [\n", " (\"RF\", loss_mine_test, y_pred_mine_test),\n", " (\"DT\", loss_mine_test_d, y_pred_mine_test_d),\n", " (\"GB\", loss_mine_test_g, y_pred_mine_test_g),\n", " ]\n", " (lbl, test_loss, preds) = entries[np.random.randint(3)]\n", " ax.set_title(\"{} Accuracy: {:.2f}%\".format(lbl, test_loss * 100))\n", " for i in np.unique(Y_test):\n", " ax.scatter(\n", " X_test[preds == i, 0].flatten(),\n", " X_test[preds == i, 1].flatten(),\n", " # s=0.5,\n", " )\n", " else:\n", " X_ax = np.linspace(\n", " np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100\n", " ).reshape(-1, 1)\n", " y_pred_mine_test = mine.predict(X_ax)\n", " y_pred_mine_test_d = mine_d.predict(X_ax)\n", " y_pred_mine_test_g = mine_g.predict(X_ax)\n", "\n", " ax.scatter(X_test.flatten(), Y_test.flatten(), c=\"b\", alpha=0.5)\n", " # s=0.5)\n", " ax.plot(\n", " X_ax.flatten(),\n", " y_pred_mine_test_g.flatten(),\n", " # linewidth=0.5,\n", " label=\"GB\".format(n_trees, n_feats, max_depth_d),\n", " color=\"red\",\n", " )\n", " ax.plot(\n", " X_ax.flatten(),\n", " y_pred_mine_test.flatten(),\n", " # linewidth=0.5,\n", " label=\"RF\".format(n_trees, n_feats, max_depth_r),\n", " color=\"cornflowerblue\",\n", " )\n", " ax.plot(\n", " X_ax.flatten(),\n", " y_pred_mine_test_d.flatten(),\n", " # linewidth=0.5,\n", " label=\"DT\".format(max_depth_d),\n", " color=\"yellowgreen\",\n", " )\n", " ax.set_title(\n", " \"GB: {:.1f} / RF: {:.1f} / DT: {:.1f} \".format(\n", " loss_mine_test_g, loss_mine_test, loss_mine_test_d\n", " )\n", " )\n", " ax.legend()\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", " plt.savefig(\"plot.png\", dpi=300)\n", " plt.close(\"all\")\n"]} {"path": "numpy_ml/plots/nonparametric_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# 
https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"paper\", font_scale=0.5)\n", "\n", "from numpy_ml.nonparametric import GPRegression, KNN, KernelRegression\n", "from numpy_ml.linear_models.lm import LinearRegression\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "def random_regression_problem(n_ex, n_in, n_out, d=3, intercept=0, std=1, seed=0):\n", " coef = np.random.uniform(0, 50, size=d)\n", " coef[-1] = intercept\n", "\n", " y = []\n", " X = np.random.uniform(-100, 100, size=(n_ex, n_in))\n", " for x in X:\n", " val = np.polyval(coef, x) + np.random.normal(0, std)\n", " y.append(val)\n", " y = np.array(y)\n", "\n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.3, random_state=seed\n", " )\n", " return X_train, y_train, X_test, y_test, coef\n", "\n", "\n", "def plot_regression():\n", " np.random.seed(12345)\n", " fig, axes = plt.subplots(4, 4)\n", " for i, ax in enumerate(axes.flatten()):\n", " n_in = 1\n", " n_out = 1\n", " d = np.random.randint(1, 5)\n", " n_ex = np.random.randint(5, 500)\n", " std = np.random.randint(0, 1000)\n", " intercept = np.random.rand() * np.random.randint(-300, 300)\n", " X_train, y_train, X_test, y_test, coefs = random_regression_problem(\n", " n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i\n", " )\n", "\n", " LR = LinearRegression(fit_intercept=True)\n", " LR.fit(X_train, y_train)\n", " y_pred = LR.predict(X_test)\n", " loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)\n", "\n", " d = 3\n", " best_loss = np.inf\n", " for gamma in np.linspace(1e-10, 1, 100):\n", " for c0 in np.linspace(-1, 1000, 100):\n", " kernel = \"PolynomialKernel(d={}, gamma={}, c0={})\".format(d, gamma, c0)\n", " KR_poly = KernelRegression(kernel=kernel)\n", " KR_poly.fit(X_train, y_train)\n", " y_pred_poly = KR_poly.predict(X_test)\n", " loss_poly = np.mean((y_test.flatten() - y_pred_poly.flatten()) ** 2)\n", " if loss_poly <= best_loss:\n", " KR_poly_best = kernel\n", " best_loss = loss_poly\n", "\n", " print(\"Best kernel: {} || loss: {:.4f}\".format(KR_poly_best, best_loss))\n", " KR_poly = KernelRegression(kernel=KR_poly_best)\n", " KR_poly.fit(X_train, y_train)\n", "\n", " KR_rbf = KernelRegression(kernel=\"RBFKernel(sigma=1)\")\n", " KR_rbf.fit(X_train, y_train)\n", " y_pred_rbf = KR_rbf.predict(X_test)\n", " loss_rbf = np.mean((y_test.flatten() - y_pred_rbf.flatten()) ** 2)\n", "\n", " xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))\n", " xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))\n", " X_plot = np.linspace(xmin, xmax, 100)\n", " y_plot = LR.predict(X_plot)\n", " y_plot_poly = KR_poly.predict(X_plot)\n", " y_plot_rbf = KR_rbf.predict(X_plot)\n", "\n", " ax.scatter(X_test, y_test, alpha=0.5)\n", " ax.plot(X_plot, y_plot, label=\"OLS\", alpha=0.5)\n", " ax.plot(\n", " X_plot, y_plot_poly, label=\"KR (poly kernel, d={})\".format(d), alpha=0.5\n", " )\n", " ax.plot(X_plot, y_plot_rbf, label=\"KR (rbf kernel)\", alpha=0.5)\n", " ax.legend()\n", " # ax.set_title(\n", " # \"MSE\\nLR: {:.2f} KR (poly): {:.2f}\\nKR (rbf): {:.2f}\".format(\n", " # loss, loss_poly, loss_rbf\n", " # )\n", " # )\n", "\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"img/kr_plots.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "def plot_knn():\n", " np.random.seed(12345)\n", " fig, axes = plt.subplots(4, 4)\n", " for i, ax in enumerate(axes.flatten()):\n", " n_in 
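# ---------------------------------------------------------------------------
# Hedged sketch (standalone, illustration only): a Nadaraya-Watson estimator
# with an RBF kernel, the kind of smoother that `KernelRegression` with
# `RBFKernel(sigma=1)` above presumably implements. Data is synthetic 1D.
# ---------------------------------------------------------------------------
import numpy as np

def nadaraya_watson(X_train, y_train, X_query, sigma=1.0):
    """Predict y at X_query as a kernel-weighted average of the training y."""
    d2 = (X_query[:, None] - X_train[None, :]) ** 2   # pairwise squared dists
    K = np.exp(-d2 / (2 * sigma ** 2))                # RBF kernel weights
    return (K @ y_train) / K.sum(axis=1)

rng = np.random.RandomState(6)
X_train = np.sort(rng.uniform(-3, 3, size=50))
y_train = np.sin(X_train) + 0.1 * rng.randn(50)
X_query = np.linspace(-3, 3, 200)
y_hat = nadaraya_watson(X_train, y_train, X_query, sigma=0.5)
print(y_hat.shape)                                    # (200,)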
= 1\n", " n_out = 1\n", " d = np.random.randint(1, 5)\n", " n_ex = np.random.randint(5, 500)\n", " std = np.random.randint(0, 1000)\n", " intercept = np.random.rand() * np.random.randint(-300, 300)\n", " X_train, y_train, X_test, y_test, coefs = random_regression_problem(\n", " n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i\n", " )\n", "\n", " LR = LinearRegression(fit_intercept=True)\n", " LR.fit(X_train, y_train)\n", " y_pred = LR.predict(X_test)\n", " loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)\n", "\n", " knn_1 = KNN(k=1, classifier=False, leaf_size=10, weights=\"uniform\")\n", " knn_1.fit(X_train, y_train)\n", " y_pred_1 = knn_1.predict(X_test)\n", " loss_1 = np.mean((y_test.flatten() - y_pred_1.flatten()) ** 2)\n", "\n", " knn_5 = KNN(k=5, classifier=False, leaf_size=10, weights=\"uniform\")\n", " knn_5.fit(X_train, y_train)\n", " y_pred_5 = knn_5.predict(X_test)\n", " loss_5 = np.mean((y_test.flatten() - y_pred_5.flatten()) ** 2)\n", "\n", " knn_10 = KNN(k=10, classifier=False, leaf_size=10, weights=\"uniform\")\n", " knn_10.fit(X_train, y_train)\n", " y_pred_10 = knn_10.predict(X_test)\n", " loss_10 = np.mean((y_test.flatten() - y_pred_10.flatten()) ** 2)\n", "\n", " xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))\n", " xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))\n", " X_plot = np.linspace(xmin, xmax, 100)\n", " y_plot = LR.predict(X_plot)\n", " y_plot_1 = knn_1.predict(X_plot)\n", " y_plot_5 = knn_5.predict(X_plot)\n", " y_plot_10 = knn_10.predict(X_plot)\n", "\n", " ax.scatter(X_test, y_test, alpha=0.5)\n", " ax.plot(X_plot, y_plot, label=\"OLS\", alpha=0.5)\n", " ax.plot(X_plot, y_plot_1, label=\"KNN (k=1)\", alpha=0.5)\n", " ax.plot(X_plot, y_plot_5, label=\"KNN (k=5)\", alpha=0.5)\n", " ax.plot(X_plot, y_plot_10, label=\"KNN (k=10)\", alpha=0.5)\n", " ax.legend()\n", " # ax.set_title(\n", " # \"MSE\\nLR: {:.2f} KR (poly): {:.2f}\\nKR (rbf): {:.2f}\".format(\n", " # loss, loss_poly, loss_rbf\n", " # )\n", " # )\n", "\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"img/knn_plots.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "def plot_gp():\n", " np.random.seed(12345)\n", " sns.set_context(\"paper\", font_scale=0.65)\n", "\n", " X_test = np.linspace(-10, 10, 100)\n", " X_train = np.array([-3, 0, 7, 1, -9])\n", " y_train = np.sin(X_train)\n", "\n", " fig, axes = plt.subplots(2, 2)\n", " alphas = [0, 1e-10, 1e-5, 1]\n", " for ix, (ax, alpha) in enumerate(zip(axes.flatten(), alphas)):\n", " G = GPRegression(kernel=\"RBFKernel\", alpha=alpha)\n", " G.fit(X_train, y_train)\n", " y_pred, conf = G.predict(X_test)\n", "\n", " ax.plot(X_train, y_train, \"rx\", label=\"observed\")\n", " ax.plot(X_test, np.sin(X_test), label=\"true fn\")\n", " ax.plot(X_test, y_pred, \"--\", label=\"MAP (alpha={})\".format(alpha))\n", " ax.fill_between(X_test, y_pred + conf, y_pred - conf, alpha=0.1)\n", " ax.set_xticks([])\n", " ax.set_yticks([])\n", " sns.despine()\n", "\n", " ax.legend()\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"img/gp_alpha.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "def plot_gp_dist():\n", " np.random.seed(12345)\n", " sns.set_context(\"paper\", font_scale=0.95)\n", "\n", " X_test = np.linspace(-10, 10, 100)\n", " X_train = np.array([-3, 0, 7, 1, -9])\n", " y_train = np.sin(X_train)\n", "\n", " fig, axes = plt.subplots(1, 3)\n", " G = GPRegression(kernel=\"RBFKernel\", alpha=0)\n", " G.fit(X_train, y_train)\n", "\n", " y_pred_prior = 
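# ---------------------------------------------------------------------------
# Hedged sketch (standalone, not the library code): the textbook GP posterior
# mean and covariance under an RBF kernel, the quantities `GPRegression.predict`
# above presumably returns (up to its `alpha` noise term and the exact
# confidence scaling). Training points mirror the plots above.
# ---------------------------------------------------------------------------
import numpy as np

def rbf(A, B, sigma=1.0):
    """RBF kernel matrix between two sets of 1D points."""
    return np.exp(-((A[:, None] - B[None, :]) ** 2) / (2 * sigma ** 2))

X_train = np.array([-3.0, 0.0, 7.0, 1.0, -9.0])
y_train = np.sin(X_train)
X_test = np.linspace(-10, 10, 100)

alpha = 1e-10                                     # jitter / observation noise
K = rbf(X_train, X_train) + alpha * np.eye(len(X_train))
K_s = rbf(X_test, X_train)
K_ss = rbf(X_test, X_test)

mean = K_s @ np.linalg.solve(K, y_train)          # posterior mean
cov = K_ss - K_s @ np.linalg.solve(K, K_s.T)      # posterior covariance
std = np.sqrt(np.clip(np.diag(cov), 0.0, None))   # pointwise uncertainty
print(mean.shape, std.shape)                      # (100,), (100,)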
G.sample(X_test, 3, \"prior\")\n", " y_pred_posterior = G.sample(X_test, 3, \"posterior_predictive\")\n", "\n", " for prior_sample in y_pred_prior:\n", " axes[0].plot(X_test, prior_sample.ravel(), lw=1)\n", " axes[0].set_title(\"Prior samples\")\n", " axes[0].set_xticks([])\n", " axes[0].set_yticks([])\n", "\n", " for post_sample in y_pred_posterior:\n", " axes[1].plot(X_test, post_sample.ravel(), lw=1)\n", " axes[1].plot(X_train, y_train, \"ko\", ms=1.2)\n", " axes[1].set_title(\"Posterior samples\")\n", " axes[1].set_xticks([])\n", " axes[1].set_yticks([])\n", "\n", " y_pred, conf = G.predict(X_test)\n", "\n", " axes[2].plot(X_test, np.sin(X_test), lw=1, label=\"true function\")\n", " axes[2].plot(X_test, y_pred, lw=1, label=\"MAP estimate\")\n", " axes[2].fill_between(X_test, y_pred + conf, y_pred - conf, alpha=0.1)\n", " axes[2].plot(X_train, y_train, \"ko\", ms=1.2, label=\"observed\")\n", " axes[2].legend(fontsize=\"x-small\")\n", " axes[2].set_title(\"Posterior mean\")\n", " axes[2].set_xticks([])\n", " axes[2].set_yticks([])\n", "\n", " fig.set_size_inches(6, 2)\n", " plt.tight_layout()\n", " plt.savefig(\"img/gp_dist.png\", dpi=300)\n", " plt.close(\"all\")\n"]} {"path": "numpy_ml/plots/bandit_plots.py", "content": ["\"\"\"Miscellaneous plots for multi-arm bandit validation\"\"\"\n", "\n", "from collections import namedtuple\n", "\n", "import numpy as np\n", "\n", "from numpy_ml.bandits import (\n", " MultinomialBandit,\n", " BernoulliBandit,\n", " ShortestPathBandit,\n", " ContextualLinearBandit,\n", ")\n", "from numpy_ml.bandits.trainer import BanditTrainer\n", "from numpy_ml.bandits.policies import (\n", " EpsilonGreedy,\n", " UCB1,\n", " ThompsonSamplingBetaBinomial,\n", " LinUCB,\n", ")\n", "from numpy_ml.utils.graphs import random_DAG, DiGraph, Edge\n", "\n", "\n", "def random_multinomial_mab(n_arms=10, n_choices_per_arm=5, reward_range=[0, 1]):\n", " \"\"\"Generate a random multinomial multi-armed bandit environemt\"\"\"\n", " payoffs = []\n", " payoff_probs = []\n", " lo, hi = reward_range\n", " for a in range(n_arms):\n", " p = np.random.uniform(size=n_choices_per_arm)\n", " p = p / p.sum()\n", " r = np.random.uniform(low=lo, high=hi, size=n_choices_per_arm)\n", "\n", " payoffs.append(list(r))\n", " payoff_probs.append(list(p))\n", "\n", " return MultinomialBandit(payoffs, payoff_probs)\n", "\n", "\n", "def random_bernoulli_mab(n_arms=10):\n", " \"\"\"Generate a random Bernoulli multi-armed bandit environemt\"\"\"\n", " p = np.random.uniform(size=n_arms)\n", " payoff_probs = p / p.sum()\n", " return BernoulliBandit(payoff_probs)\n", "\n", "\n", "def plot_epsilon_greedy_multinomial_payoff():\n", " \"\"\"\n", " Evaluate an epsilon-greedy policy on a random multinomial bandit\n", " problem\n", " \"\"\"\n", " np.random.seed(12345)\n", " N = np.random.randint(2, 30) # n arms\n", " K = np.random.randint(2, 10) # n payoffs / arm\n", " ep_length = 1\n", "\n", " rrange = [0, 1]\n", " n_duplicates = 5\n", " n_episodes = 5000\n", "\n", " mab = random_multinomial_mab(N, K, rrange)\n", " policy = EpsilonGreedy(epsilon=0.05, ev_prior=rrange[1] / 2)\n", " policy = BanditTrainer().train(policy, mab, ep_length, n_episodes, n_duplicates)\n", "\n", "\n", "def plot_ucb1_multinomial_payoff():\n", " \"\"\"Evaluate the UCB1 policy on a multinomial bandit environment\"\"\"\n", " np.random.seed(12345)\n", " N = np.random.randint(2, 30) # n arms\n", " K = np.random.randint(2, 10) # n payoffs / arm\n", " ep_length = 1\n", "\n", " C = 1\n", " rrange = [0, 1]\n", " n_duplicates = 5\n", " n_episodes = 
5000\n", "\n", " mab = random_multinomial_mab(N, K, rrange)\n", " policy = UCB1(C=C, ev_prior=rrange[1] / 2)\n", " policy = BanditTrainer().train(policy, mab, ep_length, n_episodes, n_duplicates)\n", "\n", "\n", "def plot_thompson_sampling_beta_binomial_payoff():\n", " \"\"\"\n", " Evaluate the ThompsonSamplingBetaBinomial policy on a random Bernoulli\n", " multi-armed bandit.\n", " \"\"\"\n", " np.random.seed(12345)\n", " N = np.random.randint(2, 30) # n arms\n", " ep_length = 1\n", "\n", " n_duplicates = 5\n", " n_episodes = 5000\n", "\n", " mab = random_bernoulli_mab(N)\n", " policy = ThompsonSamplingBetaBinomial(alpha=1, beta=1)\n", " policy = BanditTrainer().train(policy, mab, ep_length, n_episodes, n_duplicates)\n", "\n", "\n", "def plot_lin_ucb():\n", " \"\"\"Plot the linUCB policy on a contextual linear bandit problem\"\"\"\n", " np.random.seed(12345)\n", " ep_length = 1\n", " K = np.random.randint(2, 25)\n", " D = np.random.randint(2, 10)\n", "\n", " n_duplicates = 5\n", " n_episodes = 5000\n", "\n", " cmab = ContextualLinearBandit(K, D, 1)\n", " policy = LinUCB(alpha=1)\n", " policy = BanditTrainer().train(policy, cmab, ep_length, n_episodes, n_duplicates)\n", "\n", "\n", "def plot_ucb1_gaussian_shortest_path():\n", " \"\"\"\n", " Plot the UCB1 policy on a graph shortest path problem each edge weight\n", " drawn from an independent univariate Gaussian\n", " \"\"\"\n", " np.random.seed(12345)\n", "\n", " ep_length = 1\n", " n_duplicates = 5\n", " n_episodes = 5000\n", " p = np.random.rand()\n", " n_vertices = np.random.randint(5, 15)\n", "\n", " Gaussian = namedtuple(\"Gaussian\", [\"mean\", \"variance\", \"EV\", \"sample\"])\n", "\n", " # create randomly-weighted edges\n", " print(\"Building graph\")\n", " E = []\n", " G = random_DAG(n_vertices, p)\n", " V = G.vertices\n", " for e in G.edges:\n", " mean, var = np.random.uniform(0, 1), np.random.uniform(0, 1)\n", " w = lambda: np.random.normal(mean, var) # noqa: E731\n", " rv = Gaussian(mean, var, mean, w)\n", " E.append(Edge(e.fr, e.to, rv))\n", "\n", " G = DiGraph(V, E)\n", " while not G.path_exists(V[0], V[-1]):\n", " print(\"Skipping\")\n", " idx = np.random.randint(0, len(V))\n", " V[idx], V[-1] = V[-1], V[idx]\n", "\n", " mab = ShortestPathBandit(G, V[0], V[-1])\n", " policy = UCB1(C=1, ev_prior=0.5)\n", " policy = BanditTrainer().train(policy, mab, ep_length, n_episodes, n_duplicates)\n", "\n", "\n", "def plot_comparison():\n", " \"\"\"\n", " Use the BanditTrainer to compare several policies on the same bandit\n", " problem\n", " \"\"\"\n", " np.random.seed(1234)\n", " ep_length = 1\n", " K = 10\n", "\n", " n_duplicates = 5\n", " n_episodes = 5000\n", "\n", " cmab = random_bernoulli_mab(n_arms=K)\n", " policy1 = EpsilonGreedy(epsilon=0.05, ev_prior=0.5)\n", " policy2 = UCB1(C=1, ev_prior=0.5)\n", " policy3 = ThompsonSamplingBetaBinomial(alpha=1, beta=1)\n", " policies = [policy1, policy2, policy3]\n", "\n", " BanditTrainer().compare(\n", " policies, cmab, ep_length, n_episodes, n_duplicates,\n", " )\n"]} {"path": "numpy_ml/plots/rl_plots.py", "content": ["# flake8: noqa\n", "import gym\n", "\n", "from numpy_ml.rl_models.trainer import Trainer\n", "from numpy_ml.rl_models.agents import (\n", " CrossEntropyAgent,\n", " MonteCarloAgent,\n", " TemporalDifferenceAgent,\n", " DynaAgent,\n", ")\n", "\n", "\n", "def test_cross_entropy_agent():\n", " seed = 12345\n", " max_steps = 300\n", " n_episodes = 50\n", " retain_prcnt = 0.2\n", " n_samples_per_episode = 500\n", " env = gym.make(\"LunarLander-v2\")\n", "\n", " agent = 
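# ---------------------------------------------------------------------------
# Hedged sketch (standalone toy, not the library's BanditTrainer): one common
# form of the UCB1 selection rule on a Bernoulli bandit,
#   arm = argmax_a Q_a + C * sqrt(2 ln t / N_a),
# just to make the quantity the evaluations above track concrete. The C
# multiplier and all data below are illustrative assumptions.
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(7)
true_p = rng.uniform(size=5)                  # unknown arm payoff probabilities
C, n_steps = 1.0, 5000
Q = np.zeros(5)                               # running mean reward per arm
N = np.zeros(5)                               # pull counts per arm

for t in range(1, n_steps + 1):
    if np.any(N == 0):                        # pull each arm once to start
        arm = int(np.argmin(N))
    else:
        arm = int(np.argmax(Q + C * np.sqrt(2 * np.log(t) / N)))
    reward = float(rng.rand() < true_p[arm])
    N[arm] += 1
    Q[arm] += (reward - Q[arm]) / N[arm]      # incremental mean update

print("best arm:", int(np.argmax(true_p)), "most pulled:", int(np.argmax(N)))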
CrossEntropyAgent(env, n_samples_per_episode, retain_prcnt)\n", " trainer = Trainer(agent, env)\n", " trainer.train(\n", " n_episodes, max_steps, seed=seed, plot=True, verbose=True, render_every=None\n", " )\n", "\n", "\n", "def test_monte_carlo_agent():\n", " seed = 12345\n", " max_steps = 300\n", " n_episodes = 10000\n", "\n", " epsilon = 0.05\n", " off_policy = True\n", " smooth_factor = 0.001\n", " temporal_discount = 0.95\n", " env = gym.make(\"Copy-v0\")\n", "\n", " agent = MonteCarloAgent(env, off_policy, temporal_discount, epsilon)\n", " trainer = Trainer(agent, env)\n", " trainer.train(\n", " n_episodes,\n", " max_steps,\n", " seed=seed,\n", " plot=True,\n", " verbose=True,\n", " render_every=None,\n", " smooth_factor=smooth_factor,\n", " )\n", "\n", "\n", "def test_temporal_difference_agent():\n", " seed = 12345\n", " max_steps = 200\n", " n_episodes = 5000\n", "\n", " lr = 0.4\n", " n_tilings = 10\n", " epsilon = 0.10\n", " off_policy = True\n", " grid_dims = [100, 100]\n", " smooth_factor = 0.005\n", " temporal_discount = 0.999\n", " env = gym.make(\"LunarLander-v2\")\n", " obs_max = 1\n", " obs_min = -1\n", "\n", " agent = TemporalDifferenceAgent(\n", " env,\n", " lr=lr,\n", " obs_max=obs_max,\n", " obs_min=obs_min,\n", " epsilon=epsilon,\n", " n_tilings=n_tilings,\n", " grid_dims=grid_dims,\n", " off_policy=off_policy,\n", " temporal_discount=temporal_discount,\n", " )\n", "\n", " trainer = Trainer(agent, env)\n", " trainer.train(\n", " n_episodes,\n", " max_steps,\n", " seed=seed,\n", " plot=True,\n", " verbose=True,\n", " render_every=None,\n", " smooth_factor=smooth_factor,\n", " )\n", "\n", "\n", "def test_dyna_agent():\n", " seed = 12345\n", " max_steps = 200\n", " n_episodes = 150\n", "\n", " lr = 0.4\n", " q_plus = False\n", " n_tilings = 10\n", " epsilon = 0.10\n", " grid_dims = [10, 10]\n", " smooth_factor = 0.01\n", " temporal_discount = 0.99\n", " explore_weight = 0.05\n", " n_simulated_actions = 25\n", "\n", " obs_max, obs_min = 1, -1\n", " env = gym.make(\"Taxi-v2\")\n", "\n", " agent = DynaAgent(\n", " env,\n", " lr=lr,\n", " q_plus=q_plus,\n", " obs_max=obs_max,\n", " obs_min=obs_min,\n", " epsilon=epsilon,\n", " n_tilings=n_tilings,\n", " grid_dims=grid_dims,\n", " explore_weight=explore_weight,\n", " temporal_discount=temporal_discount,\n", " n_simulated_actions=n_simulated_actions,\n", " )\n", "\n", " trainer = Trainer(agent, env)\n", " trainer.train(\n", " n_episodes,\n", " max_steps,\n", " seed=seed,\n", " plot=True,\n", " verbose=True,\n", " render_every=None,\n", " smooth_factor=smooth_factor,\n", " )\n"]} {"path": "numpy_ml/plots/ngram_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"notebook\", font_scale=1)\n", "\n", "from numpy_ml.ngram import MLENGram, AdditiveNGram, GoodTuringNGram\n", "\n", "\n", "def plot_count_models(GT, N):\n", " NC = GT._num_grams_with_count\n", " mod = GT._count_models[N]\n", " max_n = max(GT.counts[N].values())\n", " emp = [NC(n + 1, N) for n in range(max_n)]\n", " prd = [np.exp(mod.predict(np.array([n + 1]))) for n in range(max_n + 10)]\n", " plt.scatter(range(max_n), emp, c=\"r\", label=\"actual\")\n", " plt.plot(range(max_n + 10), prd, \"-\", label=\"model\")\n", " plt.ylim([-1, 100])\n", " plt.xlabel(\"Count ($r$)\")\n", " 
plt.ylabel(\"Count-of-counts ($N_r$)\")\n", " plt.legend()\n", " plt.savefig(\"test.png\")\n", " plt.close()\n", "\n", "\n", "def compare_probs(fp, N):\n", " MLE = MLENGram(N, unk=False, filter_punctuation=False, filter_stopwords=False)\n", " MLE.train(fp, encoding=\"utf-8-sig\")\n", "\n", " add_y, mle_y, gtt_y = [], [], []\n", " addu_y, mleu_y, gttu_y = [], [], []\n", " seen = (\"\", \"the\")\n", " unseen = (\"\", \"asdf\")\n", "\n", " GTT = GoodTuringNGram(\n", " N, conf=1.96, unk=False, filter_stopwords=False, filter_punctuation=False\n", " )\n", " GTT.train(fp, encoding=\"utf-8-sig\")\n", "\n", " gtt_prob = GTT.log_prob(seen, N)\n", " gtt_prob_u = GTT.log_prob(unseen, N)\n", "\n", " for K in np.linspace(0, 10, 20):\n", " ADD = AdditiveNGram(\n", " N, K, unk=False, filter_punctuation=False, filter_stopwords=False\n", " )\n", " ADD.train(fp, encoding=\"utf-8-sig\")\n", "\n", " add_prob = ADD.log_prob(seen, N)\n", " mle_prob = MLE.log_prob(seen, N)\n", "\n", " add_y.append(add_prob)\n", " mle_y.append(mle_prob)\n", " gtt_y.append(gtt_prob)\n", "\n", " mle_prob_u = MLE.log_prob(unseen, N)\n", " add_prob_u = ADD.log_prob(unseen, N)\n", "\n", " addu_y.append(add_prob_u)\n", " mleu_y.append(mle_prob_u)\n", " gttu_y.append(gtt_prob_u)\n", "\n", " plt.plot(np.linspace(0, 10, 20), add_y, label=\"Additive (seen ngram)\")\n", " plt.plot(np.linspace(0, 10, 20), addu_y, label=\"Additive (unseen ngram)\")\n", " # plt.plot(np.linspace(0, 10, 20), gtt_y, label=\"Good-Turing (seen ngram)\")\n", " # plt.plot(np.linspace(0, 10, 20), gttu_y, label=\"Good-Turing (unseen ngram)\")\n", " plt.plot(np.linspace(0, 10, 20), mle_y, \"--\", label=\"MLE (seen ngram)\")\n", " plt.xlabel(\"K\")\n", " plt.ylabel(\"log P(sequence)\")\n", " plt.legend()\n", " plt.savefig(\"img/add_smooth.png\")\n", " plt.close(\"all\")\n", "\n", "\n", "def plot_gt_freqs(fp):\n", " \"\"\"\n", " Draws a scatterplot of the empirical frequencies of the counted species\n", " versus their Simple Good Turing smoothed values, in rank order. 
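# ---------------------------------------------------------------------------
# Hedged sketch (standalone toy): additive / Laplace smoothing for a unigram
# model, P(w) = (count(w) + K) / (total + K * V), the quantity the
# AdditiveNGram comparison above sweeps K over (shown here for unigrams only;
# the toy corpus and vocabulary handling are illustrative assumptions).
# ---------------------------------------------------------------------------
import numpy as np
from collections import Counter

tokens = "the cat sat on the mat the end".split()
counts = Counter(tokens)
V = len(counts) + 1                          # vocabulary size (+1 for unseen)
total = sum(counts.values())

def additive_log_prob(word, K=1.0):
    """Log-probability of `word` under additive smoothing with pseudocount K."""
    return np.log((counts.get(word, 0) + K) / (total + K * V))

print(additive_log_prob("the"), additive_log_prob("asdf"))   # seen vs unseen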
Depends on\n", " pylab and matplotlib.\n", " \"\"\"\n", " MLE = MLENGram(1, filter_punctuation=False, filter_stopwords=False)\n", " MLE.train(fp, encoding=\"utf-8-sig\")\n", " counts = dict(MLE.counts[1])\n", "\n", " GT = GoodTuringNGram(1, filter_stopwords=False, filter_punctuation=False)\n", " GT.train(fp, encoding=\"utf-8-sig\")\n", "\n", " ADD = AdditiveNGram(1, 1, filter_punctuation=False, filter_stopwords=False)\n", " ADD.train(fp, encoding=\"utf-8-sig\")\n", "\n", " tot = float(sum(counts.values()))\n", " freqs = dict([(token, cnt / tot) for token, cnt in counts.items()])\n", " sgt_probs = dict([(tok, np.exp(GT.log_prob(tok, 1))) for tok in counts.keys()])\n", " as_probs = dict([(tok, np.exp(ADD.log_prob(tok, 1))) for tok in counts.keys()])\n", "\n", " X, Y = np.arange(len(freqs)), sorted(freqs.values(), reverse=True)\n", " plt.loglog(X, Y, \"k+\", alpha=0.25, label=\"MLE\")\n", "\n", " X, Y = np.arange(len(sgt_probs)), sorted(sgt_probs.values(), reverse=True)\n", " plt.loglog(X, Y, \"r+\", alpha=0.25, label=\"simple Good-Turing\")\n", "\n", " X, Y = np.arange(len(as_probs)), sorted(as_probs.values(), reverse=True)\n", " plt.loglog(X, Y, \"b+\", alpha=0.25, label=\"Laplace smoothing\")\n", "\n", " plt.xlabel(\"Rank\")\n", " plt.ylabel(\"Probability\")\n", " plt.legend()\n", " plt.tight_layout()\n", " plt.savefig(\"img/rank_probs.png\")\n", " plt.close(\"all\")\n"]} {"path": "numpy_ml/plots/lm_plots.py", "content": ["# flake8: noqa\n", "import numpy as np\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets.samples_generator import make_blobs\n", "from sklearn.linear_model import LogisticRegression as LogisticRegression_sk\n", "from sklearn.datasets import make_regression\n", "from sklearn.metrics import zero_one_loss, r2_score\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "import seaborn as sns\n", "\n", "# https://seaborn.pydata.org/generated/seaborn.set_context.html\n", "# https://seaborn.pydata.org/generated/seaborn.set_style.html\n", "sns.set_style(\"white\")\n", "sns.set_context(\"paper\", font_scale=0.5)\n", "\n", "\n", "from numpy_ml.linear_models import (\n", " RidgeRegression,\n", " LinearRegression,\n", " BayesianLinearRegressionKnownVariance,\n", " BayesianLinearRegressionUnknownVariance,\n", " LogisticRegression,\n", ")\n", "\n", "#######################################################################\n", "# Data Generators #\n", "#######################################################################\n", "\n", "\n", "def random_binary_tensor(shape, sparsity=0.5):\n", " X = (np.random.rand(*shape) >= (1 - sparsity)).astype(float)\n", " return X\n", "\n", "\n", "def random_regression_problem(n_ex, n_in, n_out, intercept=0, std=1, seed=0):\n", " X, y, coef = make_regression(\n", " n_samples=n_ex,\n", " n_features=n_in,\n", " n_targets=n_out,\n", " bias=intercept,\n", " noise=std,\n", " coef=True,\n", " random_state=seed,\n", " )\n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.3, random_state=seed\n", " )\n", " return X_train, y_train, X_test, y_test, coef\n", "\n", "\n", "def random_classification_problem(n_ex, n_classes, n_in, seed=0):\n", " X, y = make_blobs(\n", " n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=seed\n", " )\n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.3, random_state=seed\n", " )\n", " return X_train, y_train, X_test, y_test\n", "\n", "\n", "#######################################################################\n", "# 
Plots #\n", "#######################################################################\n", "\n", "\n", "def plot_logistic():\n", " np.random.seed(12345)\n", "\n", " fig, axes = plt.subplots(4, 4)\n", " for i, ax in enumerate(axes.flatten()):\n", " n_in = 1\n", " n_ex = 150\n", " X_train, y_train, X_test, y_test = random_classification_problem(\n", " n_ex, n_classes=2, n_in=n_in, seed=i\n", " )\n", " LR = LogisticRegression(penalty=\"l2\", gamma=0.2, fit_intercept=True)\n", " LR.fit(X_train, y_train, lr=0.1, tol=1e-7, max_iter=1e7)\n", " y_pred = (LR.predict(X_test) >= 0.5) * 1.0\n", " loss = zero_one_loss(y_test, y_pred) * 100.0\n", "\n", " LR_sk = LogisticRegression_sk(\n", " penalty=\"l2\", tol=0.0001, C=0.8, fit_intercept=True, random_state=i\n", " )\n", " LR_sk.fit(X_train, y_train)\n", " y_pred_sk = (LR_sk.predict(X_test) >= 0.5) * 1.0\n", " loss_sk = zero_one_loss(y_test, y_pred_sk) * 100.0\n", "\n", " xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))\n", " xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))\n", " X_plot = np.linspace(xmin, xmax, 100)\n", " y_plot = LR.predict(X_plot)\n", " y_plot_sk = LR_sk.predict_proba(X_plot.reshape(-1, 1))[:, 1]\n", "\n", " ax.scatter(X_test[y_pred == 0], y_test[y_pred == 0], alpha=0.5)\n", " ax.scatter(X_test[y_pred == 1], y_test[y_pred == 1], alpha=0.5)\n", " ax.plot(X_plot, y_plot, label=\"mine\", alpha=0.75)\n", " ax.plot(X_plot, y_plot_sk, label=\"sklearn\", alpha=0.75)\n", " ax.legend()\n", " ax.set_title(\"Loss mine: {:.2f} Loss sklearn: {:.2f}\".format(loss, loss_sk))\n", "\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"plot_logistic.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "def plot_bayes():\n", " np.random.seed(12345)\n", " n_in = 1\n", " n_out = 1\n", " n_ex = 20\n", " std = 15\n", " intercept = 10\n", " X_train, y_train, X_test, y_test, coefs = random_regression_problem(\n", " n_ex, n_in, n_out, intercept=intercept, std=std, seed=0\n", " )\n", "\n", " # add some outliers\n", " x1, x2 = X_train[0] + 0.5, X_train[6] - 0.3\n", " y1 = np.dot(x1, coefs) + intercept + 25\n", " y2 = np.dot(x2, coefs) + intercept - 31\n", " X_train = np.vstack([X_train, np.array([x1, x2])])\n", " y_train = np.hstack([y_train, [y1[0], y2[0]]])\n", "\n", " LR = LinearRegression(fit_intercept=True)\n", " LR.fit(X_train, y_train)\n", " y_pred = LR.predict(X_test)\n", " loss = np.mean((y_test - y_pred) ** 2)\n", "\n", " ridge = RidgeRegression(alpha=1, fit_intercept=True)\n", " ridge.fit(X_train, y_train)\n", " y_pred = ridge.predict(X_test)\n", " loss_ridge = np.mean((y_test - y_pred) ** 2)\n", "\n", " LR_var = BayesianLinearRegressionKnownVariance(\n", " mu=np.c_[intercept, coefs][0], sigma=np.sqrt(std), V=None, fit_intercept=True,\n", " )\n", " LR_var.fit(X_train, y_train)\n", " y_pred_var = LR_var.predict(X_test)\n", " loss_var = np.mean((y_test - y_pred_var) ** 2)\n", "\n", " LR_novar = BayesianLinearRegressionUnknownVariance(\n", " alpha=1, beta=2, mu=np.c_[intercept, coefs][0], V=None, fit_intercept=True\n", " )\n", " LR_novar.fit(X_train, y_train)\n", " y_pred_novar = LR_novar.predict(X_test)\n", " loss_novar = np.mean((y_test - y_pred_novar) ** 2)\n", "\n", " xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))\n", " xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))\n", " X_plot = np.linspace(xmin, xmax, 100)\n", " y_plot = LR.predict(X_plot)\n", " y_plot_ridge = ridge.predict(X_plot)\n", " y_plot_var = LR_var.predict(X_plot)\n", " y_plot_novar = 
LR_novar.predict(X_plot)\n", "\n", " y_true = [np.dot(x, coefs) + intercept for x in X_plot]\n", " fig, axes = plt.subplots(1, 4)\n", "\n", " axes = axes.flatten()\n", " axes[0].scatter(X_test, y_test)\n", " axes[0].plot(X_plot, y_plot, label=\"MLE\")\n", " axes[0].plot(X_plot, y_true, label=\"True fn\")\n", " axes[0].set_title(\"Linear Regression\\nMLE Test MSE: {:.2f}\".format(loss))\n", " axes[0].legend()\n", " # axes[0].fill_between(X_plot, y_plot - error, y_plot + error)\n", "\n", " axes[1].scatter(X_test, y_test)\n", " axes[1].plot(X_plot, y_plot_ridge, label=\"MLE\")\n", " axes[1].plot(X_plot, y_true, label=\"True fn\")\n", " axes[1].set_title(\n", " \"Ridge Regression (alpha=1)\\nMLE Test MSE: {:.2f}\".format(loss_ridge)\n", " )\n", " axes[1].legend()\n", "\n", " axes[2].plot(X_plot, y_plot_var, label=\"MAP\")\n", " mu, cov = LR_var.posterior[\"b\"].mean, LR_var.posterior[\"b\"].cov\n", " for k in range(200):\n", " b_samp = np.random.multivariate_normal(mu, cov)\n", " y_samp = [np.dot(x, b_samp[1]) + b_samp[0] for x in X_plot]\n", " axes[2].plot(X_plot, y_samp, alpha=0.05)\n", " axes[2].scatter(X_test, y_test)\n", " axes[2].plot(X_plot, y_true, label=\"True fn\")\n", " axes[2].legend()\n", " axes[2].set_title(\n", " \"Bayesian Regression (known variance)\\nMAP Test MSE: {:.2f}\".format(loss_var)\n", " )\n", "\n", " axes[3].plot(X_plot, y_plot_novar, label=\"MAP\")\n", " mu = LR_novar.posterior[\"b | sigma**2\"].mean\n", " cov = LR_novar.posterior[\"b | sigma**2\"].cov\n", " for k in range(200):\n", " b_samp = np.random.multivariate_normal(mu, cov)\n", " y_samp = [np.dot(x, b_samp[1]) + b_samp[0] for x in X_plot]\n", " axes[3].plot(X_plot, y_samp, alpha=0.05)\n", " axes[3].scatter(X_test, y_test)\n", " axes[3].plot(X_plot, y_true, label=\"True fn\")\n", " axes[3].legend()\n", " axes[3].set_title(\n", " \"Bayesian Regression (unknown variance)\\nMAP Test MSE: {:.2f}\".format(\n", " loss_novar\n", " )\n", " )\n", "\n", " for ax in axes:\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", "\n", " fig.set_size_inches(7.5, 1.875)\n", " plt.savefig(\"plot_bayes.png\", dpi=300)\n", " plt.close(\"all\")\n", "\n", "\n", "def plot_regression():\n", " np.random.seed(12345)\n", "\n", " fig, axes = plt.subplots(4, 4)\n", " for i, ax in enumerate(axes.flatten()):\n", " n_in = 1\n", " n_out = 1\n", " n_ex = 50\n", " std = np.random.randint(0, 100)\n", " intercept = np.random.rand() * np.random.randint(-300, 300)\n", " X_train, y_train, X_test, y_test, coefs = random_regression_problem(\n", " n_ex, n_in, n_out, intercept=intercept, std=std, seed=i\n", " )\n", "\n", " LR = LinearRegression(fit_intercept=True)\n", " LR.fit(X_train, y_train)\n", " y_pred = LR.predict(X_test)\n", " loss = np.mean((y_test - y_pred) ** 2)\n", " r2 = r2_score(y_test, y_pred)\n", "\n", " LR_var = BayesianLinearRegressionKnownVariance(\n", " mu=np.c_[intercept, coefs][0],\n", " sigma=np.sqrt(std),\n", " V=None,\n", " fit_intercept=True,\n", " )\n", " LR_var.fit(X_train, y_train)\n", " y_pred_var = LR_var.predict(X_test)\n", " loss_var = np.mean((y_test - y_pred_var) ** 2)\n", " r2_var = r2_score(y_test, y_pred_var)\n", "\n", " LR_novar = BayesianLinearRegressionUnknownVariance(\n", " alpha=1, beta=2, mu=np.c_[intercept, coefs][0], V=None, fit_intercept=True,\n", " )\n", " LR_novar.fit(X_train, y_train)\n", " y_pred_novar = LR_novar.predict(X_test)\n", " loss_novar = np.mean((y_test - y_pred_novar) ** 2)\n", " r2_novar = r2_score(y_test, y_pred_novar)\n", "\n", " xmin = min(X_test) - 0.1 * (max(X_test) - 
min(X_test))\n", " xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))\n", " X_plot = np.linspace(xmin, xmax, 100)\n", " y_plot = LR.predict(X_plot)\n", " y_plot_var = LR_var.predict(X_plot)\n", " y_plot_novar = LR_novar.predict(X_plot)\n", "\n", " ax.scatter(X_test, y_test, marker=\"x\", alpha=0.5)\n", " ax.plot(X_plot, y_plot, label=\"linear regression\", alpha=0.5)\n", " ax.plot(X_plot, y_plot_var, label=\"Bayes (w var)\", alpha=0.5)\n", " ax.plot(X_plot, y_plot_novar, label=\"Bayes (no var)\", alpha=0.5)\n", " ax.legend()\n", " ax.set_title(\n", " \"MSE\\nLR: {:.2f} Bayes (w var): {:.2f}\\nBayes (no var): {:.2f}\".format(\n", " loss, loss_var, loss_novar\n", " )\n", " )\n", "\n", " ax.xaxis.set_ticklabels([])\n", " ax.yaxis.set_ticklabels([])\n", "\n", " plt.tight_layout()\n", " plt.savefig(\"plot_regression.png\", dpi=300)\n", " plt.close(\"all\")\n"]}