NTT123 committed
Commit d1a84ee
1 parent: df1ad02

add fast cpp wavegru

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50):
  1. BUILD +44 -0
  2. WORKSPACE +154 -0
  3. app.py +12 -1
  4. inference.py +7 -6
  5. packages.txt +2 -1
  6. sparse_matmul/BUILD +22 -0
  7. sparse_matmul/compute/BUILD +88 -0
  8. sparse_matmul/compute/ar_inputs.h +37 -0
  9. sparse_matmul/compute/gru_gates.h +214 -0
  10. sparse_matmul/compute/gru_gates_arm.h +288 -0
  11. sparse_matmul/compute/gru_gates_avx_fixed.h +348 -0
  12. sparse_matmul/compute/gru_gates_generic.h +97 -0
  13. sparse_matmul/compute/gru_gates_test.cc +164 -0
  14. sparse_matmul/compute/kernels_arm.h +0 -0
  15. sparse_matmul/compute/kernels_avx.h +601 -0
  16. sparse_matmul/compute/kernels_generic.h +273 -0
  17. sparse_matmul/compute/matmul.h +199 -0
  18. sparse_matmul/compute/matmul_fixed_avx2.cc +235 -0
  19. sparse_matmul/compute/matmul_fixed_avx2.h +49 -0
  20. sparse_matmul/compute/matmul_generic.cc +122 -0
  21. sparse_matmul/compute/matmul_generic.h +41 -0
  22. sparse_matmul/compute/thread_bounds.cc +106 -0
  23. sparse_matmul/compute/thread_bounds.h +74 -0
  24. sparse_matmul/layers/BUILD +146 -0
  25. sparse_matmul/layers/csr_blocksparse_matrix.h +835 -0
  26. sparse_matmul/layers/csrblocksparse_test.cc +977 -0
  27. sparse_matmul/layers/errno_mapping.cc +195 -0
  28. sparse_matmul/layers/errno_mapping.h +29 -0
  29. sparse_matmul/layers/masked_sparse_matrix.h +206 -0
  30. sparse_matmul/layers/read_array_ifstream.h +66 -0
  31. sparse_matmul/layers/sparse_linear_layer.h +365 -0
  32. sparse_matmul/layers/sparse_linear_layer_test.cc +187 -0
  33. sparse_matmul/layers/status_macros.h +34 -0
  34. sparse_matmul/layers/testdata/768_512_95_4x4_QRhat_weights.raw.gz +3 -0
  35. sparse_matmul/layers/testdata/768_512_95_4x4_What_weights.raw.gz +3 -0
  36. sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_bias.raw.gz +3 -0
  37. sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_mask.raw.gz +3 -0
  38. sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_weights.raw.gz +3 -0
  39. sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_bias.raw.gz +3 -0
  40. sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_mask.raw.gz +3 -0
  41. sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_weights.raw.gz +3 -0
  42. sparse_matmul/layers/testdata/768_512_95_4x4_finelogit_bias.raw.gz +3 -0
  43. sparse_matmul/layers/testdata/768_512_95_4x4_finelogit_mask.raw.gz +3 -0
  44. sparse_matmul/layers/testdata/768_512_95_4x4_finelogit_weights.raw.gz +3 -0
  45. sparse_matmul/layers/testdata/768_512_95_4x4_fineproj_bias.raw.gz +3 -0
  46. sparse_matmul/layers/testdata/768_512_95_4x4_fineproj_mask.raw.gz +3 -0
  47. sparse_matmul/layers/testdata/768_512_95_4x4_fineproj_weights.raw.gz +3 -0
  48. sparse_matmul/layers/testdata/768_512_95_4x4_wavernn_gru_bias.raw.gz +3 -0
  49. sparse_matmul/layers/testdata/768_512_95_4x4_wavernn_gru_mask.raw.gz +3 -0
  50. sparse_matmul/layers/testdata/768_512_95_4x4_wavernn_gru_weights.raw.gz +3 -0
BUILD ADDED
@@ -0,0 +1,44 @@
# [internal] load cc_fuzz_target.bzl
# [internal] load cc_proto_library.bzl
# [internal] load android_cc_test:def.bzl

load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")

package(default_visibility = [":__subpackages__"])

licenses(["notice"])

# To run all cc_tests in this directory:
# bazel test //:all

# [internal] Command to run dsp_util_android_test.

# [internal] Command to run lyra_integration_android_test.

exports_files(
    srcs = [
        "wavegru_mod.cc",
    ],
)

pybind_extension(
    name = "wavegru_mod",  # This name is not actually created!
    srcs = ["wavegru_mod.cc"],
    deps = [
        "//sparse_matmul",
    ],
)

py_library(
    name = "wavegru_mod",
    data = [":wavegru_mod.so"],
)

py_binary(
    name = "wavegru",
    srcs = ["wavegru.py"],
    deps = [
        ":wavegru_mod"
    ],
)
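The `pybind_extension` rule above builds `wavegru_mod.cc` into the `wavegru_mod` Python module, but that source file is not among the 50 files shown in this truncated view. The following is only a hypothetical sketch of what such a pybind11 binding could look like; the class name `WaveGRU` and its methods are assumptions for illustration, not code from this commit:

```cpp
#include <vector>

#include "pybind11/pybind11.h"
#include "pybind11/stl.h"

namespace py = pybind11;

// Hypothetical wrapper; the real wavegru_mod.cc is not visible in this view.
class WaveGRU {
 public:
  // Store sparse weights/masks extracted on the Python side.
  void load_weights(const std::vector<std::vector<float>>& weights) {
    weights_ = weights;
  }

  // Run autoregressive sampling over upsampled conditioning features.
  std::vector<int> inference(const std::vector<std::vector<float>>& ft,
                             float temperature) {
    std::vector<int> samples;
    samples.reserve(ft.size());
    // ... per-step GRU update and sampling would go here ...
    return samples;
  }

 private:
  std::vector<std::vector<float>> weights_;
};

PYBIND11_MODULE(wavegru_mod, m) {
  py::class_<WaveGRU>(m, "WaveGRU")
      .def(py::init<>())
      .def("load_weights", &WaveGRU::load_weights)
      .def("inference", &WaveGRU::inference);
}
```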
WORKSPACE ADDED
@@ -0,0 +1,154 @@
########################
# Platform Independent #
########################

load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# GoogleTest/GoogleMock framework.
git_repository(
    name = "com_google_googletest",
    remote = "https://github.com/google/googletest.git",
    tag = "release-1.10.0",
)

# Google benchmark.
http_archive(
    name = "com_github_google_benchmark",
    urls = ["https://github.com/google/benchmark/archive/bf585a2789e30585b4e3ce6baf11ef2750b54677.zip"],  # 2020-11-26T11:14:03Z
    strip_prefix = "benchmark-bf585a2789e30585b4e3ce6baf11ef2750b54677",
    sha256 = "2a778d821997df7d8646c9c59b8edb9a573a6e04c534c01892a40aa524a7b68c",
)

# proto_library, cc_proto_library, and java_proto_library rules implicitly
# depend on @com_google_protobuf for protoc and proto runtimes.
# This statement defines the @com_google_protobuf repo.
git_repository(
    name = "com_google_protobuf",
    remote = "https://github.com/protocolbuffers/protobuf.git",
    tag = "v3.15.4",
)

load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
protobuf_deps()

# Google Abseil Libs
git_repository(
    name = "com_google_absl",
    remote = "https://github.com/abseil/abseil-cpp.git",
    branch = "lts_2020_09_23",
)

# Filesystem
# The new_* prefix is used because it is not a bazel project and there is
# no BUILD file in that repo.
FILESYSTEM_BUILD = """
cc_library(
    name = "filesystem",
    hdrs = glob(["include/ghc/*"]),
    visibility = ["//visibility:public"],
)
"""

new_git_repository(
    name = "gulrak_filesystem",
    remote = "https://github.com/gulrak/filesystem.git",
    tag = "v1.3.6",
    build_file_content = FILESYSTEM_BUILD
)

# Audio DSP
git_repository(
    name = "com_google_audio_dsp",
    remote = "https://github.com/google/multichannel-audio-tools.git",
    # There are no tags for this repo, we are synced to bleeding edge.
    branch = "master",
    repo_mapping = {
        "@com_github_glog_glog": "@com_google_glog",
    },
)

http_archive(
    name = "pybind11_bazel",
    strip_prefix = "pybind11_bazel-72cbbf1fbc830e487e3012862b7b720001b70672",
    urls = ["https://github.com/pybind/pybind11_bazel/archive/72cbbf1fbc830e487e3012862b7b720001b70672.zip"],
)
# We still require the pybind library.
http_archive(
    name = "pybind11",
    build_file = "@pybind11_bazel//:pybind11.BUILD",
    strip_prefix = "pybind11-2.9.0",
    urls = ["https://github.com/pybind/pybind11/archive/v2.9.0.tar.gz"],
)
load("@pybind11_bazel//:python_configure.bzl", "python_configure")
python_configure(name = "local_config_python")

# Transitive dependencies of Audio DSP.
http_archive(
    name = "eigen_archive",
    build_file = "eigen.BUILD",
    sha256 = "f3d69ac773ecaf3602cb940040390d4e71a501bb145ca9e01ce5464cf6d4eb68",
    strip_prefix = "eigen-eigen-049af2f56331",
    urls = [
        "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz",
        "https://bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz",
    ],
)

http_archive(
    name = "fft2d",
    build_file = "fft2d.BUILD",
    sha256 = "ada7e99087c4ed477bfdf11413f2ba8db8a840ba9bbf8ac94f4f3972e2a7cec9",
    urls = [
        "http://www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz",
    ],
)

# Google logging
git_repository(
    name = "com_google_glog",
    remote = "https://github.com/google/glog.git",
    branch = "master"
)
# Dependency for glog
git_repository(
    name = "com_github_gflags_gflags",
    remote = "https://github.com/mchinen/gflags.git",
    branch = "android_linking_fix"
)

# Bazel/build rules

http_archive(
    name = "bazel_skylib",
    urls = [
        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
    ],
    sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44",
)
load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
bazel_skylib_workspace()

http_archive(
    name = "rules_android",
    sha256 = "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806",
    strip_prefix = "rules_android-0.1.1",
    urls = ["https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip"],
)

# Google Maven Repository
GMAVEN_TAG = "20180625-1"

http_archive(
    name = "gmaven_rules",
    strip_prefix = "gmaven_rules-%s" % GMAVEN_TAG,
    url = "https://github.com/bazelbuild/gmaven_rules/archive/%s.tar.gz" % GMAVEN_TAG,
)

load("@gmaven_rules//:gmaven.bzl", "gmaven_rules")

gmaven_rules()
app.py CHANGED
@@ -1,6 +1,14 @@
 import gradio as gr
+import os
+
+
+## build wavegru-cpp
+os.system("go get github.com/bazelbuild/bazelisk")
+os.system("bazelisk build wavegru_mod -c opt --copt=-march=native")
 
 from inference import load_tacotron_model, load_wavegru_net, text_to_mel, mel_to_wav
+from wavegru_cpp import load_wavegru_cpp, extract_weight_mask
+
 
 alphabet, tacotron_net, tacotron_config = load_tacotron_model(
     "./alphabet.txt", "./tacotron.toml", "./pretrained_model_ljs_500k.ckpt"
@@ -11,10 +19,13 @@ wavegru_config, wavegru_net = load_wavegru_net(
     "./wavegru.yaml", "./wavegru_vocoder_tpu_gta_preemphasis_pruning_v7_0040000.ckpt"
 )
 
+wave_cpp_weight_mask = extract_weight_mask(wavegru_net)
+wavecpp = load_wavegru_cpp(wave_cpp_weight_mask)
+
 
 def speak(text):
     mel = text_to_mel(tacotron_net, text, alphabet, tacotron_config)
-    y = mel_to_wav(wavegru_net, mel, wavegru_config)
+    y = mel_to_wav(wavegru_net, wavecpp, mel, wavegru_config)
     return 24_000, y
 
 
inference.py CHANGED
@@ -56,10 +56,10 @@ def load_wavegru_net(config_file, model_file):
     return config, net
 
 
-wavegru_inference = pax.pure(lambda net, mel: net.inference(mel, no_gru=False))
+wavegru_inference = pax.pure(lambda net, mel: net.inference(mel, no_gru=True))
 
 
-def mel_to_wav(net, mel, config):
+def mel_to_wav(net, netcpp, mel, config):
     """convert mel to wav"""
     if len(mel.shape) == 2:
         mel = mel[None]
@@ -69,10 +69,11 @@ def mel_to_wav(net, mel, config):
         [(0, 0), (pad, pad), (0, 0)],
         constant_values=np.log(config["mel_min"]),
     )
-    x = wavegru_inference(net, mel)
-    x = jax.device_get(x)
-
-    wav = librosa.mu_expand(x - 127, mu=255)
+    ft = wavegru_inference(net, mel)
+    ft = jax.device_get(ft[0])
+    wav = netcpp.inference(ft, 1.0)
+    wav = np.array(wav)
+    wav = librosa.mu_expand(wav - 127, mu=255)
     wav = librosa.effects.deemphasis(wav, coef=0.86)
     wav = wav * 2.0
     wav = wav / max(1.0, np.max(np.abs(wav)))
packages.txt CHANGED
@@ -1 +1,2 @@
-libsndfile1-dev
+libsndfile1-dev
+golang-go
sparse_matmul/BUILD ADDED
@@ -0,0 +1,22 @@
# [internal] load placeholder

licenses(["notice"])

cc_library(
    name = "sparse_matmul",
    hdrs = [
        "sparse_matmul.h",
    ],
    visibility = ["//visibility:public"],
    deps = [
        "//sparse_matmul/compute:gru_gates",
        "//sparse_matmul/layers:layer",
        "//sparse_matmul/layers:matrix",
        "//sparse_matmul/layers:utils",
        "//sparse_matmul/numerics:fast_transcendentals",
        "//sparse_matmul/numerics:types",
        "//sparse_matmul/os:coop_threads",
        "//sparse_matmul/vector:cache_aligned_vector",
    ],  # internal :sparse_matmul deps placeholder
)
sparse_matmul/compute/BUILD ADDED
@@ -0,0 +1,88 @@
# Low-level computation code, including generic and architecture-specific
# variants.

licenses(["notice"])

cc_library(
    name = "gru_gates",
    srcs = [
        "ar_inputs.h",
        "gru_gates_arm.h",
        "gru_gates_avx_fixed.h",
        "gru_gates_generic.h",
    ],
    hdrs = ["gru_gates.h"],
    visibility = [
        "//visibility:public",
    ],
    deps = [
        ":matmul",
        "//sparse_matmul/numerics:fast_transcendentals",
        "//sparse_matmul/numerics:types",
        "//sparse_matmul/vector:cache_aligned_vector",
    ],
)

cc_library(
    name = "kernels",
    srcs = [
        "kernels_arm.h",
        "kernels_avx.h",
    ],
    hdrs = [
        "kernels_generic.h",
    ],
    visibility = [
        "//sparse_matmul:__subpackages__",
    ],
    deps = [
        "//sparse_matmul/numerics:fast_transcendentals",
        "//sparse_matmul/numerics:types",
    ],
)

cc_library(
    name = "matmul",
    srcs = [
        "matmul_fixed_avx2.cc",
        "matmul_fixed_avx2.h",
        "matmul_generic.cc",
        "matmul_generic.h",
    ],
    hdrs = [
        "matmul.h",
    ],
    visibility = [
        "//sparse_matmul:__subpackages__",
    ],
    deps = [
        "//sparse_matmul/numerics:types",
        "@com_google_absl//absl/time",
    ],
)

cc_library(
    name = "thread_bounds",
    srcs = ["thread_bounds.cc"],
    hdrs = ["thread_bounds.h"],
    visibility = [
        "//sparse_matmul:__subpackages__",
    ],
    deps = [
        "@com_google_glog//:glog",
    ],
)

cc_test(
    name = "gru_gates_test",
    size = "small",
    srcs = [
        "gru_gates_test.cc",
    ],
    deps = [
        ":gru_gates",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:span",
        "@com_google_googletest//:gtest_main",
    ],
)
sparse_matmul/compute/ar_inputs.h ADDED
@@ -0,0 +1,37 @@
/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_AR_INPUTS_H_
#define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_AR_INPUTS_H_

namespace csrblocksparse {

// Possible numbers of Autoregressive inputs.
// TODO(b/188702959): Generalize to any non-negative integer value?
enum class ARInputsMode {
  // There are no autoregressive inputs. Inputs to the GRU gates are strictly
  // from the gate-recurrent matmul and other unrelated inputs.
  k0ARInputs,
  // Two autoregressive inputs, such as coarse and fine for WaveRNN.
  k2ARInputs,
  // Three autoregressive inputs, such as prev coarse and fine plus current
  // coarse for WaveRNN.
  k3ARInputs,
};

}  // namespace csrblocksparse

#endif  // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_AR_INPUTS_H_
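`ARInputsMode` is used throughout the headers below as a non-type template parameter, so the number of autoregressive inputs is fixed at compile time and the unused branches are eliminated rather than tested per sample. A minimal sketch of that pattern (the function and variable names here are illustrative, not taken from this commit):

```cpp
#include <cstdio>

// Same shape as the enum added in ar_inputs.h.
enum class ARInputsMode { k0ARInputs, k2ARInputs, k3ARInputs };

// Hypothetical accumulator: the AR contribution is compiled in (or out)
// according to the template parameter, so the k0ARInputs instantiation has
// no per-sample branch and no unused weight reads.
template <ARInputsMode kInputsMode>
float AccumulateGateInput(float conditioning, float coarse, float fine,
                          float current_coarse, const float* weights) {
  float total = conditioning;
  if (kInputsMode != ARInputsMode::k0ARInputs) {
    total += weights[0] * coarse + weights[1] * fine;
    if (kInputsMode == ARInputsMode::k3ARInputs) {
      total += weights[2] * current_coarse;
    }
  }
  return total;
}

int main() {
  const float w[3] = {0.1f, 0.2f, 0.3f};
  // Each instantiation is a separate, fully specialized function.
  std::printf("%f\n", AccumulateGateInput<ARInputsMode::k2ARInputs>(
                          0.5f, 1.0f, -1.0f, 0.0f, w));
  std::printf("%f\n", AccumulateGateInput<ARInputsMode::k3ARInputs>(
                          0.5f, 1.0f, -1.0f, 2.0f, w));
  return 0;
}
```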
sparse_matmul/compute/gru_gates.h ADDED
@@ -0,0 +1,214 @@
/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_
#define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_

#include <cstdint>
#include <vector>

// IWYU pragma: begin_exports
#include "sparse_matmul/compute/ar_inputs.h"
#include "sparse_matmul/compute/gru_gates_arm.h"
#include "sparse_matmul/compute/gru_gates_avx_fixed.h"
#include "sparse_matmul/compute/gru_gates_generic.h"
#include "sparse_matmul/compute/matmul.h"
#include "sparse_matmul/numerics/fixed_types.h"
#include "sparse_matmul/numerics/type_utils.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"
// IWYU pragma: end_exports

namespace csrblocksparse {

// The master template is really a catch-all for the unimplemented cases to
// run the generics.
template <typename GRUStateType, typename InputType, typename SampleType = void>
class GruGates : public MatmulBase {
 public:
  using SampleWeightType = float;
  static constexpr int kSIMDWidth = kGenericSIMDWidth;

  // Generic GRU function covers all uses for WaveRNN-like architectures and
  // conditioning.
  // Controlled by template parameters thus:
  // - |kInputsMode| == |k0ARInputs|: There are no autoregressive inputs so
  //   |ar_sample0|, |ar_sample1|, |ar_sample2|, |ar_01_weights|,
  //   |ar_2_weights| are ignored.
  // - |kInputsMode| == |k2ARInputs|: |ar_sample0|, |ar_sample1| are multiplied
  //   by |ar_01_weights| and added to the (conditioning) input.
  // - |kInputsMode| == |k3ARInputs|: |ar_sample2| is multiplied by
  //   |ar_2_weights| and added to the other two |ar_inputs| (and added to the
  //   conditioning input).
  // - If |kSplitGates| is true: The |*gru_recurrent_other_ptr| is secondary
  //   recurrent input that must be added to |*gru_recurrent_ptr|.
  // - |num_replicas| determines the number of duplicates of the output to be
  //   written, separated by |replica_stride|.
  // - |start|, |end| are |rows| in [0, |state_size|] to be processed by this
  //   thread.
  //
  // Previous state is read from |*gru_state_ptr| and the new state is written
  // to *(|gru_state_ptr| + i * |replica_stride| for i in [0, |num_replicas|)).
  template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
            bool kSplitGates = false>
  void GruWithARInput(int start, int end, int state_size,
                      const InputType* gru_recurrent_ptr,
                      const InputType* input_ptr, GRUStateType* gru_state_ptr,
                      const SampleType* ar_sample0 = nullptr,
                      const SampleType* ar_sample1 = nullptr,
                      const SampleWeightType* ar_01_weights = nullptr,
                      int num_replicas = 1, int replica_stride = 0,
                      const SampleType* ar_sample2 = nullptr,
                      const SampleWeightType* ar_2_weights = nullptr,
                      const InputType* gru_recurrent_other_ptr = nullptr) {
    CHECK_EQ(num_replicas, 1) << "Generic code should always have 1 replica";
    GoThroughGates<GRUStateType, InputType, SampleWeightType, SampleType,
                   kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_ptr, gru_recurrent_other_ptr,
        input_ptr, gru_state_ptr, ar_2_weights, state_size, ar_sample0,
        ar_sample1, ar_sample2);
  }

  // No AR inputs, no split gates, no batching, no replicated outputs.
  // TODO(b/188702959): Redirect conditioning GRU here, removing code from
  // gru_layer.h.
  // Copy to specializations.
  void PlainGru(int start, int end, int state_size,
                const InputType* gru_recurrent_ptr, const InputType* input_ptr,
                GRUStateType* gru_state_ptr) {
    GruWithARInput<ARInputsMode::k0ARInputs>(
        start, end, state_size, gru_recurrent_ptr, input_ptr, gru_state_ptr);
  }
};

#if defined __ARM_NEON || defined __aarch64__
// Partial specialization for float.
template <>
class GruGates<float, float, float> : public MatmulBase {
 public:
  static constexpr int kSIMDWidth = kNeonSIMDWidth;

  // Generic GRU function covers all uses for WaveRNN-like architectures and
  // conditioning.
  template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
            bool kSplitGates = false>
  void GruWithARInput(int start, int end, int state_size,
                      const float* gru_recurrent_data, const float* input_data,
                      float* gru_state_data, const float* ar_sample0 = nullptr,
                      const float* ar_sample1 = nullptr,
                      const float* ar_01_weights = nullptr,
                      int num_replicas = 1, int replica_stride = 0,
                      const float* ar_sample2 = nullptr,
                      const float* ar_2_weights = nullptr,
                      const float* gru_recurrent_other_data = nullptr) {
    DCHECK_EQ(num_replicas, 1) << "ARM code should always have 1 replica";
    GoThroughGatesFloat<kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_data, gru_recurrent_other_data,
        input_data, gru_state_data, ar_2_weights, state_size, ar_sample0,
        ar_sample1, ar_sample2);
  }
};
#endif  // defined __ARM_NEON || defined __aarch64__

// Partial specialization for fixed types. The sample weights are always float
// whatever the fixed type of the other weights.
template <int kGRUStateBits, int kInputBits, int kSampleBits>
class GruGates<fixed16<kGRUStateBits>, fixed32<kInputBits>,
               fixed16<kSampleBits>> : public MatmulBase {
 public:
#if defined __ARM_NEON || defined __aarch64__
  static constexpr int kSIMDWidth = kNeonSIMDWidth;
#elif defined __AVX2__
  static constexpr int kSIMDWidth = kAVX2SIMDWidth * 2;
#else   // Generic case.
  static constexpr int kSIMDWidth = kGenericSIMDWidth;
#endif  // __ARM_NEON || defined __aarch64__ / __AVX2__

  using GRUStateType = fixed16<kGRUStateBits>;
  using InputType = fixed32<kInputBits>;
  using SampleType = fixed16<kSampleBits>;
  using SampleWeightType = float;
  static constexpr int kInputMantissaBits = InputType::kMantissaBits;
  static constexpr int kSampleMantissaBits = SampleType::kMantissaBits;
  static constexpr int kStateMantissaBits = GRUStateType::kMantissaBits;
  // Generic GRU function covers all uses for WaveRNN-like architectures and
  // conditioning.
  template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
            bool kSplitGates = false>
  void GruWithARInput(int start, int end, int state_size,
                      const InputType* gru_recurrent_data,
                      const InputType* input_data, GRUStateType* gru_state_data,
                      const SampleType* ar_sample0 = nullptr,
                      const SampleType* ar_sample1 = nullptr,
                      const SampleWeightType* ar_01_weights = nullptr,
                      int num_replicas = 1, int replica_stride = 0,
                      const SampleType* ar_sample2 = nullptr,
                      const SampleWeightType* ar_2_weights = nullptr,
                      const InputType* gru_recurrent_other_data = nullptr) {
#if defined __ARM_NEON || defined __aarch64__ || defined __AVX2__
    const int32_t* gru_recurrent_ptr =
        reinterpret_cast<const int32_t*>(gru_recurrent_data);
    const int32_t* gru_recurrent_other_ptr =
        reinterpret_cast<const int32_t*>(gru_recurrent_other_data);
    const int32_t* input_ptr = reinterpret_cast<const int32_t*>(input_data);
    int16_t* gru_state_ptr = reinterpret_cast<int16_t*>(gru_state_data);
#if defined __AVX2__
    // The samples are fixed16, but we scale them up here and convert to float
    // so that the product with the QR weights is always on the same scale as
    // InputType, so we don't have to do any more scaling inside.
    const float sample_factor = static_cast<float>(1 << kInputMantissaBits);
#else
    const float sample_factor = 1.0f;
#endif
    // AR sample 0 and 1 are packed into a pair because the QR weights are
    // formatted with the weights interleaved for sample 0 and 1.
    std::pair<float, float> ar_sample01;
    float ar_sample2_float = 0.0f;
    if (kInputsMode == ARInputsMode::k2ARInputs ||
        kInputsMode == ARInputsMode::k3ARInputs) {
      ar_sample01 = {static_cast<float>(*ar_sample0) * sample_factor,
                     static_cast<float>(*ar_sample1) * sample_factor};
      if (kInputsMode == ARInputsMode::k3ARInputs) {
        ar_sample2_float = static_cast<float>(*ar_sample2) * sample_factor;
      }
    }
#if defined __AVX2__
    CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
    GruGatesAVXFixed<kInputMantissaBits, kStateMantissaBits, kInputsMode,
                     kSplitGates>(
        start, end, state_size, gru_recurrent_ptr, input_ptr, &ar_sample01,
        ar_01_weights, num_replicas, replica_stride, &ar_sample2_float,
        ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
#else   // ARM.
    DCHECK_EQ(num_replicas, 1) << "ARM code should always have 1 replica";
    GoThroughGatesFixed<GRUStateType, InputType, kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_ptr, gru_recurrent_other_ptr,
        input_ptr, gru_state_ptr, ar_2_weights, state_size, &ar_sample01,
        &ar_sample2_float);
#endif  // __AVX2__ / ARM.
#else   // Generic case.
    CHECK_EQ(num_replicas, 1) << "Generic code should always have 1 replica";
    GoThroughGates<GRUStateType, InputType, SampleWeightType, SampleType,
                   kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_data, gru_recurrent_other_data,
        input_data, gru_state_data, ar_2_weights, state_size, ar_sample0,
        ar_sample1, ar_sample2);
#endif  // __ARM_NEON || defined __aarch64__ / __AVX2__
  }
};

}  // namespace csrblocksparse

#endif  // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_
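For orientation, `GruGates` is meant to be called per sample over a thread's slice of the GRU state, with the reset/update/cell blocks packed back to back. The following is a rough usage sketch under assumed buffer layouts (the surrounding WaveRNN sampling loop and weight contents are not part of this commit's visible files, so sizes and values here are illustrative only):

```cpp
// Sketch only: assumes //sparse_matmul is available on the include path.
#include <vector>

#include "sparse_matmul/sparse_matmul.h"

int main() {
  constexpr int kStateSize = 8;  // Rows of GRU state handled by this call.
  csrblocksparse::GruGates<float, float, float> gru_gates;

  // |gru_recurrent| and |conditioning| each hold reset/update/cell blocks,
  // i.e. 3 * kStateSize values, as GruWithARInput() expects.
  std::vector<float> gru_recurrent(3 * kStateSize, 0.0f);
  std::vector<float> conditioning(3 * kStateSize, 0.0f);
  std::vector<float> gru_state(kStateSize, 0.0f);

  // Two AR inputs (coarse, fine) with interleaved QR weights: two weights per
  // row for each of the three gates (layout assumed for illustration).
  float coarse = 0.25f, fine = -0.5f;
  std::vector<float> qr_weights(3 * 2 * kStateSize, 0.01f);

  gru_gates.GruWithARInput<csrblocksparse::ARInputsMode::k2ARInputs>(
      /*start=*/0, /*end=*/kStateSize, kStateSize, gru_recurrent.data(),
      conditioning.data(), gru_state.data(), &coarse, &fine,
      qr_weights.data());
  return 0;
}
```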
sparse_matmul/compute/gru_gates_arm.h ADDED
@@ -0,0 +1,288 @@
/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_ARM_H_
#define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_ARM_H_

#if defined __ARM_NEON || defined __aarch64__
#include <arm_neon.h>
#endif
#include <cstdint>

#include "sparse_matmul/compute/ar_inputs.h"
#include "sparse_matmul/numerics/fast_transcendentals.h"

namespace csrblocksparse {

static constexpr int kNeonSIMDWidth = 4;

// ------ Scalar calculation --------
// See "Efficient Neural Audio Synthesis" for a description of the calculation.
// https://arxiv.org/abs/1802.08435
//
// NOTE:
// |sample| = (|coarse_at_sminus1|, |fine_at_sminus1|,
//             |coarse_at_sminus1|, |fine_at_sminus1|)
// |w_sample| = (|coarse_at_s|, |coarse_at_s|, |coarse_at_s|, |coarse_at_s|)
//
// CHEATSHEET:
// vld1q_f32 = load 4 32-bit floats
// vmulq_f32(a, b) : return a * b;
// vaddq_f32(a, b) : return a + b;
// vmlaq_f32(c, a, b) : return c + a * b;
// vpaddq_f32(a, b) : return (a0 + a1, a2 + a3, b0 + b1, b2 + b3)
// vsubq_f32(a, b) : return a - b;
// vst1q_f32 = store 4 32-bit floats
#if defined __ARM_NEON || defined __aarch64__

#if !defined __aarch64__
// Backport of vpaddq_f32 to ARM32.
inline float32x4_t vpaddq_f32(float32x4_t a, float32x4_t b) {
  float32x2_t a10 = vget_low_f32(a);
  float32x2_t a32 = vget_high_f32(a);
  float32x2_t b10 = vget_low_f32(b);
  float32x2_t b32 = vget_high_f32(b);
  return vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32));
}
#endif

template <ARInputsMode kInputsMode, bool SplitGates>
void GoThroughGatesFloat(int start, int end, const float* qr_ptr,
                         const float* gru_gates_ptr,
                         const float* gru_gates_other_ptr,
                         const float* conditioning_ptr, float* gru_h_ptr,
                         const float* w_hat, int proj_size,
                         const float* coarse_at_sminus1,
                         const float* fine_at_sminus1,
                         const float* coarse_at_s) {
  // Increment all the pointers to save on pointer arithmetic in the loop.
  conditioning_ptr += start;
  gru_h_ptr += start;
  gru_gates_ptr += start;
  if (SplitGates) {
    DCHECK_NE(gru_gates_other_ptr, nullptr);
    gru_gates_other_ptr += start;
  }
  if (kInputsMode != ARInputsMode::k0ARInputs) {
    DCHECK_NE(qr_ptr, nullptr);
    qr_ptr += 2 * start;
    DCHECK_NE(coarse_at_sminus1, nullptr);
    DCHECK_NE(fine_at_sminus1, nullptr);
    if (kInputsMode == ARInputsMode::k3ARInputs) {
      DCHECK_NE(w_hat, nullptr);
      DCHECK_NE(coarse_at_s, nullptr);
      w_hat += start;
    }
  }
  for (int i = start; i < end; i += kNeonSIMDWidth) {
    float32x4_t reset = vld1q_f32(gru_gates_ptr);
    float32x4_t update = vld1q_f32(gru_gates_ptr + proj_size);
    float32x4_t cell = vld1q_f32(gru_gates_ptr + 2 * proj_size);
    float32x4_t qr_cell;
    if (SplitGates) {
      reset = vaddq_f32(reset, vld1q_f32(gru_gates_other_ptr));
      update = vaddq_f32(update, vld1q_f32(gru_gates_other_ptr + proj_size));
      cell = vaddq_f32(cell, vld1q_f32(gru_gates_other_ptr + 2 * proj_size));
    }
    if (kInputsMode != ARInputsMode::k0ARInputs) {
      // Setup the sample vector.
      float32x4_t sample = vdupq_n_f32(*coarse_at_sminus1);
      sample = vsetq_lane_f32(*fine_at_sminus1, sample, 1);
      sample = vsetq_lane_f32(*fine_at_sminus1, sample, 3);

      // All auto types are float32x4_t, auto used to fit statements on one line
      // for readability. Do two rows of QR at once.
      auto qr_reset_0 = vmulq_f32(vld1q_f32(qr_ptr), sample);
      auto qr_reset_1 = vmulq_f32(vld1q_f32(qr_ptr + 4), sample);
      auto qr_reset = vpaddq_f32(qr_reset_0, qr_reset_1);

      auto qr_update_0 = vmulq_f32(vld1q_f32(qr_ptr + 2 * proj_size), sample);
      auto qr_update_1 =
          vmulq_f32(vld1q_f32(qr_ptr + 4 + 2 * proj_size), sample);
      auto qr_update = vpaddq_f32(qr_update_0, qr_update_1);

      auto qr_cell_0 = vmulq_f32(vld1q_f32(qr_ptr + 4 * proj_size), sample);
      auto qr_cell_1 = vmulq_f32(vld1q_f32(qr_ptr + 4 + 4 * proj_size), sample);
      qr_cell = vpaddq_f32(qr_cell_0, qr_cell_1);

      if (kInputsMode == ARInputsMode::k3ARInputs) {
        float32x4_t w_sample = vdupq_n_f32(*coarse_at_s);
        qr_reset = vmlaq_f32(qr_reset, vld1q_f32(w_hat), w_sample);
        qr_update =
            vmlaq_f32(qr_update, vld1q_f32(w_hat + proj_size), w_sample);
        qr_cell =
            vmlaq_f32(qr_cell, vld1q_f32(w_hat + 2 * proj_size), w_sample);
      }
      reset = vaddq_f32(reset, qr_reset);
      update = vaddq_f32(update, qr_update);
    }
    auto reset_conditioning = vld1q_f32(conditioning_ptr);
    auto update_conditioning = vld1q_f32(conditioning_ptr + proj_size);
    auto cell_conditioning = vld1q_f32(conditioning_ptr + 2 * proj_size);

    reset = fast_sigmoid(vaddq_f32(reset, reset_conditioning));
    update = fast_sigmoid(vaddq_f32(update, update_conditioning));
    if (kInputsMode == ARInputsMode::k0ARInputs) {
      cell = vmulq_f32(reset, cell);
    } else {
      cell = vmlaq_f32(qr_cell, reset, cell);
    }
    auto hbar = fast_tanh(vaddq_f32(cell, cell_conditioning));

    auto prev_h = vld1q_f32(gru_h_ptr);
    auto diff = vsubq_f32(prev_h, hbar);
    auto new_h = vmlaq_f32(hbar, diff, update);

    vst1q_f32(gru_h_ptr, new_h);
    // Increment all the pointers.
    conditioning_ptr += kNeonSIMDWidth;
    gru_h_ptr += kNeonSIMDWidth;
    gru_gates_ptr += kNeonSIMDWidth;
    if (SplitGates) gru_gates_other_ptr += kNeonSIMDWidth;
    if (kInputsMode != ARInputsMode::k0ARInputs) {
      qr_ptr += 2 * kNeonSIMDWidth;
      if (kInputsMode == ARInputsMode::k3ARInputs) w_hat += kNeonSIMDWidth;
    }
  }
}

// This version should only be used if all of the 32-bit fixed point
// representations have the same number of mantissa bits.
// |ar_at_sminus1| packs sample 0 and 1 into a pair because the QR weights are
// formatted with the weights interleaved for sample 0 and 1. The two samples
// represent coarse and fine for WaveRNN.
template <typename GRUStateType, typename GRUMatMulOutType,
          ARInputsMode kInputsMode, bool SplitGates>
void GoThroughGatesFixed(int start, int end, const float* qr_ptr,
                         const int32_t* gru_gates_ptr,
                         const int32_t* gru_gates_other_ptr,
                         const int32_t* conditioning_ptr, int16_t* gru_h_ptr,
                         const float* w_hat, int proj_size,
                         const std::pair<float, float>* ar_at_sminus1,
                         const float* coarse_at_s) {
  // Increment all the pointers to save on pointer arithmetic in the loop.
  conditioning_ptr += start;
  gru_h_ptr += start;
  gru_gates_ptr += start;
  if (SplitGates) {
    DCHECK_NE(gru_gates_other_ptr, nullptr);
    gru_gates_other_ptr += start;
  }
  float32x4_t sample01;
  float32x4_t w_sample;
  if (kInputsMode != ARInputsMode::k0ARInputs) {
    DCHECK_NE(qr_ptr, nullptr);
    qr_ptr += 2 * start;
    DCHECK_NE(ar_at_sminus1, nullptr);
    sample01 = vdupq_n_f32(ar_at_sminus1->first);
    sample01 = vsetq_lane_f32(ar_at_sminus1->second, sample01, 1);
    sample01 = vsetq_lane_f32(ar_at_sminus1->second, sample01, 3);
    if (kInputsMode == ARInputsMode::k3ARInputs) {
      DCHECK_NE(w_hat, nullptr);
      DCHECK_NE(coarse_at_s, nullptr);
      w_hat += start;
      w_sample = vdupq_n_f32(*coarse_at_s);
    }
  }
  for (int i = start; i < end; i += kNeonSIMDWidth) {
    auto reset = vld1q_s32(gru_gates_ptr);
    auto update = vld1q_s32(gru_gates_ptr + proj_size);
    // vcvtq_n_f32_s32 = convert 32-bit fixed point to fp32
    auto cell_int = vld1q_s32(gru_gates_ptr + 2 * proj_size);
    if (SplitGates) {
      reset = vaddq_s32(reset, vld1q_s32(gru_gates_other_ptr));
      update = vaddq_s32(update, vld1q_s32(gru_gates_other_ptr + proj_size));
      cell_int =
          vaddq_s32(cell_int, vld1q_s32(gru_gates_other_ptr + 2 * proj_size));
    }
    float32x4_t cell =
        vcvtq_n_f32_s32(cell_int, GRUMatMulOutType::kMantissaBits);
    float32x4_t qr_cell;
    if (kInputsMode != ARInputsMode::k0ARInputs) {
      // Do two rows of QR at once.
      float32x4_t qr_reset_0 = vmulq_f32(vld1q_f32(qr_ptr), sample01);
      float32x4_t qr_reset_1 = vmulq_f32(vld1q_f32(qr_ptr + 4), sample01);
      float32x4_t qr_reset = vpaddq_f32(qr_reset_0, qr_reset_1);

      float32x4_t qr_update_0 =
          vmulq_f32(vld1q_f32(qr_ptr + 2 * proj_size), sample01);
      float32x4_t qr_update_1 =
          vmulq_f32(vld1q_f32(qr_ptr + 4 + 2 * proj_size), sample01);
      float32x4_t qr_update = vpaddq_f32(qr_update_0, qr_update_1);

      float32x4_t qr_cell_0 =
          vmulq_f32(vld1q_f32(qr_ptr + 4 * proj_size), sample01);
      float32x4_t qr_cell_1 =
          vmulq_f32(vld1q_f32(qr_ptr + 4 + 4 * proj_size), sample01);
      qr_cell = vpaddq_f32(qr_cell_0, qr_cell_1);
      if (kInputsMode == ARInputsMode::k3ARInputs) {
        float32x4_t w_sample = vdupq_n_f32(*coarse_at_s);
        qr_reset = vmlaq_f32(qr_reset, vld1q_f32(w_hat), w_sample);
        qr_update =
            vmlaq_f32(qr_update, vld1q_f32(w_hat + proj_size), w_sample);
        qr_cell =
            vmlaq_f32(qr_cell, vld1q_f32(w_hat + 2 * proj_size), w_sample);
      }
      reset = vaddq_s32(
          reset, vcvtq_n_s32_f32(qr_reset, GRUMatMulOutType::kMantissaBits));
      update = vaddq_s32(
          update, vcvtq_n_s32_f32(qr_update, GRUMatMulOutType::kMantissaBits));
    }

    auto reset_conditioning = vld1q_s32(conditioning_ptr);
    auto update_conditioning = vld1q_s32(conditioning_ptr + proj_size);
    float32x4_t cell_conditioning =
        vcvtq_n_f32_s32(vld1q_s32(conditioning_ptr + 2 * proj_size),
                        GRUMatMulOutType::kMantissaBits);

    float32x4_t reset_f32 = fast_sigmoid<GRUMatMulOutType::kExponentBits>(
        vaddq_s32(reset, reset_conditioning));
    float32x4_t update_f32 = fast_sigmoid<GRUMatMulOutType::kExponentBits>(
        vaddq_s32(update, update_conditioning));
    if (kInputsMode == ARInputsMode::k0ARInputs) {
      cell = vmulq_f32(reset_f32, cell);
    } else {
      cell = vmlaq_f32(qr_cell, reset_f32, cell);
    }
    float32x4_t hbar = fast_tanh(vaddq_f32(cell, cell_conditioning));

    float32x4_t prev_h = vcvtq_n_f32_s32(vmovl_s16(vld1_s16(gru_h_ptr)),
                                         GRUStateType::kMantissaBits);
    float32x4_t diff = vsubq_f32(prev_h, hbar);
    float32x4_t new_h = vmlaq_f32(hbar, diff, update_f32);

    // vcvtq_n_s32_f32 = convert fp32 to signed 32-bit fixed point
    // vqrshrn_n_s32 = saturating, rounding, narrowing right shift - used to
    // convert a 32-bit fixed point value to a 16-bit fixed point value
    vst1_s16(gru_h_ptr,
             vqrshrn_n_s32(
                 vcvtq_n_s32_f32(new_h, GRUStateType::kMantissaBits + 16), 16));
    // Increment all the pointers.
    conditioning_ptr += kNeonSIMDWidth;
    gru_h_ptr += kNeonSIMDWidth;
    gru_gates_ptr += kNeonSIMDWidth;
    if (SplitGates) gru_gates_other_ptr += kNeonSIMDWidth;
    if (kInputsMode != ARInputsMode::k0ARInputs) {
      qr_ptr += 2 * kNeonSIMDWidth;
      if (kInputsMode == ARInputsMode::k3ARInputs) w_hat += kNeonSIMDWidth;
    }
  }
}
#endif  // defined __ARM_NEON || defined __aarch64__

}  // namespace csrblocksparse

#endif  // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_ARM_H_
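The NEON and AVX2 kernels are easier to follow against a plain scalar version of the same per-element update: sigmoid on the reset and update gates, tanh on the candidate, then interpolation with the previous state. The sketch below is a reference written for this write-up, not code from the commit; it omits the AR/QR terms, split gates, and fixed-point scaling:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Scalar sketch of the per-element GRU state update performed by the SIMD
// kernels: |gates| and |conditioning| hold reset, update, cell blocks back to
// back (3 * state_size values each); |state| is updated in place.
void ScalarGruUpdate(int state_size, const std::vector<float>& gates,
                     const std::vector<float>& conditioning,
                     std::vector<float>& state) {
  auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
  for (int i = 0; i < state_size; ++i) {
    float reset = sigmoid(gates[i] + conditioning[i]);
    float update =
        sigmoid(gates[i + state_size] + conditioning[i + state_size]);
    float cell = reset * gates[i + 2 * state_size];
    float hbar = std::tanh(cell + conditioning[i + 2 * state_size]);
    // new_h = hbar + (prev_h - hbar) * update, as in the vmlaq_f32 line above.
    state[i] = hbar + (state[i] - hbar) * update;
  }
}

int main() {
  const int kStateSize = 4;
  std::vector<float> gates(3 * kStateSize, 0.1f);
  std::vector<float> conditioning(3 * kStateSize, -0.2f);
  std::vector<float> state(kStateSize, 0.0f);
  ScalarGruUpdate(kStateSize, gates, conditioning, state);
  for (float h : state) std::printf("%f\n", h);
  return 0;
}
```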
sparse_matmul/compute/gru_gates_avx_fixed.h ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_AVX_FIXED_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_AVX_FIXED_H_
19
+
20
+ #include <cstdint>
21
+ #if defined __AVX2__
22
+ #include <immintrin.h>
23
+ #endif
24
+ #include <vector>
25
+
26
+ #include "sparse_matmul/compute/ar_inputs.h"
27
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
28
+
29
+ namespace csrblocksparse {
30
+
31
+ #if defined __AVX2__
32
+
33
+ constexpr int kAVX2SIMDWidth = 8;
34
+
35
+ // Loads 8x fixed32 from |ptr0| and adds to |input|.
36
+ // If |kTwoInputs|, also loads from |ptr1| and adds that as well.
37
+ // Returns the 2 or 3-way sum.
38
+ template <bool kTwoInputs>
39
+ inline __m256i LoadAndAddFixed32(const int32_t* ptr0, const int32_t* ptr1,
40
+ const __m256i& input) {
41
+ __m256i data0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr0));
42
+ if (kTwoInputs) {
43
+ __m256i data1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr1));
44
+ data0 = _mm256_add_epi32(data0, data1);
45
+ }
46
+ return _mm256_add_epi32(data0, input);
47
+ }
48
+
49
+ // Loads 8x fixed32 from ptr0.
50
+ // If |kTwoInputs|, also loads from |ptr1| and adds.
51
+ // Multiplies the loaded values by the factor and adds to |input|, which also
52
+ // is converted to float.
53
+ // Returns the sum.
54
+ template <bool kTwoInputs>
55
+ inline __m256 LoadMultiplyAddToFloat(const int32_t* ptr0, const int32_t* ptr1,
56
+ const __m256& float_factor,
57
+ const __m256& input) {
58
+ __m256i data0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr0));
59
+ if (kTwoInputs) {
60
+ __m256i data1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr1));
61
+ data0 = _mm256_add_epi32(data0, data1);
62
+ }
63
+ __m256 float_result = _mm256_cvtepi32_ps(data0);
64
+ float_result = _mm256_mul_ps(float_result, float_factor);
65
+ return _mm256_add_ps(float_result, input);
66
+ }
67
+
68
+ // Loads 16x float in 2x 8x registers from |ptr0_1| and multiplies by
69
+ // |input_pairs|, likewise formatted as 8x floats, alternating between the two
70
+ // AR inputs and sums each pair of results, making 8x float results.
71
+ // If |kThreeInputs|, also loads 8x float from |ptr2| and multiplies by
72
+ // |third_input|, which must be formatted as 8x float. The second product is
73
+ // added to the previous result.
74
+ // Returns the sum added to |accumulator|.
75
+ template <bool kThreeInputs>
76
+ inline __m256 MultiplyAddFloat(const __m256& input_pairs,
77
+ const __m256& third_input, const float* ptr0_1,
78
+ const float* ptr2, const __m256& accumulator) {
79
+ __m256 data_pair0 = _mm256_load_ps(ptr0_1);
80
+ __m256 data_pair1 = _mm256_load_ps(ptr0_1 + 8);
81
+ data_pair0 = _mm256_mul_ps(data_pair0, input_pairs);
82
+ data_pair1 = _mm256_mul_ps(data_pair1, input_pairs);
83
+ data_pair0 = _mm256_hadd_ps(data_pair0, data_pair1);
84
+ // Swap the middle 2 64 bit pairs to correct the hadd result.
85
+ data_pair0 = _mm256_permute4x64_pd((__m256d)data_pair0, 0xd8);
86
+ if (kThreeInputs) {
87
+ // Load 256 bits (8 x float) of data, then multiply-accumulate.
88
+ data_pair1 = _mm256_load_ps(ptr2);
89
+ data_pair1 = _mm256_mul_ps(data_pair1, third_input);
90
+ data_pair0 = _mm256_add_ps(data_pair0, data_pair1);
91
+ }
92
+ // Add conditioning.
93
+ return _mm256_add_ps(data_pair0, accumulator);
94
+ }
95
+
96
+ // Processes the tanh and the final combination, returns the new GRU state.
97
+ template <int kInputMantissaBits, int kStateMantissaBits, bool kSplitGates>
98
+ inline __m256i GRUComputeState(const __m256& cell0, const __m256& cell1,
99
+ const __m256& reset0, const __m256& reset1,
100
+ const __m256& update0, const __m256& update1,
101
+ const int32_t* gate_ptr,
102
+ const int32_t* gate_other_ptr,
103
+ const void* gru_h_ptr) {
104
+ // Multiply the cell gru output and the reset.
105
+ __m256 float_gru0 = LoadMultiplyAddToFloat<kSplitGates>(
106
+ gate_ptr, gate_other_ptr, reset0, cell0);
107
+ __m256 float_gru1 = LoadMultiplyAddToFloat<kSplitGates>(
108
+ gate_ptr + kAVX2SIMDWidth, gate_other_ptr + kAVX2SIMDWidth, reset1,
109
+ cell1);
110
+ // Compute tanh on the result.
111
+ __m256 hbar0, hbar1;
112
+ float_tanh_float<kInputMantissaBits, TM_ORDER4_FLOAT>(float_gru0, float_gru1,
113
+ hbar0, hbar1);
114
+ // Load the 16-bit previous gru state and update.
115
+ __m256i gru = _mm256_load_si256(reinterpret_cast<__m256i const*>(gru_h_ptr));
116
+ __m256 state_factor =
117
+ _mm256_set1_ps(1.0f / (static_cast<float>(1 << kStateMantissaBits)));
118
+ float_gru0 =
119
+ _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(gru)));
120
+ float_gru1 = _mm256_cvtepi32_ps(
121
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(gru, 1)));
122
+ float_gru0 = _mm256_mul_ps(float_gru0, state_factor);
123
+ float_gru1 = _mm256_mul_ps(float_gru1, state_factor);
124
+ float_gru0 = _mm256_sub_ps(float_gru0, hbar0);
125
+ float_gru1 = _mm256_sub_ps(float_gru1, hbar1);
126
+ float_gru0 = _mm256_mul_ps(float_gru0, update0);
127
+ float_gru1 = _mm256_mul_ps(float_gru1, update1);
128
+ state_factor = _mm256_set1_ps(static_cast<float>(1 << kStateMantissaBits));
129
+ float_gru0 = _mm256_add_ps(float_gru0, hbar0);
130
+ float_gru1 = _mm256_add_ps(float_gru1, hbar1);
131
+ float_gru0 = _mm256_mul_ps(float_gru0, state_factor);
132
+ float_gru1 = _mm256_mul_ps(float_gru1, state_factor);
133
+ return PackFloatsToFixed16(float_gru0, float_gru1);
134
+ }
135
+
136
+ // According to |kInputsMode|, processes 0, 2 or 3 autoregressive inputs and
137
+ // combines with |input| and |gates*|.
138
+ // With 2 AR inputs, loads 8x pairs of float from |pair_weights| and multiplies
139
+ // by |paired_ar|, likewise formatted as 8x float, but scaled such that the
140
+ // product with pair_weights is on the same scale as |*input| and |*gates0|,
141
+ // and sums each pair result, making 8x float results.
142
+ // If 3 AR inputs, also loads 8x float from |third_weights| and multiplies by
143
+ // |third_ar|, which must be formatted as 8x scaled floats. The second product
144
+ // is added to the previous result.
145
+ // Inputs, 8x fixed32 are loaded from |input|, and added to the total.
146
+ // Finally 8x fixed32 from |gates0| (and |gates1| if |kTwoGates|) are added as
147
+ // well.
148
+ // Returns the total sum as a float, but on the scale of |*input|.
149
+ template <bool kTwoGates, ARInputsMode kInputsMode>
150
+ inline __m256 GruInput32ToFloat(const __m256& paired_ar,
151
+ const __m256& third_ar,
152
+ const float* pair_weights,
153
+ const float* third_weights,
154
+ const int32_t* gates0, const int32_t* gates1,
155
+ const int32_t* input) {
156
+ __m256i data32 = _mm256_load_si256(reinterpret_cast<__m256i const*>(input));
157
+ data32 = LoadAndAddFixed32<kTwoGates>(gates0, gates1, data32);
158
+ __m256 float_data = _mm256_cvtepi32_ps(data32);
159
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
160
+ float_data = MultiplyAddFloat<kInputsMode == ARInputsMode::k3ARInputs>(
161
+ paired_ar, third_ar, pair_weights, third_weights, float_data);
162
+ }
163
+ return float_data;
164
+ }
165
+
166
+ // Generic GRU gates function controlled by template parameters thus:
167
+ // - |kInputBits|: the mantissa bits in |*input_ptr|, |*gru_recurrent_ptr|.
168
+ // - |kStateBits|: the mantissa_bits in |*gru_state_ptr|.
169
+ // - |kInputsMode == |k0ARInputs|: There are no autoregressive inputs so
170
+ // |ar_sample, |ar_sample1|, |ar_sample2|, |ar_01_weights|, |ar_2_weights| are
171
+ // ignored.
172
+ // - |kInputsMode| == |k2ARInputs|: |ar_sample0|, |ar_sample1| are multiplied by
173
+ // |ar_01_weights| and added to the (conditioning) input.
174
+ // - |kInputsMode| == |k3ARInputs|: |ar_sample2| is multiplied by |ar_2_weights|
175
+ // and added to the other two AR inputs (and added to the conditioning input).
176
+ // - |kReplicas| determines the number of duplicates of the output to be
177
+ // written, separated by |replica_stride|. If zero, then the number of
178
+ // replicas is variable and taken from the |replicas| argument.
179
+ // - If |kSplitGates| is true: The |*gru_recurrent_other_ptr| is secondary
180
+ // recurrent input that must be added to |*gru_recurrent_ptr|.
181
+ // - |start|, |end| are |rows| in [0, |state_size|] to be processed by this
182
+ // thread.
183
+ //
184
+ // Previous state is read from |*gru_state_ptr| and the new state is written to
185
+ // *(|gru_state_ptr| + i * |replica_stride| for i in [0, |kReplicas|]).
186
+ template <int kInputBits, int kStateBits,
187
+ ARInputsMode kInputsMode = ARInputsMode::k0ARInputs,
188
+ int kReplicas = 1, bool kSplitGates = false>
189
+ inline void GruGatesTemplate(
190
+ int start, int end, int state_size, int replicas, int replica_stride,
191
+ const int32_t* gru_recurrent_ptr, const int32_t* input_ptr,
192
+ const std::pair<float, float>* ar_sample01, const float* ar_01_weights,
193
+ const float* ar_sample2, const float* ar_2_weights,
194
+ const int32_t* gru_recurrent_other_ptr, int16_t* gru_state_ptr) {
195
+ constexpr int kQRIncrement = kAVX2SIMDWidth;
196
+ // Increment all the pointers to save on pointer arithmetic in the loop.
197
+ input_ptr += start;
198
+ gru_state_ptr += start;
199
+ gru_recurrent_ptr += start;
200
+ if (kSplitGates) gru_recurrent_other_ptr += start;
201
+ __m256 ar_2_inputs, ar_3rd_input;
202
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
203
+ ar_01_weights += 2 * start;
204
+ ar_2_inputs = _mm256_castsi256_ps(
205
+ _mm256_set1_epi64x(*reinterpret_cast<const int64_t*>(ar_sample01)));
206
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
207
+ ar_2_weights += start;
208
+ ar_3rd_input = _mm256_set1_ps(*ar_sample2);
209
+ } else {
210
+ ar_3rd_input = {};
211
+ }
212
+ } else {
213
+ ar_2_inputs = {};
214
+ ar_3rd_input = {};
215
+ }
216
+ // The transcendentals handle 2x registers of data at once, so we have to do
217
+ // everything in duplicate.
218
+ for (int i = start; i < end; i += kQRIncrement * 2) {
219
+ // Load 8 pairs of fixed16s for each of reset, update and cell.
220
+ __m256 reset0 = GruInput32ToFloat<kSplitGates, kInputsMode>(
221
+ ar_2_inputs, ar_3rd_input, ar_01_weights, ar_2_weights,
222
+ gru_recurrent_ptr, gru_recurrent_other_ptr, input_ptr);
223
+ __m256 reset1 = GruInput32ToFloat<kSplitGates, kInputsMode>(
224
+ ar_2_inputs, ar_3rd_input, ar_01_weights + 2 * kQRIncrement,
225
+ ar_2_weights + kQRIncrement, gru_recurrent_ptr + kAVX2SIMDWidth,
226
+ gru_recurrent_other_ptr + kAVX2SIMDWidth, input_ptr + kAVX2SIMDWidth);
227
+ float_sigmoid_float<kInputBits>(reset0, reset1);
228
+ __m256 update0 = GruInput32ToFloat<kSplitGates, kInputsMode>(
229
+ ar_2_inputs, ar_3rd_input, ar_01_weights + 2 * state_size,
230
+ ar_2_weights + state_size, gru_recurrent_ptr + state_size,
231
+ gru_recurrent_other_ptr + state_size, input_ptr + state_size);
232
+ __m256 update1 = GruInput32ToFloat<kSplitGates, kInputsMode>(
233
+ ar_2_inputs, ar_3rd_input,
234
+ ar_01_weights + 2 * state_size + 2 * kQRIncrement,
235
+ ar_2_weights + state_size + kQRIncrement,
236
+ gru_recurrent_ptr + state_size + kAVX2SIMDWidth,
237
+ gru_recurrent_other_ptr + state_size + kAVX2SIMDWidth,
238
+ input_ptr + state_size + kAVX2SIMDWidth);
239
+ float_sigmoid_float<kInputBits>(update0, update1);
240
+ __m256 cell0 = _mm256_cvtepi32_ps(_mm256_load_si256(
241
+ reinterpret_cast<__m256i const*>(input_ptr + 2 * state_size)));
242
+ __m256 cell1 =
243
+ _mm256_cvtepi32_ps(_mm256_load_si256(reinterpret_cast<__m256i const*>(
244
+ input_ptr + 2 * state_size + kAVX2SIMDWidth)));
245
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
246
+ cell0 = MultiplyAddFloat<kInputsMode == ARInputsMode::k3ARInputs>(
247
+ ar_2_inputs, ar_3rd_input, ar_01_weights + 4 * state_size,
248
+ ar_2_weights + 2 * state_size, cell0);
249
+ cell1 = MultiplyAddFloat<kInputsMode == ARInputsMode::k3ARInputs>(
250
+ ar_2_inputs, ar_3rd_input,
251
+ ar_01_weights + 4 * state_size + 2 * kQRIncrement,
252
+ ar_2_weights + 2 * state_size + kQRIncrement, cell1);
253
+ }
254
+ __m256i gru_state = GRUComputeState<kInputBits, kStateBits, kSplitGates>(
255
+ cell0, cell1, reset0, reset1, update0, update1,
256
+ gru_recurrent_ptr + 2 * state_size,
257
+ gru_recurrent_other_ptr + 2 * state_size, gru_state_ptr);
258
+ if (kReplicas > 0) {
259
+ // With |kReplicas| a template parameter, the compiler will unroll the
260
+ // loop.
261
+ for (int j = 0; j < kReplicas; ++j) {
262
+ _mm256_store_si256(
263
+ reinterpret_cast<__m256i*>(gru_state_ptr + j * replica_stride),
264
+ gru_state);
265
+ }
266
+ } else {
267
+ // This loop will not unroll as replicas is variable.
268
+ for (int j = 0; j < replicas; ++j) {
269
+ _mm256_store_si256(
270
+ reinterpret_cast<__m256i*>(gru_state_ptr + j * replica_stride),
271
+ gru_state);
272
+ }
273
+ }
274
+ // Increment all the pointers.
275
+ input_ptr += 2 * kAVX2SIMDWidth;
276
+ gru_state_ptr += 2 * kAVX2SIMDWidth;
277
+ gru_recurrent_ptr += 2 * kAVX2SIMDWidth;
278
+ if (kSplitGates) gru_recurrent_other_ptr += 2 * kAVX2SIMDWidth;
279
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
280
+ ar_01_weights += 4 * kQRIncrement;
281
+ if (kInputsMode == ARInputsMode::k3ARInputs)
282
+ ar_2_weights += 2 * kQRIncrement;
283
+ }
284
+ }
285
+ }
286
+
287
+ // Dispatches calls to the GruGatesTemplate function above converting the
288
+ // replicas variable argument to a template parameter to allow the compiler to
289
+ // unroll the write loop.
290
+ // |ar_sample01| packs sample 0 and 1 into a pair because the QR weights are
291
+ // formatted with the weights interleaved for sample 0 and 1. The two samples
292
+ // represent coarse and fine for WaveRNN.
293
+ template <int kInputBits, int kStateBits,
294
+ ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
295
+ bool kSplitGates = false>
296
+ inline void GruGatesAVXFixed(
297
+ int start, int end, int state_size, const int32_t* gru_recurrent_ptr,
298
+ const int32_t* input_ptr, const std::pair<float, float>* ar_sample01,
299
+ const float* ar_01_weights, int num_replicas, int replica_stride,
300
+ const float* ar_sample2, const float* ar_2_weights,
301
+ const int32_t* gru_recurrent_other_ptr, int16_t* gru_state_ptr) {
302
+ // Convert the number of replicas from a variable to a template parameter
303
+ // with a switch. This enables the compiler to unroll the loop for
304
+ // the write, making it faster for common numbers of threads.
305
+ switch (num_replicas) {
306
+ case 1:
307
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/1,
308
+ kSplitGates>(
309
+ start, end, state_size, num_replicas, replica_stride,
310
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
311
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
312
+ break;
313
+ case 2:
314
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/2,
315
+ kSplitGates>(
316
+ start, end, state_size, num_replicas, replica_stride,
317
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
318
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
319
+ break;
320
+ case 4:
321
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/4,
322
+ kSplitGates>(
323
+ start, end, state_size, num_replicas, replica_stride,
324
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
325
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
326
+ break;
327
+ case 6:
328
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/6,
329
+ kSplitGates>(
330
+ start, end, state_size, num_replicas, replica_stride,
331
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
332
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
333
+ break;
334
+ default:
335
+ // Zero |kReplicas| tells the function to use the |num_replicas| variable.
336
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/0,
337
+ kSplitGates>(
338
+ start, end, state_size, num_replicas, replica_stride,
339
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
340
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
341
+ }
342
+ }
343
+
344
+ #endif // __AVX2__
345
+
346
+ } // namespace csrblocksparse
347
+
348
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_AVX_FIXED_H_
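Editor's note on the dispatch pattern above: GruGatesAVXFixed turns the runtime |num_replicas| into the template parameter |kReplicas| purely so the compiler can unroll the state-replication store loop. Below is a minimal standalone sketch of that idiom; the names Dispatch and StoreReplicas are hypothetical and not part of this commit.

template <int kReplicas>
void StoreReplicas(int num_replicas, int stride, int value, int* dst) {
  if (kReplicas > 0) {
    // kReplicas is a compile-time constant, so this loop fully unrolls.
    for (int j = 0; j < kReplicas; ++j) dst[j * stride] = value;
  } else {
    // kReplicas == 0 means "unknown at compile time": fall back to the
    // runtime count, with no unrolling guarantee.
    for (int j = 0; j < num_replicas; ++j) dst[j * stride] = value;
  }
}

void Dispatch(int num_replicas, int stride, int value, int* dst) {
  switch (num_replicas) {
    case 1: StoreReplicas<1>(num_replicas, stride, value, dst); break;
    case 2: StoreReplicas<2>(num_replicas, stride, value, dst); break;
    case 4: StoreReplicas<4>(num_replicas, stride, value, dst); break;
    default: StoreReplicas<0>(num_replicas, stride, value, dst); break;
  }
}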
sparse_matmul/compute/gru_gates_generic.h ADDED
@@ -0,0 +1,97 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_GENERIC_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_GENERIC_H_
19
+
20
+ #include "sparse_matmul/compute/ar_inputs.h"
21
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
22
+
23
+ namespace csrblocksparse {
24
+
25
+ constexpr int kGenericSIMDWidth = 4;
26
+
27
+ // TODO(b/188702959): Rename arguments to match gru_gates.h.
28
+ template <typename GRUStateType, typename GRUMatMulOutType, typename QR_W_Type,
29
+ typename SampleType, ARInputsMode kInputsMode,
30
+ bool SplitGates = false>
31
+ void GoThroughGates(int start, int end, const QR_W_Type* qr_ptr,
32
+ const GRUMatMulOutType* gru_gates_ptr,
33
+ const GRUMatMulOutType* gru_gates_other_ptr,
34
+ const GRUMatMulOutType* conditioning_ptr,
35
+ GRUStateType* gru_h_ptr, const QR_W_Type* w_hat,
36
+ int proj_size, const SampleType* coarse_at_sminus1,
37
+ const SampleType* fine_at_sminus1,
38
+ const SampleType* coarse_at_s = nullptr) {
39
+ float qr_cell = 0.0f, reset, update, cell;
40
+ for (int i = start; i < end; ++i) {
41
+ if (kInputsMode == ARInputsMode::k0ARInputs) {
42
+ reset = static_cast<float>(gru_gates_ptr[i]);
43
+ update = static_cast<float>(gru_gates_ptr[proj_size + i]);
44
+ } else {
45
+ float qr_c_reset = static_cast<float>(qr_ptr[2 * i + 0]);
46
+ float qr_f_reset = static_cast<float>(qr_ptr[2 * i + 1]);
47
+ float qr_c_update = static_cast<float>(qr_ptr[2 * proj_size + 2 * i + 0]);
48
+ float qr_f_update = static_cast<float>(qr_ptr[2 * proj_size + 2 * i + 1]);
49
+ float qr_c_cell = static_cast<float>(qr_ptr[4 * proj_size + 2 * i + 0]);
50
+ float qr_f_cell = static_cast<float>(qr_ptr[4 * proj_size + 2 * i + 1]);
51
+ float w_hat_i_reset = 0.0f;
52
+ float w_hat_i_update = 0.0f;
53
+ float w_hat_i_cell = 0.0f;
54
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
55
+ w_hat_i_reset = static_cast<float>(w_hat[i]);
56
+ w_hat_i_update = static_cast<float>(w_hat[proj_size + i]);
57
+ w_hat_i_cell = static_cast<float>(w_hat[2 * proj_size + i]);
58
+ }
59
+ float coarse = static_cast<float>(coarse_at_sminus1[0]);
60
+ float fine = static_cast<float>(fine_at_sminus1[0]);
61
+ reset = qr_c_reset * coarse + qr_f_reset * fine;
62
+ update = qr_c_update * coarse + qr_f_update * fine;
63
+ qr_cell = qr_c_cell * coarse + qr_f_cell * fine;
64
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
65
+ float coarse = static_cast<float>(coarse_at_s[0]);
66
+ reset += w_hat_i_reset * coarse;
67
+ update += w_hat_i_update * coarse;
68
+ qr_cell += w_hat_i_cell * coarse;
69
+ }
70
+ reset += static_cast<float>(gru_gates_ptr[i]);
71
+ update += static_cast<float>(gru_gates_ptr[proj_size + i]);
72
+ }
73
+ cell = static_cast<float>(gru_gates_ptr[2 * proj_size + i]);
74
+ if (SplitGates) {
75
+ reset += static_cast<float>(gru_gates_other_ptr[i]);
76
+ update += static_cast<float>(gru_gates_other_ptr[proj_size + i]);
77
+ cell += static_cast<float>(gru_gates_other_ptr[2 * proj_size + i]);
78
+ }
79
+ float reset_conditioning = static_cast<float>(conditioning_ptr[i]);
80
+ float update_conditioning =
81
+ static_cast<float>(conditioning_ptr[proj_size + i]);
82
+ float cell_conditioning =
83
+ static_cast<float>(conditioning_ptr[2 * proj_size + i]);
84
+ reset = fast_sigmoid(reset + reset_conditioning);
85
+ update = fast_sigmoid(update + update_conditioning);
86
+ float hbar = fast_tanh(qr_cell + reset * cell + cell_conditioning);
87
+ int h_index = i;
88
+ float prev_h = static_cast<float>(gru_h_ptr[h_index]);
89
+ float diff = prev_h - hbar;
90
+ float new_h = hbar + diff * update;
91
+ gru_h_ptr[h_index] = static_cast<GRUStateType>(new_h);
92
+ }
93
+ }
94
+
95
+ } // namespace csrblocksparse
96
+
97
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_GENERIC_H_
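For reference, the per-element arithmetic in GoThroughGates reduces to the standard GRU update sketched below. This is a float-only illustration with a hypothetical helper name (GruElement) and <cmath> transcendentals in place of the library's fast_sigmoid/fast_tanh; it is not part of this commit.

#include <cmath>

// One GRU element. |reset_sum|, |update_sum| and |cell_sum| are the matmul
// outputs (plus any AR-sample contributions), the *_cond values come from the
// conditioning input, and |prev_h| is the previous state element.
float GruElement(float reset_sum, float update_sum, float cell_sum,
                 float qr_cell, float reset_cond, float update_cond,
                 float cell_cond, float prev_h) {
  auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
  float reset = sigmoid(reset_sum + reset_cond);
  float update = sigmoid(update_sum + update_cond);
  float hbar = std::tanh(qr_cell + reset * cell_sum + cell_cond);
  // Interpolate between the candidate state and the previous state.
  return hbar + (prev_h - hbar) * update;
}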
sparse_matmul/compute/gru_gates_test.cc ADDED
@@ -0,0 +1,164 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/gru_gates.h"
16
+
17
+ #include <cstdint>
18
+ #include <cstring>
19
+ #include <numeric>
20
+
21
+ #include "absl/memory/memory.h"
22
+ #include "absl/types/span.h"
23
+ #include "gmock/gmock.h"
24
+ #include "gtest/gtest.h"
25
+
26
+ namespace {
27
+
28
+ using csrblocksparse::ARInputsMode;
29
+
30
+ template <typename GRUStateType, typename InputType, typename SampleType = void,
31
+ csrblocksparse::ARInputsMode kInputsMode, bool kSplitGates>
32
+ csrblocksparse::CacheAlignedVector<GRUStateType> TestGruGates() {
33
+ using SampleWeightType = float;
34
+ constexpr int kStateSize = 16;
35
+ csrblocksparse::CacheAlignedVector<SampleWeightType> qr(6 * kStateSize);
36
+ csrblocksparse::CacheAlignedVector<SampleWeightType> w(3 * kStateSize);
37
+ csrblocksparse::CacheAlignedVector<InputType> gru_gates(3 * kStateSize);
38
+ csrblocksparse::CacheAlignedVector<InputType> gru_other_gates(3 * kStateSize);
39
+ csrblocksparse::CacheAlignedVector<InputType> conditioning(3 * kStateSize);
40
+ csrblocksparse::CacheAlignedVector<GRUStateType> gru_h(kStateSize);
41
+ csrblocksparse::GruGates<GRUStateType, InputType, SampleType> gru_gates_impl;
42
+ const SampleType kCoarseAtSMinus1(0.03f);
43
+ const SampleType kFineAtSMinus1(0.07f);
44
+ const SampleType kCoarseAtS(-0.02f);
45
+
46
+ qr.FillOnes();
47
+ w.FillOnes();
48
+ gru_gates.FillRandom();
49
+ gru_other_gates.FillRandom();
50
+ conditioning.FillRandom();
51
+ gru_h.FillZero();
52
+
53
+ gru_gates_impl.template GruWithARInput<kInputsMode, kSplitGates>(
54
+ /*start=*/0, /*end=*/kStateSize, kStateSize, gru_gates.data(),
55
+ conditioning.data(), gru_h.data(), &kCoarseAtSMinus1, &kFineAtSMinus1,
56
+ qr.data(),
57
+ /*num_replicas=*/1, /*replica_stride=*/0, &kCoarseAtS, w.data(),
58
+ gru_other_gates.data());
59
+ return gru_h;
60
+ }
61
+
62
+ TEST(GruGates, FloatWaveRNNCoarseMatchesGolden) {
63
+ // If the RNG in csrblocksparse::CacheAlignedVector changes, these numbers
64
+ // will also need to change.
65
+ const std::vector<float> kGoldenValues = {
66
+ 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.746f, 0.0f, 0.0f,
67
+ 0.0f, 0.0f, 0.970f, 0.0f, 0.0f, 1.0f, 0.0f, -0.993f};
68
+ csrblocksparse::CacheAlignedVector<float> gru_h =
69
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
70
+ /*kSplitGates=*/true>();
71
+
72
+ ASSERT_EQ(kGoldenValues.size(), gru_h.size());
73
+ for (int i = 0; i < gru_h.size(); ++i) {
74
+ EXPECT_NEAR(kGoldenValues[i], gru_h[i], 1e-3) << "i=" << i;
75
+ }
76
+ }
77
+
78
+ TEST(GruGates, FloatWaveRNNFineMatchesGolden) {
79
+ // If the RNG in csrblocksparse::CacheAlignedVector changes, these numbers
80
+ // will also need to change.
81
+ const std::vector<float> kGoldenValues = {
82
+ 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.737f, 0.0f, 0.0f,
83
+ 0.0f, 0.0f, 0.969f, 0.0f, 0.0f, 1.0f, 0.0f, -0.994f};
84
+ csrblocksparse::CacheAlignedVector<float> gru_h =
85
+ TestGruGates<float, float, float, ARInputsMode::k3ARInputs,
86
+ /*kSplitGates=*/true>();
87
+
88
+ ASSERT_EQ(kGoldenValues.size(), gru_h.size());
89
+ for (int i = 0; i < gru_h.size(); ++i) {
90
+ EXPECT_NEAR(kGoldenValues[i], gru_h[i], 1e-3) << "i=" << i;
91
+ }
92
+ }
93
+
94
+ TEST(GruGates, FloatTwoArInputsNonSplitGateMatchesGolden) {
95
+ // If the RNG in csrblocksparse::CacheAlignedVector changes, these numbers
96
+ // will also need to change.
97
+ const std::vector<float> kGoldenValues = {
98
+ 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.714f, 0.0f, -0.002f,
99
+ 0.0f, 0.0f, 0.970f, 0.0f, 0.0f, 1.0f, 0.0f, -0.965f};
100
+ csrblocksparse::CacheAlignedVector<float> gru_h =
101
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
102
+ /*kSplitGates=*/false>();
103
+
104
+ ASSERT_EQ(kGoldenValues.size(), gru_h.size());
105
+ for (int i = 0; i < gru_h.size(); ++i) {
106
+ EXPECT_NEAR(kGoldenValues[i], gru_h[i], 1e-3) << "i=" << i;
107
+ }
108
+ }
109
+
110
+ TEST(GruGates, FixedWaveRNNCoarseMatchesFloat) {
111
+ using GRUMatMulOutType = csrblocksparse::fixed32<11>;
112
+ using GRUStateType = csrblocksparse::fixed16<2>;
113
+ using SampleType = csrblocksparse::fixed16<0>;
114
+ csrblocksparse::CacheAlignedVector<float> float_gru_h =
115
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
116
+ /*kSplitGates=*/true>();
117
+ csrblocksparse::CacheAlignedVector<GRUStateType> fixed_gru_h =
118
+ TestGruGates<GRUStateType, GRUMatMulOutType, SampleType,
119
+ ARInputsMode::k2ARInputs, /*kSplitGates=*/true>();
120
+
121
+ ASSERT_EQ(float_gru_h.size(), fixed_gru_h.size());
122
+ for (int i = 0; i < fixed_gru_h.size(); ++i) {
123
+ EXPECT_NEAR(float_gru_h[i], static_cast<float>(fixed_gru_h[i]), 1e-3)
124
+ << "i=" << i;
125
+ }
126
+ }
127
+
128
+ TEST(GruGates, FixedWaveRNNFineMatchesFloat) {
129
+ using GRUMatMulOutType = csrblocksparse::fixed32<11>;
130
+ using GRUStateType = csrblocksparse::fixed16<2>;
131
+ using SampleType = csrblocksparse::fixed16<0>;
132
+ csrblocksparse::CacheAlignedVector<float> float_gru_h =
133
+ TestGruGates<float, float, float, ARInputsMode::k3ARInputs,
134
+ /*kSplitGates=*/true>();
135
+ csrblocksparse::CacheAlignedVector<GRUStateType> fixed_gru_h =
136
+ TestGruGates<GRUStateType, GRUMatMulOutType, SampleType,
137
+ ARInputsMode::k3ARInputs, /*kSplitGates=*/true>();
138
+
139
+ ASSERT_EQ(float_gru_h.size(), fixed_gru_h.size());
140
+ for (int i = 0; i < fixed_gru_h.size(); ++i) {
141
+ EXPECT_NEAR(float_gru_h[i], static_cast<float>(fixed_gru_h[i]), 1e-3)
142
+ << "i=" << i;
143
+ }
144
+ }
145
+
146
+ TEST(GruGates, FixedTwoArInputsNonSplitGateMatchesFloat) {
147
+ using GRUMatMulOutType = csrblocksparse::fixed32<11>;
148
+ using GRUStateType = csrblocksparse::fixed16<2>;
149
+ using SampleType = csrblocksparse::fixed16<0>;
150
+ csrblocksparse::CacheAlignedVector<float> float_gru_h =
151
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
152
+ /*kSplitGates=*/false>();
153
+ csrblocksparse::CacheAlignedVector<GRUStateType> fixed_gru_h =
154
+ TestGruGates<GRUStateType, GRUMatMulOutType, SampleType,
155
+ ARInputsMode::k2ARInputs, /*kSplitGates=*/false>();
156
+
157
+ ASSERT_EQ(float_gru_h.size(), fixed_gru_h.size());
158
+ for (int i = 0; i < fixed_gru_h.size(); ++i) {
159
+ EXPECT_NEAR(float_gru_h[i], static_cast<float>(fixed_gru_h[i]), 1e-3)
160
+ << "i=" << i;
161
+ }
162
+ }
163
+
164
+ } // namespace
sparse_matmul/compute/kernels_arm.h ADDED
The diff for this file is too large to render. See raw diff
 
sparse_matmul/compute/kernels_avx.h ADDED
@@ -0,0 +1,601 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_AVX_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_AVX_H_
19
+
20
+ #if defined __AVX__
21
+ #include <immintrin.h>
22
+
23
+ #include <algorithm>
24
+ #include <type_traits>
25
+ // TODO(b/188702959): Remove fast_transcendentals with GRU refactor.
26
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
27
+ #include "sparse_matmul/numerics/fixed_types.h"
28
+ #include "sparse_matmul/numerics/float16_types.h"
29
+ #include "sparse_matmul/numerics/type_utils.h"
30
+
31
+ namespace csrblocksparse {
32
+ namespace detail {
33
+
34
+ template <typename WeightType, typename RhsType, typename OutType>
35
+ struct IsAllowableFloatTypes
36
+ : std::integral_constant<bool, std::is_same<WeightType, float>::value &&
37
+ std::is_same<RhsType, float>::value &&
38
+ std::is_same<OutType, float>::value> {};
39
+
40
+ #if defined __AVX2__
41
+ // 16-bit inputs, 32-bit output exponent matches sum of input exponents
42
+ // OR
43
+ // 16-bit inputs, 16-bit output - will shift to match exponent
44
+ template <typename WeightType, typename RhsType, typename OutType>
45
+ struct IsAllowableFixedTypes
46
+ : std::integral_constant<bool, (IsFixed16Type<WeightType>::value &&
47
+ IsFixed16Type<RhsType>::value) &&
48
+ (IsFixed32Type<OutType>::value ||
49
+ IsFixed16Type<OutType>::value)> {};
50
+
51
+ template <typename WeightType, typename RhsType, typename OutType>
52
+ struct ShouldEnableGenericKernel
53
+ : std::integral_constant<
54
+ bool,
55
+ !IsAllowableFloatTypes<WeightType, RhsType, OutType>::value &&
56
+ !IsAllowableFixedTypes<WeightType, RhsType, OutType>::value> {};
57
+
58
+ template <typename Type>
59
+ struct IsAddableFixedTypes
60
+ : std::integral_constant<bool, IsFixed32Type<Type>::value ||
61
+ IsFixed16Type<Type>::value> {};
62
+ template <typename Type>
63
+ struct ShouldEnableGenericAdd
64
+ : std::integral_constant<bool, !IsAddableFixedTypes<Type>::value> {};
65
+
66
+ #else // No AVX2.
67
+
68
+ template <typename WeightType, typename RhsType, typename OutType>
69
+ struct ShouldEnableGenericKernel
70
+ : std::integral_constant<
71
+ bool, !IsAllowableFloatTypes<WeightType, RhsType, OutType>::value> {};
72
+
73
+ template <typename Type>
74
+ struct ShouldEnableGenericAdd : std::true_type {};
75
+ #endif // __AVX2__
76
+
77
+ template <typename WeightType, typename RhsType, typename OutType>
78
+ struct ShouldEnableGenericSpMV_4x4
79
+ : ShouldEnableGenericKernel<WeightType, RhsType, OutType> {};
80
+ template <typename WeightType, typename RhsType, typename OutType>
81
+ struct ShouldEnableGenericSpMM5_4x4
82
+ : ShouldEnableGenericKernel<WeightType, RhsType, OutType> {};
83
+ template <typename WeightType, typename RhsType, typename OutType>
84
+ struct ShouldEnableGenericSpMV_1x1 : std::true_type {};
85
+ template <typename WeightType, typename RhsType, typename OutType>
86
+ struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};
87
+
88
+ // The computational routines do NO error checking for speed. It is assumed
89
+ // that this has been handled by CSRBlockSparseMatrix.
90
+
91
+ // In-line function to extract results from a pair of registers and store in
92
+ // memory. Note that the non-const references are registers, and are modified
93
+ // by this function!
94
+ inline void Extract4Results(bool relu, __m256& sum1, __m256& sum2,
95
+ float** out_ptr) {
96
+ // Horizontally add the results. We have 2 registers, |sum1| and |sum2| that
97
+ // each contain 2 sets of 4 values that need to be added.
98
+ sum1 = _mm256_hadd_ps(sum1, sum2);
99
+ sum1 = _mm256_hadd_ps(sum1, sum1);
100
+ // Now |sum1| contains [|res0|, |res2|, |res0|, |res2|, |res1|, |res3|,
101
+ // |res1|, |res3|]
102
+ if (relu) {
103
+ sum1 = _mm256_max_ps(sum1, _mm256_setzero_ps());
104
+ }
105
+ // It is really hard in AVX to cross the 128 bit 'lanes' and this is the
106
+ // *only* way to do it.
107
+ // Get the top half of |sum1| in to bottom of |sum2|.
108
+ sum2 = _mm256_permute2f128_ps(sum1, sum1, 1);
109
+ // Interleave the values between the two registers.
110
+ sum1 = _mm256_unpacklo_ps(sum1, sum2);
111
+ // Save the lower 128 bits (4 floats).
112
+ __m128 result = _mm256_extractf128_ps(sum1, 0);
113
+ _mm_store_ps(*out_ptr, result);
114
+ *out_ptr += 4;
115
+ }
116
+
117
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
118
+ // blocked pattern, x is a vector and b is vector. Weights are stored for this
119
+ // routine by making each 4x4 block contiguous. Blocks are ordered in standard
120
+ // row-major format. column indices are converted to deltas and then multiplied
121
+ // by 2 to convert to bytes, so that the value can be used directly to offset
122
+ // the pointer into the rhs vector.
123
+ //
124
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
125
+ // this function. This is automatically taken care of in SparseLinearLayer.
126
+ // The bias is reconstructed through horizontal additions, which leads to a small
127
+ // speedup by reducing latencies at the end of the loop.
128
+ template <typename WeightType, typename RhsType, typename OutType>
129
+ typename std::enable_if<std::is_same<WeightType, float>::value &&
130
+ std::is_same<RhsType, float>::value &&
131
+ std::is_same<OutType, float>::value>::type
132
+ SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
133
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
134
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
135
+ OutType* out_ptr, int64_t assigned_rows,
136
+ int64_t rows /* only used in SpMM variants */,
137
+ int64_t cols /* only used in SpMM variants */, int relu) {
138
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
139
+ // Broadcast the biases by 4 to undo the division by 4 in the input biases.
140
+ __m256 sum1 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
141
+ _mm_broadcast_ss(bias_ptr));
142
+ bias_ptr += 2;
143
+ __m256 sum2 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
144
+ _mm_broadcast_ss(bias_ptr));
145
+ bias_ptr += 2;
146
+
147
+ int reduced_col_count = *nnz_per_row++;
148
+ for (int c = 0; c < reduced_col_count; ++c) {
149
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
150
+ rhs_ptr += col_delta;
151
+ // Multiply this 4x4 block.
152
+ __m256 rhs =
153
+ _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptr));
154
+ __m256 weights1 = _mm256_load_ps(weights_ptr);
155
+ weights_ptr += 8;
156
+ sum1 = _mm256_add_ps(sum1, _mm256_mul_ps(weights1, rhs));
157
+ __m256 weights2 = _mm256_load_ps(weights_ptr);
158
+ weights_ptr += 8;
159
+ sum2 = _mm256_add_ps(sum2, _mm256_mul_ps(weights2, rhs));
160
+ }
161
+ Extract4Results(relu, sum1, sum2, &out_ptr);
162
+ }
163
+ }
164
+
165
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
166
+ // blocked pattern, x is a fat vector with 5 columns and b is vector. b is
167
+ // broadcast. Weights are stored for this routine by making each 4x4 block
168
+ // contiguous. Blocks are ordered in standard row-major format. column indices
169
+ // are converted to deltas and then multiplied by 2 to convert to bytes, so
170
+ // that the value can be used directly to offset the pointer into the rhs
171
+ // vector.
172
+ //
173
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
174
+ // this function. This is automatically taken care of in SparseLinearLayer.
175
+ // The bias is reconstructed through horizontal additions, which leads to a small
176
+ // speedup by reducing latencies at the end of the loop.
177
+ template <typename WeightType, typename RhsType, typename OutType>
178
+ typename std::enable_if<std::is_same<WeightType, float>::value &&
179
+ std::is_same<RhsType, float>::value &&
180
+ std::is_same<OutType, float>::value>::type
181
+ SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
182
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
183
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
184
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
185
+ int relu) {
186
+ const RhsType* rhs_ptrs[5];
187
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
188
+
189
+ OutType* out_ptrs[5];
190
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
191
+
192
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
193
+ // We will accumulate the results in 10 registers, |sum1_0| to |sum2_4|.
194
+ // Broadcast the biases by 4 to undo the division by 4 in the input biases.
195
+ __m256 sum1_0 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
196
+ _mm_broadcast_ss(bias_ptr));
197
+ bias_ptr += 2;
198
+ __m256 sum2_0 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
199
+ _mm_broadcast_ss(bias_ptr));
200
+ bias_ptr += 2;
201
+ __m256 sum1_1 = sum1_0;
202
+ __m256 sum2_1 = sum2_0;
203
+ __m256 sum1_2 = sum1_0;
204
+ __m256 sum2_2 = sum2_0;
205
+ __m256 sum1_3 = sum1_0;
206
+ __m256 sum2_3 = sum2_0;
207
+ __m256 sum1_4 = sum1_0;
208
+ __m256 sum2_4 = sum2_0;
209
+
210
+ int reduced_col_count = *nnz_per_row++;
211
+ for (int c = 0; c < reduced_col_count; ++c) {
212
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
213
+ for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;
214
+
215
+ // Multiply this 4x4 block.
216
+ __m256 rhs =
217
+ _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[0]));
218
+ __m256 weights1 = _mm256_load_ps(weights_ptr);
219
+ weights_ptr += 8;
220
+ sum1_0 = _mm256_add_ps(sum1_0, _mm256_mul_ps(weights1, rhs));
221
+ __m256 weights2 = _mm256_load_ps(weights_ptr);
222
+ weights_ptr += 8;
223
+ sum2_0 = _mm256_add_ps(sum2_0, _mm256_mul_ps(weights2, rhs));
224
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[1]));
225
+ sum1_1 = _mm256_add_ps(sum1_1, _mm256_mul_ps(weights1, rhs));
226
+ sum2_1 = _mm256_add_ps(sum2_1, _mm256_mul_ps(weights2, rhs));
227
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[2]));
228
+ sum1_2 = _mm256_add_ps(sum1_2, _mm256_mul_ps(weights1, rhs));
229
+ sum2_2 = _mm256_add_ps(sum2_2, _mm256_mul_ps(weights2, rhs));
230
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[3]));
231
+ sum1_3 = _mm256_add_ps(sum1_3, _mm256_mul_ps(weights1, rhs));
232
+ sum2_3 = _mm256_add_ps(sum2_3, _mm256_mul_ps(weights2, rhs));
233
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[4]));
234
+ sum1_4 = _mm256_add_ps(sum1_4, _mm256_mul_ps(weights1, rhs));
235
+ sum2_4 = _mm256_add_ps(sum2_4, _mm256_mul_ps(weights2, rhs));
236
+ }
237
+
238
+ Extract4Results(relu, sum1_0, sum2_0, &out_ptrs[0]);
239
+ Extract4Results(relu, sum1_1, sum2_1, &out_ptrs[1]);
240
+ Extract4Results(relu, sum1_2, sum2_2, &out_ptrs[2]);
241
+ Extract4Results(relu, sum1_3, sum2_3, &out_ptrs[3]);
242
+ Extract4Results(relu, sum1_4, sum2_4, &out_ptrs[4]);
243
+ }
244
+ }
245
+
246
+ #ifdef __AVX2__
247
+
248
+ // In-line function to finish the computation of the result as 4x int32 in
249
+ // |sum|.
250
+ inline void Compute4Results(bool relu, int kShiftAmount, __m256i& sum) {
251
+ // Horizontally add the results. We have 1 register that contains results
252
+ // [0 0 1 1 2 2 3 3], but hadd (and almost no other AVX instruction) will not
253
+ // cross lanes, so we end up with [0 1 0 1 2 3 2 3]
254
+ sum = _mm256_hadd_epi32(sum, sum);
255
+ // Permutes the middle two pairs to get the answers together.
256
+ sum = _mm256_permute4x64_epi64(sum, 0xd8);
257
+ if (kShiftAmount > 0) {
258
+ // Shift right with rounding to get the right number of mantissa bits.
259
+ __m256i rounding = _mm256_set1_epi32(1 << (kShiftAmount - 1));
260
+ sum = _mm256_add_epi32(sum, rounding);
261
+ sum = _mm256_srai_epi32(sum, kShiftAmount);
262
+ }
263
+ // Now |sum| contains [|res0|, |res1|, |res2|, |res3|, |res0|, |res1|,
264
+ // |res2|, |res3|]
265
+ if (relu) {
266
+ sum = _mm256_max_epi32(sum, _mm256_setzero_si256());
267
+ }
268
+ }
269
+
270
+ // In-line function to extract the 4x int32 results from |sum| to memory.
271
+ // Non-const reference for |sum| as it is a register.
272
+ inline void Extract4xint32(bool relu, int kShiftAmount, __m256i& sum,
273
+ int32_t** out_ptr) {
274
+ Compute4Results(relu, kShiftAmount, sum);
275
+ // Save the lower 128 bits (4x int32).
276
+ __m128i result = _mm256_extractf128_si256(sum, 0);
277
+ _mm_store_si128(reinterpret_cast<__m128i*>(*out_ptr), result);
278
+ *out_ptr += 4;
279
+ }
280
+
281
+ // In-line function to extract the 4x int32 results from sum to 4x int16 in
282
+ // memory.
283
+ // Non-const reference for |sum| as it is a register.
284
+ inline void Extract4xint16(bool relu, int kShiftAmount, __m256i& sum,
285
+ int16_t** out_ptr) {
286
+ Compute4Results(relu, kShiftAmount, sum);
287
+ // Clip to 16 bit range (with saturation) and pack in the bottom 64 bits.
288
+ // Converts the lower 4x int32 in bottom 128 bits to 4x int16 in bottom 64
289
+ // bits, replicated in the next 64 bits.
290
+ sum = _mm256_packs_epi32(sum, sum);
291
+ // Save 4x int 16 from the bottom 64 bits.
292
+ *reinterpret_cast<int64_t*>(*out_ptr) = _mm256_extract_epi64(sum, 0);
293
+ *out_ptr += 4;
294
+ }
295
+
296
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
297
+ // blocked pattern, x is a vector and b is vector. Weights are stored for this
298
+ // routine by making each 4x4 block contiguous. Blocks are ordered in standard
299
+ // row-major format. column indices are converted to deltas and then multiplied
300
+ // by 2 to convert to bytes, so that the value can be used directly to offset
301
+ // the pointer into the rhs vector.
302
+ //
303
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
304
+ // this function. This is automatically taken care of in SparseLinearLayer.
305
+ // The bias is reconstructed through horizontal additions, which leads to a small
306
+ // speedup by reducing latencies at the end of the loop.
307
+ template <typename WeightType, typename RhsType, typename OutType>
308
+ typename std::enable_if<
309
+ IsFixed16Type<WeightType>::value && IsFixed16Type<RhsType>::value &&
310
+ (IsFixed32Type<OutType>::value || IsFixed16Type<OutType>::value)>::type
311
+ SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
312
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
313
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
314
+ OutType* out_ptr, int64_t assigned_rows,
315
+ int64_t rows /* only used in SpMM variants */,
316
+ int64_t cols /* only used in SpMM variants */, int relu) {
317
+ constexpr int kShiftAmount =
318
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
319
+ OutType::kMantissaBits;
320
+ static_assert(kShiftAmount >= 0,
321
+ "Result must have fewer mantissa bits than product");
322
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
323
+ // Load the biases duplicated into a 256 bit register [0 1 2 3 0 1 2 3].
324
+ __m128i bias = _mm_load_si128(reinterpret_cast<__m128i const*>(bias_ptr));
325
+ __m256i biases = _mm256_set_m128i(bias, bias);
326
+ bias_ptr += 4;
327
+ // Swap the top two pairs: [0 1 2 3 2 3 0 1]
328
+ // TODO(b/188702959): consider |_mm256_permutevar8x32|, and set the index
329
+ // register outside the row loop.
330
+ biases = _mm256_permute4x64_epi64(biases, 0xb4);
331
+ // Duplicate the low pairs in each lane: [0 0 1 1 2 2 3 3].
332
+ biases = _mm256_unpacklo_epi32(biases, biases);
333
+ // Double the results to make up for the division by 4.
334
+ // TODO(b/188702959): consider moving this to where the biases are computed.
335
+ __m256i sum = _mm256_add_epi32(biases, biases);
336
+
337
+ // TODO(b/188702959): People don't like the old-fashioned, close-to-the-
338
+ // metal notation of *|nnz_per_row|++, so measure the effect of putting the
339
+ // increment in the for loop.
340
+ int reduced_col_count = *nnz_per_row;
341
+ ++nnz_per_row;
342
+ for (int c = 0; c < reduced_col_count; ++c) {
343
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
344
+ rhs_ptr += col_delta;
345
+ // Multiply this 4x4 block.
346
+ // Get the 4x int16 into the bottom of rhs_64.
347
+ __m128i rhs_64 =
348
+ _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptr));
349
+ // Load all 16 weights.
350
+ __m256i weights =
351
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
352
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
353
+ // [0123 0123 0123 0123].
354
+ __m256i rhs = _mm256_broadcastq_epi64(rhs_64);
355
+ weights_ptr += 16;
356
+ // |_mm256_madd_epi16| does 16x16x16=16x32 bit multiply and horizontally
357
+ // adds adjacent pairs to make 8x32 bit results. Add these to the sum.
358
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(weights, rhs));
359
+ }
360
+ static_assert(
361
+ IsFixed16Type<OutType>::value || IsFixed32Type<OutType>::value,
362
+ "AVX2 kernel only supports fixed16 and fixed32 types");
363
+ // The only significant difference between fixed16 and fixed32 is the size
364
+ // of the storage unit. The registers have to be repacked accordingly.
365
+ if (IsFixed32Type<OutType>::value) {
366
+ Extract4xint32(relu, kShiftAmount, sum,
367
+ reinterpret_cast<int32_t**>(&out_ptr));
368
+ } else {
369
+ Extract4xint16(relu, kShiftAmount, sum,
370
+ reinterpret_cast<int16_t**>(&out_ptr));
371
+ }
372
+ }
373
+ }
374
+
375
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
376
+ // blocked pattern, x is a fat vector with 5 columns and b is vector. b is
377
+ // broadcast. Weights are stored for this routine by making each 4x4 block
378
+ // contiguous. Blocks are ordered in standard row-major format. column indices
379
+ // are converted to deltas and then multiplied by 2 to convert to bytes, so
380
+ // that the value can be used directly to offset the pointer into the rhs
381
+ // vector.
382
+ //
383
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
384
+ // this function. This is automatically taken care of in SparseLinearLayer.
385
+ // The bias is reconstructed through horizontal additions, which leads to a small
386
+ // speedup by reducing latencies at the end of the loop.
387
+ template <typename WeightType, typename RhsType, typename OutType>
388
+ typename std::enable_if<
389
+ IsFixed16Type<WeightType>::value && IsFixed16Type<RhsType>::value &&
390
+ (IsFixed32Type<OutType>::value || IsFixed16Type<OutType>::value)>::type
391
+ SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
392
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
393
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
394
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
395
+ int relu) {
396
+ constexpr int kShiftAmount =
397
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
398
+ OutType::kMantissaBits;
399
+ static_assert(kShiftAmount >= 0,
400
+ "Result must have fewer mantissa bits than product");
401
+ const RhsType* rhs_ptrs[5];
402
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
403
+
404
+ OutType* out_ptrs[5];
405
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
406
+
407
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
408
+ // We will accumulate the results in 5 registers, sum_0 to sum_4.
409
+ // Load the biases duplicated into a 256 bit register [0 1 2 3 0 1 2 3].
410
+ __m128i bias = _mm_load_si128(reinterpret_cast<__m128i const*>(bias_ptr));
411
+ __m256i biases = _mm256_set_m128i(bias, bias);
412
+ bias_ptr += 4;
413
+ // Swap the top two pairs: [0 1 2 3 2 3 0 1]
414
+ biases = _mm256_permute4x64_epi64(biases, 0xb4);
415
+ // Duplicate the low pairs in each lane: [0 0 1 1 2 2 3 3].
416
+ biases = _mm256_unpacklo_epi32(biases, biases);
417
+ // Double the results to make up for the division by 4.
418
+ __m256i sum_0 = _mm256_add_epi32(biases, biases);
419
+ __m256i sum_1 = sum_0;
420
+ __m256i sum_2 = sum_0;
421
+ __m256i sum_3 = sum_0;
422
+ __m256i sum_4 = sum_0;
423
+
424
+ int reduced_col_count = *nnz_per_row;
425
+ ++nnz_per_row;
426
+ for (int c = 0; c < reduced_col_count; ++c) {
427
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
428
+ for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;
429
+ // Multiply this 4x4 block.
430
+ // Get the 4x int16 into the bottom of |rhs_64|.
431
+ __m128i rhs_64 =
432
+ _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[0]));
433
+ // Load all 16 weights.
434
+ __m256i weights =
435
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
436
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
437
+ // [0123 0123 0123 0123].
438
+ __m256i rhs = _mm256_broadcastq_epi64(rhs_64);
439
+ weights_ptr += 16;
440
+ // |_mm256_madd_epi16| does 16x16x16=16x32 bit multiply and horizontally
441
+ // adds adjacent pairs to make 8x32 bit results. Add these to the sum.
442
+ sum_0 = _mm256_add_epi32(sum_0, _mm256_madd_epi16(weights, rhs));
443
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[1]));
444
+ rhs = _mm256_broadcastq_epi64(rhs_64);
445
+ sum_1 = _mm256_add_epi32(sum_1, _mm256_madd_epi16(weights, rhs));
446
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[2]));
447
+ rhs = _mm256_broadcastq_epi64(rhs_64);
448
+ sum_2 = _mm256_add_epi32(sum_2, _mm256_madd_epi16(weights, rhs));
449
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[3]));
450
+ rhs = _mm256_broadcastq_epi64(rhs_64);
451
+ sum_3 = _mm256_add_epi32(sum_3, _mm256_madd_epi16(weights, rhs));
452
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[4]));
453
+ rhs = _mm256_broadcastq_epi64(rhs_64);
454
+ sum_4 = _mm256_add_epi32(sum_4, _mm256_madd_epi16(weights, rhs));
455
+ }
456
+ static_assert(
457
+ IsFixed16Type<OutType>::value || IsFixed32Type<OutType>::value,
458
+ "AVX2 kernel only supports fixed16 and fixed32 types");
459
+ // The only significant difference between fixed16 and fixed32 is the size
460
+ // of the storage unit. The registers have to be repacked accordingly.
461
+ if (IsFixed32Type<OutType>::value) {
462
+ Extract4xint32(relu, kShiftAmount, sum_0,
463
+ reinterpret_cast<int32_t**>(&out_ptrs[0]));
464
+ Extract4xint32(relu, kShiftAmount, sum_1,
465
+ reinterpret_cast<int32_t**>(&out_ptrs[1]));
466
+ Extract4xint32(relu, kShiftAmount, sum_2,
467
+ reinterpret_cast<int32_t**>(&out_ptrs[2]));
468
+ Extract4xint32(relu, kShiftAmount, sum_3,
469
+ reinterpret_cast<int32_t**>(&out_ptrs[3]));
470
+ Extract4xint32(relu, kShiftAmount, sum_4,
471
+ reinterpret_cast<int32_t**>(&out_ptrs[4]));
472
+ } else {
473
+ Extract4xint16(relu, kShiftAmount, sum_0,
474
+ reinterpret_cast<int16_t**>(&out_ptrs[0]));
475
+ Extract4xint16(relu, kShiftAmount, sum_1,
476
+ reinterpret_cast<int16_t**>(&out_ptrs[1]));
477
+ Extract4xint16(relu, kShiftAmount, sum_2,
478
+ reinterpret_cast<int16_t**>(&out_ptrs[2]));
479
+ Extract4xint16(relu, kShiftAmount, sum_3,
480
+ reinterpret_cast<int16_t**>(&out_ptrs[3]));
481
+ Extract4xint16(relu, kShiftAmount, sum_4,
482
+ reinterpret_cast<int16_t**>(&out_ptrs[4]));
483
+ }
484
+ }
485
+ }
486
+
487
+ // Processes one GRU gate input with sigmoid.
488
+ template <int InputMantissaBits, int StateMantissaBits, bool SplitGates>
489
+ inline __m256i GRUGateSigmoid(const void* gate_ptr, const void* gate_other_ptr,
490
+ const __m256i& input,
491
+ const int32_t* sigmoid_table) {
492
+ __m256i gate = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(gate_ptr));
493
+ if (SplitGates) {
494
+ __m256i other =
495
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(gate_other_ptr));
496
+ gate = _mm256_add_epi32(gate, other);
497
+ }
498
+ gate = _mm256_add_epi32(gate, input);
499
+ // Compute sigmoids on reset and update.
500
+ return csrblocksparse::fixed32_sigmoid_fixed16<InputMantissaBits,
501
+ StateMantissaBits>(
502
+ sigmoid_table, gate);
503
+ }
504
+
505
+ // Processes the tanh and the final combination, returning the new GRU state.
506
+ template <int InputMantissaBits, int StateMantissaBits, bool SplitGates = false>
507
+ inline __m256i GRUGateState(const __m256i& cell, const __m256i& reset,
508
+ const __m256i& update,
509
+ const __m256i& rounding_offset,
510
+ const void* gate_ptr, const void* gate_other_ptr,
511
+ const void* gru_h_ptr, const int32_t* tanh_table) {
512
+ // Multiply the cell GRU output and the reset. There is a slight danger of
513
+ // loss of precision here, so use 32x32=64 bit and shift back after.
514
+ __m256i gru = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(gate_ptr));
515
+ if (SplitGates) {
516
+ __m256i other_gru =
517
+ _mm256_loadu_si256(reinterpret_cast<__m256i const*>(gate_other_ptr));
518
+ gru = _mm256_add_epi32(gru, other_gru);
519
+ }
520
+ // This only computes the products of the low-order 32 bits of each pair.
521
+ __m256i gru_lo = _mm256_mul_epi32(gru, reset);
522
+ // Swap odd and even 32-bit units and do it again to get the high products.
523
+ gru = _mm256_shuffle_epi32(gru, 0xb1);
524
+ __m256i gru_hi = _mm256_mul_epi32(gru, _mm256_shuffle_epi32(reset, 0xb1));
525
+ // Now shift right to compensate for the multiply and re-interleave the
526
+ // 32-bit results.
527
+ // NOTE: There is no shift right arithmetic for 64 bit values until AVX512!
528
+ // Fortunately it doesn't matter, as the results are being truncated to 32
529
+ // bits and we aren't shifting right by more than 32 bits here.
530
+ gru_lo = _mm256_srli_epi64(gru_lo, StateMantissaBits);
531
+ // The upper results are shifted LEFT, so we can use blend to recombine in
532
+ // a single instruction.
533
+ gru_hi = _mm256_slli_epi64(gru_hi, 32 - StateMantissaBits);
534
+ // Recombine the 32 bit results from lo and hi, alternating.
535
+ gru = _mm256_blend_epi32(gru_lo, gru_hi, 0xaa);
536
+ gru = _mm256_add_epi32(cell, gru);
537
+ // Compute tanh on the result. Although this instantly discards a bunch of
538
+ // bits, there were only 7 surplus bits for the multiply, which isn't enough
539
+ // to do it as 16x16=32.
540
+ __m256i hbar =
541
+ csrblocksparse::fixed32_tanh_fixed16<InputMantissaBits,
542
+ StateMantissaBits>(tanh_table, gru);
543
+ // Load the 16-bit previous GRU state and sign-extend to 32 bits.
544
+ gru = _mm256_cvtepi16_epi32(
545
+ _mm_load_si128(reinterpret_cast<__m128i const*>(gru_h_ptr)));
546
+ gru = _mm256_sub_epi32(gru, hbar);
547
+ // Since |gru| is 16 bit sign-extended to 32, and |update| is the output of
548
+ // sigmoid, it is always contained within 16 bits and never negative, we can
549
+ // use |madd_epi16| to do 16x16=32 multiply with horizontal adding as the
550
+ // addend will always be zero, and this is twice as fast as full blown
551
+ // 32x32=32. The only possible problem is if the subtract above caused
552
+ // overflow.
553
+ gru = _mm256_madd_epi16(gru, update);
554
+ // Renormalize to fixed16. This time rounding is critical, as this is the
555
+ // output GRU state.
556
+ gru = _mm256_add_epi32(gru, rounding_offset);
557
+ gru = _mm256_srai_epi32(gru, StateMantissaBits);
558
+ return _mm256_add_epi32(gru, hbar);
559
+ }
560
+
561
+ template <typename Type>
562
+ typename std::enable_if<IsFixed32Type<Type>::value>::type SumVectors(
563
+ int start, int end, const Type* add1, const Type* add2, Type* result) {
564
+ constexpr int kSIMDWidth = 8;
565
+ for (int i = start; i < end; i += kSIMDWidth) {
566
+ __m256i data1 =
567
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add1 + i));
568
+ __m256i data2 =
569
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add2 + i));
570
+ data1 = _mm256_add_epi32(data1, data2);
571
+ _mm256_store_si256(reinterpret_cast<__m256i*>(result + i), data1);
572
+ }
573
+ }
574
+
575
+ template <typename Type>
576
+ typename std::enable_if<IsFixed16Type<Type>::value>::type SumVectors(
577
+ int start, int end, const Type* add1, const Type* add2, Type* result) {
578
+ constexpr int kSIMDWidth = 16;
579
+ for (int i = start; i < end; i += kSIMDWidth) {
580
+ __m256i data1 =
581
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add1 + i));
582
+ __m256i data2 =
583
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add2 + i));
584
+ data1 = _mm256_add_epi16(data1, data2);
585
+ _mm256_store_si256(reinterpret_cast<__m256i*>(result + i), data1);
586
+ }
587
+ }
588
+
589
+ #endif // __AVX2__
590
+
591
+ } // namespace detail
592
+ } // namespace csrblocksparse
593
+
594
+ #undef LABEL_COL_LOOP
595
+ #undef LABEL_ROW_LOOP
596
+ #undef LABEL_SKIP_COL_LOOP
597
+ #undef LABEL_TOP_LOOP
598
+
599
+ #endif // __AVX__
600
+
601
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_AVX_H_
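A note on the fixed-point output path above: Extract4xint32/Extract4xint16 renormalize the 32-bit accumulators with an arithmetic right shift plus rounding, where the shift is the difference between the product's and the output's mantissa bits. The standalone scalar sketch below (hypothetical function name, not part of this commit) shows just that step.

#include <cstdint>

// Mirrors the rounding + arithmetic shift in Compute4Results: drops
// |shift_amount| mantissa bits from |sum| with round-to-nearest.
int32_t RoundingShiftRight(int32_t sum, int shift_amount) {
  if (shift_amount <= 0) return sum;
  const int32_t rounding = 1 << (shift_amount - 1);
  return (sum + rounding) >> shift_amount;  // arithmetic shift on int32_t
}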
sparse_matmul/compute/kernels_generic.h ADDED
@@ -0,0 +1,273 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
19
+
20
+ #include <algorithm>
21
+ #include <type_traits>
22
+
23
+ #include "sparse_matmul/numerics/fixed_types.h"
24
+ #include "sparse_matmul/numerics/float16_types.h"
25
+ #include "sparse_matmul/numerics/type_utils.h"
26
+
27
+ // Separate out the assembly kernels for readability. Eventually this will
28
+ // become an ifdef switch on the architecture type.
29
+ #if defined __aarch64__
30
+ #include "sparse_matmul/compute/kernels_arm.h"
31
+ #elif defined __AVX__
32
+ #include "sparse_matmul/compute/kernels_avx.h"
33
+ #else // defined __AVX__
34
+ // If there is no architecture-specific implementation, then always use generic.
35
+ template <typename WeightType, typename RhsType, typename OutType>
36
+ struct ShouldEnableGenericSpMV_4x4 : std::true_type {};
37
+ template <typename WeightType, typename RhsType, typename OutType>
38
+ struct ShouldEnableGenericSpMM5_4x4 : std::true_type {};
39
+ template <typename WeightType, typename RhsType, typename OutType>
40
+ struct ShouldEnableGenericSpMV_1x1 : std::true_type {};
41
+ template <typename WeightType, typename RhsType, typename OutType>
42
+ struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};
43
+ template <typename Type>
44
+ struct ShouldEnableGenericAdd : std::true_type {};
45
+ #endif // defined __aarch64__
46
+
47
+ namespace csrblocksparse {
48
+ namespace detail {
49
+
50
+ // The computational routines do NO error checking for speed. It is assumed
51
+ // that this has been handled by CSRBlockSparseMatrix.
52
+
53
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
54
+ // blocked pattern, x is a vector and b is vector. Weights are stored for this
55
+ // routine by making each 4x4 block contiguous. Blocks are ordered in standard
56
+ // row-major format. column indices are converted to deltas and then multiplied
57
+ // by 2 to convert to bytes, so that the value can be used directly to offset
58
+ // the pointer into the rhs vector.
59
+ //
60
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
61
+ // this function. This is automatically taken care of in SparseLinearLayer.
62
+ // The bias is reconstructed through horizontal additions, which leads to a small
63
+ // speedup by reducing latencies at the end of the loop.
64
+ template <typename WeightType, typename RhsType, typename OutType>
65
+ typename std::enable_if<
66
+ ShouldEnableGenericSpMV_4x4<WeightType, RhsType, OutType>::value>::type
67
+ SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
68
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
69
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
70
+ OutType* out_ptr, int64_t assigned_rows,
71
+ int64_t rows /* only used in SpMM variants */,
72
+ int64_t cols /* only used in SpMM variants */, int relu) {
73
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
74
+ float accumulators[4];
75
+ // Undo the division by 4 that is done for the benefit of the assembly version.
76
+ for (int i = 0; i < 4; ++i)
77
+ accumulators[i] = 4.f * static_cast<float>(*bias_ptr++);
78
+
79
+ int reduced_col_count = *nnz_per_row++;
80
+ for (int c = 0; c < reduced_col_count; ++c) {
81
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
82
+ rhs_ptr += col_delta;
83
+
84
+ // Multiply this 4x4 block.
85
+ for (int i = 0; i < 4; ++i) {
86
+ for (int j = 0; j < 4; ++j) {
87
+ accumulators[i] += static_cast<float>(*weights_ptr++) *
88
+ static_cast<float>(rhs_ptr[j]);
89
+ }
90
+ }
91
+ }
92
+
93
+ for (int i = 0; i < 4; ++i)
94
+ *out_ptr++ = static_cast<OutType>(relu ? std::max(accumulators[i], 0.f)
95
+ : accumulators[i]);
96
+ }
97
+ }
98
+
99
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
100
+ // blocked pattern, x is a fat vector with 5 columns and b is vector. b is
101
+ // broadcast. Weights are stored for this routine by making each 4x4 block
102
+ // contiguous. Blocks are ordered in standard row-major format. column indices
103
+ // are converted to deltas and then multiplied by 2 to convert to bytes, so
104
+ // that the value can be used directly to offset the pointer into the rhs
105
+ // vector.
106
+ //
107
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
108
+ // this function. This is automatically taken care of in SparseLinearLayer.
109
+ // The bias is reconstructed through horizontal additions, which leads to a small
110
+ // speedup by reducing latencies at the end of the loop.
111
+ template <typename WeightType, typename RhsType, typename OutType>
112
+ typename std::enable_if<
113
+ ShouldEnableGenericSpMM5_4x4<WeightType, RhsType, OutType>::value>::type
114
+ SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
115
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
116
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
117
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
118
+ int relu) {
119
+ const RhsType* rhs_ptrs[5];
120
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
121
+
122
+ OutType* out_ptrs[5];
123
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
124
+
125
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
126
+ float accumulators[4][5];
127
+ // Undo the division by 4 that is done for the benefit of the assembly version.
128
+ for (int i = 0; i < 4; ++i) {
129
+ for (int k = 0; k < 5; ++k) {
130
+ accumulators[i][k] = 4.f * static_cast<float>(*bias_ptr);
131
+ }
132
+ ++bias_ptr;
133
+ }
134
+
135
+ int reduced_col_count = *nnz_per_row++;
136
+ for (int c = 0; c < reduced_col_count; ++c) {
137
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
138
+ for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;
139
+
140
+ // multiply this 4x4 block
141
+ for (int i = 0; i < 4; ++i) {
142
+ for (int j = 0; j < 4; ++j) {
143
+ for (int k = 0; k < 5; ++k) {
144
+ accumulators[i][k] += static_cast<float>(*weights_ptr) *
145
+ static_cast<float>(rhs_ptrs[k][j]);
146
+ }
147
+ weights_ptr++;
148
+ }
149
+ }
150
+ }
151
+
152
+ for (int k = 0; k < 5; ++k) {
153
+ for (int i = 0; i < 4; ++i) {
154
+ out_ptrs[k][0] = static_cast<OutType>(
155
+ relu ? std::max(accumulators[i][k], 0.f) : accumulators[i][k]);
156
+ out_ptrs[k]++;
157
+ }
158
+ }
159
+ }
160
+ }
161
+
162
+ // Performs the calculation y = A * x + b where A is a sparse matrix with
163
+ // a 1x1 blocked pattern (ie unstructured), x is a
164
+ // vector and b is vector.
165
+ // Weights are stored for this routine in standard CSR format. Each row must
166
+ // have a multiple of 8 columns.
167
+ // column indices are converted to deltas and then multiplied by 2 to convert
168
+ // to bytes, so that the value can be used directly to offset the pointer
169
+ // into the rhs vector.
170
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
171
+ // this function. This is automatically taken care of in SparseLinearLayer.
172
+ // The bias is reconstructed through horizontal additions, which leads to a small
173
+ // speedup by reducing latencies at the end of the loop.
174
+ template <typename WeightType, typename RhsType, typename OutType>
175
+ typename std::enable_if<
176
+ ShouldEnableGenericSpMV_1x1<WeightType, RhsType, OutType>::value>::type
177
+ SpMV_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
178
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
179
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
180
+ OutType* out_ptr, int64_t assigned_rows,
181
+ int64_t rows /* only used in SpMM variants */,
182
+ int64_t cols /* only used in SpMM variants */, int relu) {
183
+ for (int row = 0; row < assigned_rows; ++row) {
184
+ // Undo the division by 4 that is done for the benefit of the assembly version.
185
+ float accumulator = 4.f * static_cast<float>(*bias_ptr++);
186
+
187
+ int col_count = *nnz_per_row++;
188
+ for (int c = 0; c < col_count; ++c) {
189
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
190
+ rhs_ptr += col_delta;
191
+
192
+ accumulator +=
193
+ static_cast<float>(*weights_ptr++) * static_cast<float>(*rhs_ptr);
194
+ }
195
+
196
+ *out_ptr++ =
197
+ static_cast<OutType>(relu ? std::max(accumulator, 0.f) : accumulator);
198
+ }
199
+ }
200
+
201
+ // Performs the calculation y = A * x + b where A is a sparse matrix with
202
+ // a 1x1 blocked pattern (ie unstructured), x is a
203
+ // vector and b is vector.
204
+ // Weights are stored for this routine in standard CSR format. Each row must
205
+ // have a multiple of 8 columns.
206
+ // column indices are converted to deltas and then multiplied by 2 to convert
207
+ // to bytes, so that the value can be used directly to offset the pointer
208
+ // into the rhs vector.
209
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
210
+ // this function. This is automatically taken care of in SparseLinearLayer.
211
+ // The bias is reconstructed through horizontal additions, which leads to a small
212
+ // speedup by reducing latencies at the end of the loop.
213
+ template <typename WeightType, typename RhsType, typename OutType>
214
+ typename std::enable_if<
215
+ ShouldEnableGenericSpMM5_1x1<WeightType, RhsType, OutType>::value>::type
216
+ SpMM5_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
217
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
218
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
219
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
220
+ int relu) {
221
+ const RhsType* rhs_ptrs[5];
222
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
223
+
224
+ OutType* out_ptrs[5];
225
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
226
+
227
+ for (int row = 0; row < assigned_rows; ++row) {
228
+ // Undo the division by 4 that is done for the benefit of the assembly version.
229
+ float accumulator[5];
230
+ for (int i = 0; i < 5; ++i)
231
+ accumulator[i] = 4.f * static_cast<float>(*bias_ptr);
232
+
233
+ ++bias_ptr;
234
+
235
+ int col_count = *nnz_per_row++;
236
+ for (int c = 0; c < col_count; ++c) {
237
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
238
+ for (int i = 0; i < 5; ++i) {
239
+ rhs_ptrs[i] += col_delta;
240
+ accumulator[i] += static_cast<float>(*weights_ptr) *
241
+ static_cast<float>(rhs_ptrs[i][0]);
242
+ }
243
+ weights_ptr++;
244
+ }
245
+
246
+ for (int i = 0; i < 5; ++i) {
247
+ out_ptrs[i][0] = static_cast<OutType>(relu ? std::max(accumulator[i], 0.f)
248
+ : accumulator[i]);
249
+ out_ptrs[i]++;
250
+ }
251
+ }
252
+ }
253
+
254
+ template <typename Type>
255
+ typename std::enable_if<ShouldEnableGenericAdd<Type>::value>::type SumVectors(
256
+ int start, int end, const Type* add1, const Type* add2, Type* result) {
257
+ LOG_FIRST_N(WARNING, 1) << "SumVectors: using generic kernel!";
258
+ for (int i = start; i < end; ++i) {
259
+ Type sum = static_cast<Type>(static_cast<float>(add1[i]) +
260
+ static_cast<float>(add2[i]));
261
+ result[i] = sum;
262
+ }
263
+ }
264
+
265
+ } // namespace detail
266
+ } // namespace csrblocksparse
267
+
268
+ #undef LABEL_COL_LOOP
269
+ #undef LABEL_ROW_LOOP
270
+ #undef LABEL_SKIP_COL_LOOP
271
+ #undef LABEL_TOP_LOOP
272
+
273
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
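
The .25f bias convention called out above is easy to miss when driving these kernels directly, so here is a minimal sketch of the round trip it describes; PrepareBias is a hypothetical helper name, not part of this library:

  #include <cstddef>
  #include <vector>

  // Pre-scale biases by 0.25f before handing them to the SpMM/SpMV kernels;
  // the kernel computes accumulator = 4.f * scaled_bias, so the two factors
  // cancel and the effective bias is unchanged.
  std::vector<float> PrepareBias(const std::vector<float>& bias) {
    std::vector<float> scaled(bias.size());
    for (std::size_t i = 0; i < bias.size(); ++i) scaled[i] = 0.25f * bias[i];
    return scaled;
  }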
sparse_matmul/compute/matmul.h ADDED
@@ -0,0 +1,199 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_H_
19
+
20
+ #include <cstdint>
21
+ #include <vector>
22
+
23
+ #include "absl/time/time.h"
24
+ #include "sparse_matmul/compute/matmul_fixed_avx2.h"
25
+ #include "sparse_matmul/compute/matmul_generic.h"
26
+ #include "sparse_matmul/numerics/fixed_types.h"
27
+ #include "sparse_matmul/numerics/type_utils.h"
28
+ #if defined(__x86_64__) || defined(__i386__) || defined(_WIN32)
29
+ #include <cpuid.h>
30
+ #endif
31
+
32
+ namespace csrblocksparse {
33
+
34
+ // The number of elements in a block.
35
+ constexpr int kBlockSize = 4;
36
+
37
+ // Base class for Matmul containing the members that are not type-specific.
38
+ class MatmulBase {
39
+ public:
40
+ // Constructor initializes the flags that determine which implementation to
41
+ // use at run-time, constrained by both compiler flags and cpuid.
42
+ MatmulBase() {
43
+ #if defined(__x86_64__) || defined(__i386__) || defined(_WIN32)
44
+ // Code tested to work on Linux systems and multiple Android emulators.
45
+ unsigned int eax, ebx, ecx, edx;
46
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
47
+ using_avx_ = (ecx & bit_AVX) != 0;
48
+ if (using_avx_) {
49
+ __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
50
+ using_avx2_ = (ebx & bit_AVX2) != 0;
51
+ using_avx512_ = (ebx & bit_AVX512F) != 0 && (ebx & bit_AVX512DQ) &&
52
+ (ebx & bit_AVX512BW) != 0;
53
+ VLOG(2) << "avx2 flag=" << using_avx2_ << " 512=" << using_avx512_;
54
+ } else {
55
+ LOG(ERROR) << "AVX not found at all!";
56
+ }
57
+ }
58
+ #else
59
+ using_aarch64_ = true;
60
+ #endif
61
+ }
62
+
63
+ protected:
64
+ // Flags that define what (runtime) architectures are available. Flags that
65
+ // are set are limited by both the compiler flags and runtime environment.
66
+ bool using_avx512_ = false;
67
+ bool using_avx2_ = false;
68
+ bool using_avx_ = false;
69
+ bool using_aarch64_ = false;
70
+ };
71
+
72
+ // The master template is really a catch-all for the unimplemented cases to
73
+ // report an error.
74
+ template <typename WeightType, typename RhsType>
75
+ class Matmul : public MatmulBase {
76
+ public:
77
+ // Sparse inputs, outputs replicated strided for each thread.
78
+ template <typename OutType>
79
+ void MatVec4x4(const WeightType* weights, const RhsType* rhs,
80
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias,
81
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
82
+ int start_row, int end_row, bool relu, int replicas,
83
+ int stride, OutType* output) {
84
+ // The specializations should take care of every real case.
85
+ CHECK(false) << "Unsupported combination of types used!";
86
+ }
87
+ template <typename OutType>
88
+ void MatVec8x4(const WeightType* weights, const RhsType* rhs,
89
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias,
90
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
91
+ int start_row, int end_row, bool relu, int replicas,
92
+ int stride, OutType* output) {
93
+ // The specializations should take care of every real case.
94
+ CHECK(false) << "Unsupported combination of types used!";
95
+ }
96
+ };
97
+
98
+ // Full specialization for float.
99
+ template <>
100
+ class Matmul<float, float> : public MatmulBase {
101
+ public:
102
+ void MatVec4x4(const float* weights, const float* rhs, const float* bias,
103
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
104
+ int start_row, int end_row, bool relu, int replicas,
105
+ int stride, float* output) {
106
+ detail::MatVecFloatGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
107
+ start_row, end_row, /*block_height=*/4,
108
+ /*block_width=*/4, relu, replicas, stride,
109
+ output);
110
+ }
111
+ void MatVec8x4(const float* weights, const float* rhs, const float* bias,
112
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
113
+ int start_row, int end_row, bool relu, int replicas,
114
+ int stride, float* output) {
115
+ detail::MatVecFloatGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
116
+ start_row, end_row, /*block_height=*/8,
117
+ /*block_width=*/4, relu, replicas, stride,
118
+ output);
119
+ }
120
+ };
121
+
122
+ // Partial specialization for fixed types. Covers fixed16xfixed16 = OutType,
123
+ // where OutType should be fixed16 or fixed32. The mantissa bits don't have
124
+ // to match.
125
+ template <int WeightBits, int RhsBits>
126
+ class Matmul<fixed16<WeightBits>, fixed16<RhsBits>> : public MatmulBase {
127
+ public:
128
+ using WeightType = fixed16<WeightBits>;
129
+ using RhsType = fixed16<RhsBits>;
130
+
131
+ template <typename OutType>
132
+ void MatVec4x4(const int16_t* weights, const int16_t* rhs,
133
+ const int32_t* bias, const int32_t* nnz_per_row,
134
+ const int16_t* rhs_indices, int start_row, int end_row,
135
+ bool relu, int replicas, int stride, OutType* output) {
136
+ constexpr int kShiftAmount =
137
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
138
+ OutType::kMantissaBits;
139
+ static_assert(kShiftAmount >= 0,
140
+ "OutType must not have more mantissa bits than inputs");
141
+ #if defined __AVX2__
142
+ CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
143
+ if (sizeof(*output) == 4) {
144
+ int32_t* out32 = reinterpret_cast<int32_t*>(output);
145
+ detail::MatVec4x4FixedAVX2(weights, rhs, bias, nnz_per_row, rhs_indices,
146
+ start_row, end_row, relu, kShiftAmount,
147
+ replicas, stride, out32);
148
+ } else {
149
+ int16_t* out16 = reinterpret_cast<int16_t*>(output);
150
+ detail::MatVec4x4FixedAVX2(weights, rhs, bias, nnz_per_row, rhs_indices,
151
+ start_row, end_row, relu, kShiftAmount,
152
+ replicas, stride, out16);
153
+ }
154
+ #elif defined __aarch64__
155
+ if (using_aarch64_) {
156
+ LOG(FATAL) << "Fixed16 MatVec4x4 not yet implemented!";
157
+ }
158
+
159
+ #else
160
+ detail::MatVecFixedGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
161
+ start_row, end_row, /*block_height=*/4,
162
+ /*block_width=*/4, relu, sizeof(*output),
163
+ kShiftAmount, replicas, stride, output);
164
+ #endif // __AVX2__
165
+ }
166
+
167
+ template <typename OutType>
168
+ void MatVec8x4(const int16_t* weights, const int16_t* rhs,
169
+ const int32_t* bias, const int32_t* nnz_per_row,
170
+ const int16_t* rhs_indices, int start_row, int end_row,
171
+ bool relu, int replicas, int stride, OutType* output) {
172
+ constexpr int kShiftAmount =
173
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
174
+ OutType::kMantissaBits;
175
+ static_assert(kShiftAmount >= 0,
176
+ "OutType must not have more mantissa bits than inputs");
177
+ #if defined __AVX2__
178
+ CHECK(replicas == 1 && sizeof(*output) == 4)
179
+ << "Only replicas == 1 and fixed32 output are implemented for AVX2!";
180
+ CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
181
+ int32_t* out32 = reinterpret_cast<int32_t*>(output);
182
+ detail::MatVec8x4FixedAVX2(weights, rhs, bias, nnz_per_row, rhs_indices,
183
+ start_row, end_row, relu, kShiftAmount, out32);
184
+ #elif defined __aarch64__
185
+ if (using_aarch64_) {
186
+ LOG(FATAL) << "Fixed16 MatVec8x4 not yet implemented!";
187
+ }
188
+ #else
189
+ detail::MatVecFixedGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
190
+ start_row, end_row, /*block_height=*/8,
191
+ /*block_width=*/4, relu, sizeof(*output),
192
+ kShiftAmount, replicas, stride, output);
193
+ #endif // __AVX2__
194
+ }
195
+ };
196
+
197
+ } // namespace csrblocksparse
198
+
199
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_H_
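
The MatmulBase constructor above is where the AVX/AVX2 dispatch flags get set; a small standalone sketch of the same cpuid pattern, assuming GCC/Clang's <cpuid.h> as used in the class:

  #include <cpuid.h>
  #include <cstdio>

  // Mirrors the checks in MatmulBase(): leaf 1 ECX for AVX, then
  // leaf 7 / subleaf 0 EBX for AVX2.
  static bool CpuHasAvx2() {
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0) return false;
    if ((ecx & bit_AVX) == 0) return false;
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0) return false;
    return (ebx & bit_AVX2) != 0;
  }

  int main() { std::printf("avx2=%d\n", CpuHasAvx2() ? 1 : 0); }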
sparse_matmul/compute/matmul_fixed_avx2.cc ADDED
@@ -0,0 +1,235 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/matmul_fixed_avx2.h"
16
+
17
+ #include <cstdint>
18
+
19
+ #if defined __AVX__
20
+ #include <immintrin.h>
21
+ #endif
22
+
23
+ #include "sparse_matmul/compute/matmul.h"
24
+
25
+ namespace csrblocksparse {
26
+ namespace detail {
27
+
28
+ static const int32_t kint32min = static_cast<int32_t>(~0x7FFFFFFF);
29
+ static const int32_t kint32max = static_cast<int32_t>(0x7FFFFFFF);
30
+
31
+ #if defined __AVX2__
32
+ // In-line function computes and returns the result of one row (of blocks) as
33
+ // 4x int32_t. |weights_ptr| is a non-const reference so it can easily be
34
+ // interpreted as belonging to the caller.
35
+ inline __m256i ComputeRowResults(const __m128i& bias128, const int16_t* rhs,
36
+ const int16_t* rhs_indices, int nnz,
37
+ int16_t const*& weights_ptr) {
38
+ // Expand bias to 64 bits in a 256 bit register [0 z 1 z 2 z 3 z], where z is
39
+ // Zero and 0-3 are the 4x32 bit bias values.
40
+ __m256i sum = _mm256_cvtepu32_epi64(bias128);
41
+
42
+ for (int c = 0; c < nnz; ++c) {
43
+ int rhs_index = rhs_indices[c];
44
+ // Load all 16 weights.
45
+ __m256i weights =
46
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
47
+ // Get the 4x int16_t into the bottom of |rhs_64|.
48
+ __m128i rhs_64 = _mm_loadl_epi64(
49
+ reinterpret_cast<__m128i const*>(rhs + rhs_index * kBlockSize));
50
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
51
+ // [0123 0123 0123 0123].
52
+ __m256i rhs_value = _mm256_broadcastq_epi64(rhs_64);
53
+ weights_ptr += 16;
54
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(weights, rhs_value));
55
+ }
56
+ // Horizontally add the results. We have 1 register that contains results
57
+ // [0 0 1 1 2 2 3 3], but hadd (and almost no other AVX instruction) will not
58
+ // cross lanes, so we end up with [0 1 0 1 2 3 2 3]
59
+ sum = _mm256_hadd_epi32(sum, sum);
60
+ // Permutes the middle two pairs to get the answers together.
61
+ return _mm256_permute4x64_epi64(sum, 0xd8);
62
+ }
63
+
64
+ // Template that allows any fixed combination of OutType and replicas, plus
65
+ // variable |relu|, |shift_out|. Note that |kReplicas| is a template arg as
66
+ // well as a function arg so we can hard-code a limited amount of unrolling.
67
+ template <typename OutType, int kReplicas>
68
+ void MatVec4x4FixedAVX2Template(const int16_t* weights_ptr, const int16_t* rhs,
69
+ const int32_t* bias, const int32_t* nnz_per_row,
70
+ const int16_t* rhs_indices, int start_row,
71
+ int end_row, bool relu, int shift_out,
72
+ int replicas, int stride, OutType* output) {
73
+ int rounding_addon = shift_out > 0 ? (1 << (shift_out - 1)) : 0;
74
+ __m256i rounding = _mm256_set1_epi32(rounding_addon);
75
+ __m256i zero = relu ? _mm256_setzero_si256() : _mm256_set1_epi32(kint32min);
76
+ for (int row_block = start_row; row_block < end_row; ++row_block) {
77
+ // Load 4 biases [0 1 2 3].
78
+ __m128i bias128 = _mm_load_si128(reinterpret_cast<__m128i const*>(bias));
79
+ bias += kBlockSize;
80
+ int nnz = nnz_per_row[row_block];
81
+ __m256i sum =
82
+ ComputeRowResults(bias128, rhs, rhs_indices, nnz, weights_ptr);
83
+ rhs_indices += nnz;
84
+ // Shift right with rounding to get the right number of mantissa bits.
85
+ sum = _mm256_add_epi32(sum, rounding);
86
+ sum = _mm256_srai_epi32(sum, shift_out);
87
+ // Now sum contains [res0, res1, res2, res3, res0, res1, res2, res3]
88
+ sum = _mm256_max_epi32(sum, zero);
89
+ if (sizeof(OutType) == 2) {
90
+ // Clip to 16 bit range (with saturation) and pack in the bottom 64
91
+ // bits. The 64 bit result is replicated across the whole 256 bit
92
+ // register. [0123 0123 0123 0123]
93
+ sum = _mm256_packs_epi32(sum, sum);
94
+ int64_t result = _mm256_extract_epi64(sum, 0);
95
+ *reinterpret_cast<int64_t*>(output) = result;
96
+ if (kReplicas > 1) {
97
+ *reinterpret_cast<int64_t*>(output + stride) = result;
98
+ if (kReplicas > 2) {
99
+ for (int r = 2; r < replicas; ++r) {
100
+ *reinterpret_cast<int64_t*>(output + r * stride) = result;
101
+ }
102
+ }
103
+ }
104
+ } else {
105
+ // Save the lower 128 bits (4x int32_t).
106
+ __m128i result = _mm256_extractf128_si256(sum, 0);
107
+ _mm_store_si128(reinterpret_cast<__m128i*>(output), result);
108
+ if (kReplicas > 1) {
109
+ _mm_store_si128(reinterpret_cast<__m128i*>(output + stride), result);
110
+ if (kReplicas > 2) {
111
+ for (int r = 2; r < replicas; ++r) {
112
+ _mm_store_si128(reinterpret_cast<__m128i*>(output + r * stride),
113
+ result);
114
+ }
115
+ }
116
+ }
117
+ }
118
+ output += kBlockSize;
119
+ }
120
+ }
121
+
122
+ // Version that covers all possible combinations of the variable conditions:
123
+ // |relu|, |shift_out|, |replicas|, with int16_t |output|.
124
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
125
+ const int32_t* bias, const int32_t* nnz_per_row,
126
+ const int16_t* rhs_indices, int start_row, int end_row,
127
+ bool relu, int shift_out, int replicas, int stride,
128
+ int16_t* output) {
129
+ if (replicas <= 1) {
130
+ MatVec4x4FixedAVX2Template<int16_t, 1>(weights_ptr, rhs, bias, nnz_per_row,
131
+ rhs_indices, start_row, end_row,
132
+ relu, shift_out, 1, stride, output);
133
+ } else if (replicas == 2) {
134
+ MatVec4x4FixedAVX2Template<int16_t, 2>(weights_ptr, rhs, bias, nnz_per_row,
135
+ rhs_indices, start_row, end_row,
136
+ relu, shift_out, 2, stride, output);
137
+ } else {
138
+ MatVec4x4FixedAVX2Template<int16_t, 3>(
139
+ weights_ptr, rhs, bias, nnz_per_row, rhs_indices, start_row, end_row,
140
+ relu, shift_out, replicas, stride, output);
141
+ }
142
+ }
143
+
144
+ // Version that covers all possible combinations of the variable conditions:
145
+ // |relu|, |shift_out|, |replicas|, with int32_t |output|.
146
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
147
+ const int32_t* bias, const int32_t* nnz_per_row,
148
+ const int16_t* rhs_indices, int start_row, int end_row,
149
+ bool relu, int shift_out, int replicas, int stride,
150
+ int32_t* output) {
151
+ if (replicas <= 1) {
152
+ MatVec4x4FixedAVX2Template<int32_t, 1>(weights_ptr, rhs, bias, nnz_per_row,
153
+ rhs_indices, start_row, end_row,
154
+ relu, shift_out, 1, stride, output);
155
+ } else if (replicas == 2) {
156
+ MatVec4x4FixedAVX2Template<int32_t, 2>(weights_ptr, rhs, bias, nnz_per_row,
157
+ rhs_indices, start_row, end_row,
158
+ relu, shift_out, 2, stride, output);
159
+ } else {
160
+ MatVec4x4FixedAVX2Template<int32_t, 3>(
161
+ weights_ptr, rhs, bias, nnz_per_row, rhs_indices, start_row, end_row,
162
+ relu, shift_out, replicas, stride, output);
163
+ }
164
+ }
165
+
166
+ // In-line function computes and returns the result of one row (of blocks) as
167
+ // 8x int32_t. weights_ptr is a non-const reference so it can easily be
168
+ // interpreted as belonging to the caller.
169
+ inline __m256i Compute8RowResults(const __m256i& bias256, const int16_t* rhs,
170
+ const int16_t* rhs_indices, int nnz,
171
+ int16_t const*& weights_ptr) {
172
+ // Expand bias to 64 bits in a 256 bit register [0 z 1 z 2 z 3 z], where z is
173
+ // Zero and 0-3 are the 4x32 bit bias values from 128 bit half of the input.
174
+ __m256i sum1 = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(bias256));
175
+ // Plus 4 more in another sum register from the upper 128 bit half.
176
+ __m256i sum2 = _mm256_cvtepu32_epi64(_mm256_extractf128_si256(bias256, 1));
177
+
178
+ for (int c = 0; c < nnz; ++c) {
179
+ int rhs_index = rhs_indices[c];
180
+ // Load all 16 weights.
181
+ __m256i weights =
182
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
183
+ // Get the 4x int16_t into the bottom of |rhs_64|.
184
+ __m128i rhs_64 = _mm_loadl_epi64(
185
+ reinterpret_cast<__m128i const*>(rhs + rhs_index * kBlockSize));
186
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
187
+ // [0123 0123 0123 0123].
188
+ __m256i rhs_value = _mm256_broadcastq_epi64(rhs_64);
189
+ weights_ptr += 16;
190
+ sum1 = _mm256_add_epi32(sum1, _mm256_madd_epi16(weights, rhs_value));
191
+ // Same again for the other 4 results, re-using the same rhs value.
192
+ weights = _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
193
+ weights_ptr += 16;
194
+ sum2 = _mm256_add_epi32(sum2, _mm256_madd_epi16(weights, rhs_value));
195
+ }
196
+ // Horizontally add the results. We have 2 registers that contain results
197
+ // [0 0 1 1 2 2 3 3], and [4 4 5 5 6 6 7 7] but hadd (and almost no other AVX
198
+ // instruction) will not cross lanes, so we end up with [0 1 4 5 2 3 6 7]
199
+ sum1 = _mm256_hadd_epi32(sum1, sum2);
200
+ // Permutes the middle two pairs to get the answers in the right order.
201
+ return _mm256_permute4x64_epi64(sum1, 0xd8);
202
+ }
203
+
204
+ // Version that covers the main conditions used with 8x4:
205
+ // |relu|, |shift_out|, with int32_t |output|.
206
+ void MatVec8x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
207
+ const int32_t* bias, const int32_t* nnz_per_row,
208
+ const int16_t* rhs_indices, int start_row, int end_row,
209
+ bool relu, int shift_out, int32_t* output) {
210
+ int rounding_addon = shift_out > 0 ? (1 << (shift_out - 1)) : 0;
211
+ __m256i rounding = _mm256_set1_epi32(rounding_addon);
212
+ __m256i zero = relu ? _mm256_setzero_si256() : _mm256_set1_epi32(kint32min);
213
+ for (int row_block = start_row; row_block < end_row; ++row_block) {
214
+ // Load 8 biases [0 1 2 3 4 5 6 7].
215
+ __m256i bias256 = _mm256_load_si256(reinterpret_cast<__m256i const*>(bias));
216
+ bias += kBlockSize * 2;
217
+ int nnz = nnz_per_row[row_block];
218
+ __m256i sum =
219
+ Compute8RowResults(bias256, rhs, rhs_indices, nnz, weights_ptr);
220
+ rhs_indices += nnz;
221
+ // Shift right with rounding to get the right number of mantissa bits.
222
+ sum = _mm256_add_epi32(sum, rounding);
223
+ sum = _mm256_srai_epi32(sum, shift_out);
224
+ // Now sum contains [res0, res1, res2, res3, res4, res5, res6, res7].
225
+ sum = _mm256_max_epi32(sum, zero);
226
+ // Save all 256 bits (8x int32_t).
227
+ _mm256_store_si256(reinterpret_cast<__m256i*>(output), sum);
228
+ output += kBlockSize * 2;
229
+ }
230
+ }
231
+
232
+ #endif
233
+
234
+ } // namespace detail
235
+ } // namespace csrblocksparse
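
The rounding shift used by both AVX2 kernels above is the lane-wise form of a simple scalar identity; a minimal scalar sketch for one accumulator:

  #include <cstdint>

  // Drops |shift_out| mantissa bits with round-to-nearest, matching the
  // _mm256_add_epi32(sum, rounding) + _mm256_srai_epi32(sum, shift_out) pair.
  inline int32_t ShiftRightRounding(int32_t acc, int shift_out) {
    const int32_t rounding_addon = shift_out > 0 ? (1 << (shift_out - 1)) : 0;
    return (acc + rounding_addon) >> shift_out;
  }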
sparse_matmul/compute/matmul_fixed_avx2.h ADDED
@@ -0,0 +1,49 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_FIXED_AVX2_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_FIXED_AVX2_H_
19
+
20
+ #include <cstdint>
21
+
22
+ namespace csrblocksparse {
23
+ namespace detail {
24
+
25
+ // Version that covers all possible combinations of the variable conditions:
26
+ // |relu|, |shift_out|, |replicas|, with int16 output.
27
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
28
+ const int32_t* bias, const int32_t* nnz_per_row,
29
+ const int16_t* rhs_indices, int start_row, int end_row,
30
+ bool relu, int shift_out, int replicas, int stride,
31
+ int16_t* output);
32
+ // Version that covers all possible combinations of the variable conditions:
33
+ // |relu|, |shift_out|, |replicas|, with int32 output.
34
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
35
+ const int32_t* bias, const int32_t* nnz_per_row,
36
+ const int16_t* rhs_indices, int start_row, int end_row,
37
+ bool relu, int shift_out, int replicas, int stride,
38
+ int32_t* output);
39
+ // Version that covers the main conditions used with 8x4:
40
+ // |relu|, |shift_out|, with int32 output.
41
+ void MatVec8x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
42
+ const int32_t* bias, const int32_t* nnz_per_row,
43
+ const int16_t* rhs_indices, int start_row, int end_row,
44
+ bool relu, int shift_out, int32_t* output);
45
+
46
+ } // namespace detail
47
+ } // namespace csrblocksparse
48
+
49
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_FIXED_AVX2_H_
sparse_matmul/compute/matmul_generic.cc ADDED
@@ -0,0 +1,122 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/matmul_generic.h"
16
+
17
+ #include <cstdint>
18
+ #include <vector>
19
+
20
+ #include "sparse_matmul/compute/matmul.h"
21
+
22
+ namespace csrblocksparse {
23
+ namespace detail {
24
+
25
+ void MatVecFloatGeneric(const float* weights, const float* rhs,
26
+ const float* bias, const int32_t* nnz_per_row,
27
+ const int16_t* rhs_indices, int start_row, int end_row,
28
+ int block_height, int block_width, bool relu,
29
+ int replicas, int stride, float* output) {
30
+ int weight_index = 0;
31
+ int bias_index = 0;
32
+ std::vector<float> accumulators(block_height);
33
+ for (int row_block = start_row; row_block < end_row;
34
+ ++row_block, output += block_height) {
35
+ int nnz = nnz_per_row[row_block];
36
+ // Biases are now stored and used directly without pre-division.
37
+ for (int i = 0; i < block_height; ++i) accumulators[i] = bias[bias_index++];
38
+
39
+ for (int c = 0; c < nnz; ++c) {
40
+ int rhs_index = rhs_indices[c];
41
+ const float* block_rhs = rhs + rhs_index * block_width;
42
+ // Multiply this |block_height| x |block_width| block.
43
+ for (int i = 0; i < block_height; ++i) {
44
+ for (int j = 0; j < block_width; ++j) {
45
+ accumulators[i] += weights[weight_index++] * block_rhs[j];
46
+ }
47
+ }
48
+ }
49
+ rhs_indices += nnz;
50
+ // Apply relu if desired.
51
+ if (relu) {
52
+ for (int i = 0; i < block_height; ++i) {
53
+ if (accumulators[i] < 0) accumulators[i] = 0;
54
+ }
55
+ }
56
+ for (int r = 0; r < replicas; ++r) {
57
+ for (int i = 0; i < block_height; ++i) {
58
+ output[i + r * stride] = accumulators[i];
59
+ }
60
+ }
61
+ }
62
+ }
63
+
64
+ void MatVecFixedGeneric(const int16_t* weights, const int16_t* rhs,
65
+ const int32_t* bias, const int32_t* nnz_per_row,
66
+ const int16_t* rhs_indices, int start_row, int end_row,
67
+ int block_height, int block_width, bool relu,
68
+ int bytes_out, int shift_out, int replicas, int stride,
69
+ void* output) {
70
+ int weight_index = 0;
71
+ int bias_index = 0;
72
+ std::vector<int32_t> accumulators(block_height);
73
+ for (int row_block = start_row; row_block < end_row; ++row_block) {
74
+ int nnz = nnz_per_row[row_block];
75
+ // Biases are now stored and used directly without pre-division.
76
+ for (int i = 0; i < block_height; ++i) accumulators[i] = bias[bias_index++];
77
+
78
+ for (int c = 0; c < nnz; ++c) {
79
+ int rhs_index = rhs_indices[c];
80
+ const int16_t* block_rhs = rhs + rhs_index * block_width;
81
+ // Multiply this |block_height| x |block_width| block.
82
+ for (int i = 0; i < block_height; ++i) {
83
+ for (int j = 0; j < block_width; ++j) {
84
+ accumulators[i] += weights[weight_index++] * block_rhs[j];
85
+ }
86
+ }
87
+ }
88
+ rhs_indices += nnz;
89
+ // Apply relu if desired.
90
+ if (relu) {
91
+ for (int i = 0; i < block_height; ++i) {
92
+ if (accumulators[i] < 0) accumulators[i] = 0;
93
+ }
94
+ }
95
+ // Output shift.
96
+ if (shift_out > 0) {
97
+ for (int i = 0; i < block_height; ++i) {
98
+ accumulators[i] >>= shift_out;
99
+ }
100
+ }
101
+ if (bytes_out == 2) {
102
+ int16_t* out16 = reinterpret_cast<int16_t*>(output);
103
+ output = out16 + block_height;
104
+ for (int r = 0; r < replicas; ++r, out16 += stride) {
105
+ for (int i = 0; i < block_height; ++i) {
106
+ out16[i] = accumulators[i];
107
+ }
108
+ }
109
+ } else {
110
+ int32_t* out32 = reinterpret_cast<int32_t*>(output);
111
+ output = out32 + block_height;
112
+ for (int r = 0; r < replicas; ++r, out32 += stride) {
113
+ for (int i = 0; i < block_height; ++i) {
114
+ out32[i] = accumulators[i];
115
+ }
116
+ }
117
+ }
118
+ }
119
+ }
120
+
121
+ } // namespace detail
122
+ } // namespace csrblocksparse
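
For reference, a minimal sketch of calling the generic float kernel directly for a single 4x4 block (in normal use the layer/matrix classes set these arguments up); the include path is taken from this commit, everything else is an illustrative assumption:

  #include <cstdint>
  #include <cstdio>

  #include "sparse_matmul/compute/matmul_generic.h"

  int main() {
    // One non-zero 4x4 block in the only block-row; weights are stored
    // row-major within the block. An identity block makes output = bias + rhs.
    float weights[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
    float rhs[4] = {1.f, 2.f, 3.f, 4.f};
    float bias[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    int32_t nnz_per_row[1] = {1};  // One block in block-row 0.
    int16_t rhs_indices[1] = {0};  // That block reads rhs block 0.
    float output[4];
    csrblocksparse::detail::MatVecFloatGeneric(
        weights, rhs, bias, nnz_per_row, rhs_indices,
        /*start_row=*/0, /*end_row=*/1, /*block_height=*/4, /*block_width=*/4,
        /*relu=*/false, /*replicas=*/1, /*stride=*/0, output);
    for (int i = 0; i < 4; ++i) std::printf("%g\n", output[i]);  // 1.5 2.5 3.5 4.5
    return 0;
  }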
sparse_matmul/compute/matmul_generic.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_GENERIC_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_GENERIC_H_
19
+
20
+ #include <cstdint>
21
+
22
+ namespace csrblocksparse {
23
+ namespace detail {
24
+
25
+ // Generic version uses plain C++ code.
26
+ void MatVecFloatGeneric(const float* weights, const float* rhs,
27
+ const float* bias, const int32_t* nnz_per_row,
28
+ const int16_t* rhs_indices, int start_row, int end_row,
29
+ int block_height, int block_width, bool relu,
30
+ int replicas, int stride, float* output);
31
+ void MatVecFixedGeneric(const int16_t* weights, const int16_t* rhs,
32
+ const int32_t* bias, const int32_t* nnz_per_row,
33
+ const int16_t* rhs_indices, int start_row, int end_row,
34
+ int block_height, int block_width, bool relu,
35
+ int bytes_out, int shift_out, int replicas, int stride,
36
+ void* output);
37
+
38
+ } // namespace detail
39
+ } // namespace csrblocksparse
40
+
41
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_GENERIC_H_
sparse_matmul/compute/thread_bounds.cc ADDED
@@ -0,0 +1,106 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/thread_bounds.h"
16
+
17
+ #include <vector>
18
+
19
+ #include "glog/logging.h"
20
+
21
+ namespace csrblocksparse {
22
+
23
+ void ThreadBounds::PrepareForThreads(int block_width, int block_height,
24
+ int num_threads,
25
+ int reduced_rows_per_cache_row,
26
+ int reduced_rows, const int* nnz_per_row) {
27
+ CHECK_GT(num_threads, 0);
28
+ block_width_ = block_width;
29
+ block_height_ = block_height;
30
+ ComputeThreadSplitPoints(num_threads, reduced_rows_per_cache_row,
31
+ reduced_rows, nnz_per_row);
32
+ weight_starts_.clear();
33
+ rhs_indices_starts_.clear();
34
+ bias_starts_.clear();
35
+ weight_starts_.reserve(row_starts_.size());
36
+ rhs_indices_starts_.reserve(row_starts_.size());
37
+ bias_starts_.reserve(row_starts_.size());
38
+
39
+ // Compute the start indices of each of the types, given what we know about
40
+ // padding, and number of |nnz_per_row|.
41
+ int weight_index = 0;
42
+ int rhs_indices_index = 0;
43
+ int bias_index = 0;
44
+ int row = 0;
45
+ for (int start : row_starts_) {
46
+ while (row < start) {
47
+ weight_index += nnz_per_row[row] * block_width_ * block_height_;
48
+ rhs_indices_index += nnz_per_row[row];
49
+ bias_index += block_height_;
50
+ ++row;
51
+ }
52
+ weight_starts_.push_back(weight_index);
53
+ rhs_indices_starts_.push_back(rhs_indices_index);
54
+ bias_starts_.push_back(bias_index);
55
+ }
56
+ }
57
+
58
+ // Computes the block row (reduced) index of the start of each thread.
59
+ void ThreadBounds::ComputeThreadSplitPoints(int num_threads,
60
+ int reduced_rows_per_cache_row,
61
+ int reduced_rows,
62
+ const int* nnz_per_row) {
63
+ row_starts_.assign(/*n=*/1, /*val=*/0);
64
+ // Break the rule if the matrix is too small to allow one per thread, which
65
+ // occurs only during tests.
66
+ if (reduced_rows_per_cache_row * num_threads > reduced_rows)
67
+ reduced_rows_per_cache_row = std::max(reduced_rows / num_threads, 1);
68
+ int cache_rows = (reduced_rows + reduced_rows_per_cache_row - 1) /
69
+ reduced_rows_per_cache_row;
70
+
71
+ // Compute exclusive prefix sum of the amount of work per row.
72
+ std::vector<int> work_upto_row(cache_rows + 1, 0);
73
+ int extra_row_work = 2 * reduced_rows_per_cache_row;
74
+ for (int i = 0; i < cache_rows; ++i) {
75
+ int new_nnz = 0;
76
+ for (int j = 0; j < reduced_rows_per_cache_row; ++j) {
77
+ // if |reduced_rows_per_cache_row| isn't an exact multiple of the
78
+ // matrix size, then we need to be careful here.
79
+ int index = i * reduced_rows_per_cache_row + j;
80
+ if (index < reduced_rows) new_nnz += nnz_per_row[index];
81
+ }
82
+ work_upto_row[i + 1] = new_nnz + extra_row_work + work_upto_row[i];
83
+ }
84
+ int total_work = work_upto_row.back();
85
+ // Find the split points based on assigning an approximately equal amount
86
+ // of work to each thread.
87
+ int prev_split = 0;
88
+ for (int i = 1; i <= num_threads; ++i) {
89
+ int split = std::distance(
90
+ work_upto_row.begin(),
91
+ std::lower_bound(work_upto_row.begin(), work_upto_row.end(),
92
+ i * total_work / num_threads));
93
+ int split_row = split * reduced_rows_per_cache_row;
94
+ if (i == num_threads) {
95
+ split_row = reduced_rows;
96
+ }
97
+
98
+ VLOG(2) << "tid=" << i - 1 << " num rows=" << split_row - row_starts_.back()
99
+ << " work=" << work_upto_row[split] - work_upto_row[prev_split];
100
+ row_starts_.push_back(split_row);
101
+ prev_split = split;
102
+ }
103
+ VLOG(2) << "total rows=" << reduced_rows << " total work=" << total_work;
104
+ }
105
+
106
+ } // namespace csrblocksparse
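
ComputeThreadSplitPoints is a standard prefix-sum plus lower_bound partition; a toy standalone sketch of the same idea (hypothetical names, simplified to ignore cache rows and the extra per-row work term):

  #include <algorithm>
  #include <cstddef>
  #include <cstdio>
  #include <vector>

  // Splits |work| into |num_threads| contiguous ranges of roughly equal total
  // work and returns the start index of each range plus a final end index.
  std::vector<int> SplitByWork(const std::vector<int>& work, int num_threads) {
    std::vector<int> prefix(work.size() + 1, 0);
    for (std::size_t i = 0; i < work.size(); ++i) prefix[i + 1] = prefix[i] + work[i];
    const int total = prefix.back();
    std::vector<int> starts = {0};
    for (int t = 1; t <= num_threads; ++t) {
      auto it = std::lower_bound(prefix.begin(), prefix.end(),
                                 t * total / num_threads);
      starts.push_back(static_cast<int>(std::distance(prefix.begin(), it)));
    }
    starts.back() = static_cast<int>(work.size());  // Last thread takes the tail.
    return starts;
  }

  int main() {
    for (int s : SplitByWork({2, 2, 2, 2, 2, 2}, 2)) std::printf("%d ", s);  // 0 3 6
  }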
sparse_matmul/compute/thread_bounds.h ADDED
@@ -0,0 +1,74 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_THREAD_BOUNDS_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_THREAD_BOUNDS_H_
19
+
20
+ #include <vector>
21
+
22
+ namespace csrblocksparse {
23
+
24
+ // Class to compute and store the bounds of each thread used in a computation,
25
+ // and to provide corresponding spans of vectors.
26
+ class ThreadBounds {
27
+ public:
28
+ ThreadBounds() : block_width_(0), block_height_(0) {}
29
+
30
+ void PrepareForThreads(int block_width, int block_height, int num_threads,
31
+ int reduced_rows_per_cache_row, int reduced_rows,
32
+ const int* nnz_per_row);
33
+
34
+ // Functions that offset the appropriate type to the start of the data
35
+ // needed by the given thread id (|tid|).
36
+ template <typename WeightType>
37
+ const WeightType* OffsetWeights(const WeightType* weights, int tid) const {
38
+ return weights + weight_starts_[tid];
39
+ }
40
+ template <typename RhsIndType>
41
+ const RhsIndType* OffsetRhsIndices(const RhsIndType* rhs_indices,
42
+ int tid) const {
43
+ return rhs_indices + rhs_indices_starts_[tid];
44
+ }
45
+ template <typename BiasType>
46
+ const BiasType* OffsetBias(const BiasType* bias, int tid) const {
47
+ return bias + bias_starts_[tid];
48
+ }
49
+ template <typename OutType>
50
+ OutType* OffsetOutput(OutType* output, int tid) const {
51
+ return output + block_height_ * row_starts_[tid];
52
+ }
53
+ int StartRow(int tid) const { return row_starts_[tid]; }
54
+ const std::vector<int>& row_starts() const { return row_starts_; }
55
+
56
+ private:
57
+ // Computes the block row (reduced) index of the start of each thread.
58
+ void ComputeThreadSplitPoints(int num_threads, int reduced_rows_per_cache_row,
59
+ int reduced_rows, const int* nnz_per_row);
60
+
61
+ // Sizes of a sparse block.
62
+ int block_width_;
63
+ int block_height_;
64
+ // Start indices of each data type by thread-id with an extra value at the
65
+ // end.
66
+ std::vector<int> row_starts_;
67
+ std::vector<int> weight_starts_;
68
+ std::vector<int> rhs_indices_starts_;
69
+ std::vector<int> bias_starts_;
70
+ };
71
+
72
+ } // namespace csrblocksparse
73
+
74
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_THREAD_BOUNDS_H_
sparse_matmul/layers/BUILD ADDED
@@ -0,0 +1,146 @@
1
+ # Sparse/Masked Matrix and Layer.
2
+
3
+ # [internal] load android_library_selector
4
+ # [internal] load android_cc_test:def.bzl
5
+
6
+ licenses(["notice"])
7
+
8
+ cc_library(
9
+ name = "layer",
10
+ hdrs = [
11
+ "sparse_linear_layer.h",
12
+ ],
13
+ visibility = [
14
+ "//sparse_matmul:__subpackages__",
15
+ ],
16
+ deps = [
17
+ ":matrix",
18
+ "//sparse_matmul/numerics:types",
19
+ "//sparse_matmul/os:coop_threads",
20
+ "//sparse_matmul/vector:cache_aligned_vector",
21
+ "@com_google_absl//absl/memory",
22
+ "@com_google_absl//absl/strings:str_format",
23
+ "@com_google_glog//:glog",
24
+ ],
25
+ )
26
+
27
+ cc_library(
28
+ name = "matrix",
29
+ hdrs = [
30
+ "csr_blocksparse_matrix.h",
31
+ "masked_sparse_matrix.h",
32
+ ],
33
+ visibility = [
34
+ "//sparse_matmul:__subpackages__",
35
+ ],
36
+ deps = [
37
+ "//sparse_matmul/compute:kernels",
38
+ "//sparse_matmul/compute:matmul",
39
+ "//sparse_matmul/compute:thread_bounds",
40
+ "//sparse_matmul/numerics:types",
41
+ "//sparse_matmul/os:coop_threads",
42
+ "//sparse_matmul/vector:cache_aligned_vector",
43
+ "@com_google_absl//absl/memory",
44
+ "@com_google_absl//absl/strings:str_format",
45
+ "@com_google_glog//:glog",
46
+ ],
47
+ )
48
+
49
+ cc_library(
50
+ name = "utils",
51
+ srcs = [
52
+ "utils.cc",
53
+ ],
54
+ hdrs = [
55
+ "read_array_ifstream.h",
56
+ "utils.h",
57
+ ],
58
+ visibility = [
59
+ "//sparse_matmul:__subpackages__",
60
+ ],
61
+ deps = [
62
+ ":layer",
63
+ ":matrix",
64
+ ":status",
65
+ "//sparse_matmul/numerics:types",
66
+ "//sparse_matmul/vector:cache_aligned_vector",
67
+ "//sparse_matmul/zlib_wrapper",
68
+ "@com_google_absl//absl/status",
69
+ "@com_google_absl//absl/strings",
70
+ "@com_google_absl//absl/strings:cord",
71
+ "@gulrak_filesystem//:filesystem",
72
+ ],
73
+ )
74
+
75
+ cc_library(
76
+ name = "status",
77
+ srcs = [
78
+ "errno_mapping.cc",
79
+ ],
80
+ hdrs = [
81
+ "errno_mapping.h",
82
+ "status_macros.h",
83
+ ],
84
+ deps = [
85
+ "@com_google_absl//absl/status",
86
+ "@com_google_absl//absl/status:statusor",
87
+ "@com_google_absl//absl/strings",
88
+ "@com_google_absl//absl/strings:cord",
89
+ ],
90
+ )
91
+
92
+ cc_test(
93
+ name = "csrblocksparse_test",
94
+ size = "small",
95
+ srcs = [
96
+ "csrblocksparse_test.cc",
97
+ ],
98
+ data = glob(["testdata/*"]),
99
+ linkopts = select({
100
+ "@bazel_tools//platforms:android": ["-landroid"],
101
+ "//conditions:default": [],
102
+ }),
103
+ shard_count = 10,
104
+ deps = [
105
+ ":status",
106
+ ":utils",
107
+ "//sparse_matmul/compute:matmul",
108
+ "//sparse_matmul/numerics:test_utils",
109
+ "//sparse_matmul/os:coop_threads",
110
+ "@com_google_absl//absl/status",
111
+ "@com_google_absl//absl/strings",
112
+ "@com_google_absl//absl/types:span",
113
+ "@com_google_googletest//:gtest_main",
114
+ "@gulrak_filesystem//:filesystem",
115
+ ],
116
+ )
117
+
118
+ cc_test(
119
+ name = "sparse_linear_layer_test",
120
+ srcs = [
121
+ "sparse_linear_layer_test.cc",
122
+ ],
123
+ deps = [
124
+ ":layer",
125
+ "//sparse_matmul/numerics:test_utils",
126
+ "@com_google_googletest//:gtest_main",
127
+ ],
128
+ )
129
+
130
+ cc_test(
131
+ name = "utils_test",
132
+ srcs = ["utils_test.cc"],
133
+ deps = [
134
+ ":layer",
135
+ ":matrix",
136
+ ":status",
137
+ ":utils",
138
+ "//sparse_matmul/numerics:fast_transcendentals",
139
+ "//sparse_matmul/numerics:test_utils",
140
+ "//sparse_matmul/numerics:types",
141
+ "//sparse_matmul/vector:cache_aligned_vector",
142
+ "@com_google_absl//absl/flags:flag",
143
+ "@com_google_googletest//:gtest_main",
144
+ "@gulrak_filesystem//:filesystem",
145
+ ],
146
+ )
sparse_matmul/layers/csr_blocksparse_matrix.h ADDED
@@ -0,0 +1,835 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
19
+
20
+ #include <algorithm>
21
+ #include <cstdint>
22
+ #include <iostream>
23
+ #include <memory>
24
+ #include <tuple>
25
+ #include <vector>
26
+
27
+ #include "glog/logging.h"
28
+ // IWYU pragma: begin_exports
29
+ #include "sparse_matmul/compute/kernels_generic.h"
30
+ #include "sparse_matmul/compute/matmul.h"
31
+ #include "sparse_matmul/compute/thread_bounds.h"
32
+ #include "sparse_matmul/layers/masked_sparse_matrix.h"
33
+ #include "sparse_matmul/numerics/fixed_types.h"
34
+ #include "sparse_matmul/numerics/float16_types.h"
35
+ #include "sparse_matmul/os/coop_threads.h"
36
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
37
+ // IWYU pragma: end_exports
38
+ #include "absl/memory/memory.h"
39
+
40
+ namespace csrblocksparse {
41
+ // CsrBlockSparseMatrix stores a modified block compressed sparse row
42
+ // representation of a sparse matrix. The ordering of the weights is modified
43
+ // in the 16x1 and 1x1 cases so that a certain number (4 and 8 respectively)
44
+ // of columns of weights are stored contiguously before moving on to the next
45
+ // row. The 4x4 case stores each block contiguously.
46
+ //
47
+ // Currently it is constructed from a MaskedSparseMatrix which uses a dense
48
+ // binary mask representation. The construction generates the compressed
49
+ // representation. Further iterations will support a direct serialization
50
+ // of the compressed representation.
51
+ //
52
+ // MaskedSparseMatrix masked_matrix(rows, cols, existing_mask, existing_values)
53
+ // CsrBlockSparseMatrix matrix(masked_matrix)
54
+ //
55
+ // matrix.SpMV_bias(rhs, bias, &out);
56
+ //
57
+ // This class is thread compatible.
58
+ template <typename WeightType, typename RhsType, typename DeltaType = int16_t>
59
+ class CsrBlockSparseMatrix {
60
+ public:
61
+ CsrBlockSparseMatrix() {}
62
+
63
+ // Reference used to indicate that this is an input and not an output.
64
+ CsrBlockSparseMatrix(const uint8_t* const& buffer, const std::size_t& len) {
65
+ ReadFromFlatBuffer(buffer, len);
66
+ ComputeRHSIndices();
67
+ }
68
+
69
+ template <typename InputType>
70
+ CsrBlockSparseMatrix(const MaskedSparseMatrix<InputType>& masked_matrix) {
71
+ sparsity_ = masked_matrix.sparsity();
72
+ rows_ = masked_matrix.rows();
73
+ cols_ = masked_matrix.cols();
74
+
75
+ DetermineBlockSize(masked_matrix);
76
+
77
+ if (block_width_ == 1 && block_height_ == 1)
78
+ col_multiple_ = 8;
79
+ else
80
+ col_multiple_ = 1;
81
+
82
+ std::vector<InputType> weights(masked_matrix.values().begin(),
83
+ masked_matrix.values().end());
84
+
85
+ reduced_rows_ = (rows_ + block_height_ - 1) / block_height_;
86
+ rows_ = reduced_rows_ * block_height_;
87
+ reduced_cols_ = cols_ / block_width_;
88
+
89
+ // Calculate the reduced CSR representation of the matrix.
90
+ std::vector<int> reduced_mask(reduced_rows_ * reduced_cols_);
91
+ std::vector<int> row_offsets = {0};
92
+ int nnz = 0;
93
+ const auto& mask = masked_matrix.mask();
94
+ for (int r = 0; r < reduced_rows_; ++r) {
95
+ for (int c = 0; c < reduced_cols_; ++c) {
96
+ int mask_val = mask[r * block_height_ * cols_ + c * block_width_];
97
+ reduced_mask[r * reduced_cols_ + c] = mask_val;
98
+ nnz += mask_val;
99
+ }
100
+ row_offsets.push_back(nnz);
101
+ }
102
+
103
+ // Make sure the reduced representation has the correct number of columns.
104
+ MakeColumnsMultiple(row_offsets, &reduced_mask, &weights);
105
+
106
+ std::vector<int> col_indices;
107
+ std::vector<WeightType> weights_csr;
108
+ std::vector<int> nnz_per_row;
109
+ MaskAndWeightsToCsr(reduced_mask, weights, &nnz_per_row, &col_indices,
110
+ &weights_csr);
111
+
112
+ // Generate column deltas from |col_indices|.
113
+ std::vector<DeltaType> col_deltas;
114
+ for (int i = 0; i < col_indices.size(); ++i) {
115
+ // |col_indices| are used to index the RHS vector which is always float.
116
+ int64_t diff = sizeof(RhsType);
117
+ if (i == 0)
118
+ diff *= block_width_ * (col_indices[i]);
119
+ else
120
+ diff *= block_width_ * (col_indices[i] - col_indices[i - 1]);
121
+
122
+ CHECK(diff < std::numeric_limits<DeltaType>::max())
123
+ << "delta between column indices in bytes " << diff
124
+ << " exceeded the maximum size of the DeltaType "
125
+ << std::numeric_limits<DeltaType>::max();
126
+ col_deltas.push_back(static_cast<DeltaType>(diff));
127
+ }
128
+
129
+ // Because of pre-fetching we need some extra values at the end.
130
+ col_deltas.insert(col_deltas.end(), std::max(2, col_multiple_ + 1), 0);
131
+ nnz_per_row.insert(nnz_per_row.end(), 2, nnz_per_row.back());
132
+
133
+ weights_ = CacheAlignedVector<WeightType>(weights_csr);
134
+ col_deltas_ = CacheAlignedVector<DeltaType>(col_deltas);
135
+ nnz_per_row_ = CacheAlignedVector<int>(nnz_per_row);
136
+ ComputeRHSIndices();
137
+
138
+ num_threads_ = 0;
139
+ PrepareForThreads(1);
140
+ }
141
+
142
+ // Constructor makes a matrix from the given weights, deltas and nnz, taking
143
+ // the other parameters from |src_matrix|. |cols| is the number of raw columns
144
+ // (NOT blocks) of the new matrix.
145
+ CsrBlockSparseMatrix(
146
+ const CsrBlockSparseMatrix<WeightType, RhsType, DeltaType>& src_matrix,
147
+ const std::vector<WeightType>& new_weights,
148
+ const std::vector<DeltaType>& new_deltas, const std::vector<int>& new_nnz,
149
+ int cols) {
150
+ num_threads_ = 0;
151
+ col_multiple_ = src_matrix.col_multiple_;
152
+ block_width_ = src_matrix.block_width_;
153
+ block_height_ = src_matrix.block_height_;
154
+ reduced_rows_ = new_nnz.size();
155
+ rows_ = reduced_rows_ * block_height_;
156
+ cols_ = cols;
157
+ reduced_cols_ = cols_ / block_width_;
158
+ weights_ = CacheAlignedVector<WeightType>(new_weights);
159
+ col_deltas_ = CacheAlignedVector<DeltaType>(new_deltas);
160
+ nnz_per_row_ = CacheAlignedVector<int>(new_nnz);
161
+ sparsity_ = 1.0f - static_cast<float>(new_weights.size()) / (rows_ * cols_);
162
+ ComputeRHSIndices();
163
+ name_ = src_matrix.name_;
164
+ PrepareForThreads(1);
165
+ }
166
+
167
+ // Factory method takes a column slice out of *this and returns a sparse
168
+ // matrix that takes as inputs [|start_col|, |end_col|) of *this, and
169
+ // returns the same number of outputs, but only a partial result.
170
+ // If |keep_rhs_size|, then the new matrix takes the same rhs as the current
171
+ // matrix, but uses a subset of it, instead of expecting just the reduced rhs.
172
+ // If |start_col| > |end_col|, then we slice out the complement of the defined
173
+ // interval, ie [0, |end_col|) + [|start_col|, current end).
174
+ // NOTE That |start_col| and |end_col| are in raw column coordinates, NOT
175
+ // block units.
176
+ CsrBlockSparseMatrix SplitByColumn(int start_col, int end_col,
177
+ bool keep_rhs_size = false) const {
178
+ int weight_index = 0;
179
+ int delta_index = 0;
180
+ std::vector<DeltaType> new_deltas;
181
+ std::vector<WeightType> new_weights;
182
+ std::vector<int> new_nnz(reduced_rows_);
183
+ int col = 0;
184
+ int prev_col = keep_rhs_size ? 0 : start_col;
185
+ for (int r = 0; r < reduced_rows_; ++r) {
186
+ int reduced_col_count = nnz_per_row_[r];
187
+ for (int c = 0; c < reduced_col_count; ++c, ++delta_index) {
188
+ col += col_deltas_[delta_index] / sizeof(RhsType);
189
+ if ((start_col < end_col && start_col <= col && col < end_col) ||
190
+ (start_col > end_col && (col < end_col || col >= start_col))) {
191
+ ++new_nnz[r];
192
+ new_deltas.push_back((col - prev_col) * sizeof(RhsType));
193
+ prev_col = col;
194
+ for (int i = 0; i < block_width_ * block_height_;
195
+ ++i, ++weight_index) {
196
+ new_weights.push_back(weights_[weight_index]);
197
+ }
198
+ } else {
199
+ weight_index += block_width_ * block_height_;
200
+ }
201
+ }
202
+ }
203
+ int new_cols = keep_rhs_size ? cols_ : end_col - start_col;
204
+ return CsrBlockSparseMatrix(*this, new_weights, new_deltas, new_nnz,
205
+ new_cols);
206
+ }
207
+
208
+ // Factory method takes a row slice out of *this and returns a sparse
209
+ // matrix that takes the same inputs as *this, and returns the outputs for
210
+ // the range [|start_row|, |end_row|).
211
+ // NOTE That |start_row| and |end_row| are in raw row coordinates, NOT
212
+ // block units.
213
+ CsrBlockSparseMatrix SplitByRow(int start_row, int end_row) const {
214
+ int start_reduced = start_row / block_height_;
215
+ int end_reduced = end_row / block_height_;
216
+ std::vector<int> new_nnz(nnz_per_row_.data() + start_reduced,
217
+ nnz_per_row_.data() + end_reduced);
218
+ int weight_start = 0;
219
+ for (int r = 0; r < start_reduced; ++r) {
220
+ weight_start += nnz_per_row_[r];
221
+ }
222
+ int weight_end = weight_start;
223
+ for (int r = start_reduced; r < end_reduced; ++r) {
224
+ weight_end += nnz_per_row_[r];
225
+ }
226
+ int delta_start = 0;
227
+ for (int i = 0; i < weight_start; ++i) {
228
+ delta_start += col_deltas_[i];
229
+ }
230
+ std::vector<DeltaType> new_deltas(col_deltas_.data() + weight_start,
231
+ col_deltas_.data() + weight_end);
232
+ new_deltas[0] += delta_start;
233
+ int block_size = block_height_ * block_width_;
234
+ std::vector<WeightType> new_weights(
235
+ weights_.data() + weight_start * block_size,
236
+ weights_.data() + weight_end * block_size);
237
+ return CsrBlockSparseMatrix(*this, new_weights, new_deltas, new_nnz, cols_);
238
+ }
239
+
240
+ // Combines adjacent row blocks, doubling the block height.
241
+ // This necessarily involves adding zero weights where the blocks don't align
242
+ // across adjacent pairs of rows, so use with caution, as the resulting matrix
243
+ // is most likely to run slower if very sparse to begin with.
244
+ // In the few cases where the blocks do mostly align, the resulting matmul
245
+ // could be much faster, as the number of reads of the rhs will be halved.
246
+ void DoubleBlockHeight() {
247
+ int new_rows = reduced_rows_ / 2;
248
+ std::vector<int> new_nnz(new_rows);
249
+ std::vector<DeltaType> new_rhs_indices;
250
+ std::vector<WeightType> new_weights;
251
+ int rhs_index1 = 0;
252
+ int rhs_index2 = 0;
253
+ int block_size = block_height_ * block_width_;
254
+ for (int r = 0; r < new_rows; ++r) {
255
+ int start_nnz = new_rhs_indices.size();
256
+ rhs_index2 += nnz_per_row_[r * 2];
257
+ int end1 = rhs_index1 + nnz_per_row_[r * 2];
258
+ int end2 = rhs_index2 + nnz_per_row_[r * 2 + 1];
259
+ // Run over a pair of rows with 2 iterators, combining blocks as we go, or
260
+ // padding with zeros where the block positions don't match.
261
+ while (rhs_index1 < end1 || rhs_index2 < end2) {
262
+ int col1 = rhs_index1 < end1 ? rhs_indices_[rhs_index1] : reduced_cols_;
263
+ int col2 = rhs_index2 < end2 ? rhs_indices_[rhs_index2] : reduced_cols_;
264
+ if (col1 < col2) {
265
+ // Need zero weights for row2 to pad out weights block.
266
+ new_rhs_indices.push_back(col1);
267
+ new_weights.insert(new_weights.end(),
268
+ weights_.data() + rhs_index1 * block_size,
269
+ weights_.data() + (rhs_index1 + 1) * block_size);
270
+ new_weights.insert(new_weights.end(), block_size,
271
+ static_cast<WeightType>(0.0f));
272
+ ++rhs_index1;
273
+ } else if (col1 > col2) {
274
+ // Need zero weights for row1 to pad out weights block.
275
+ new_rhs_indices.push_back(col2);
276
+ new_weights.insert(new_weights.end(), block_size,
277
+ static_cast<WeightType>(0.0f));
278
+ new_weights.insert(new_weights.end(),
279
+ weights_.data() + rhs_index2 * block_size,
280
+ weights_.data() + (rhs_index2 + 1) * block_size);
281
+ ++rhs_index2;
282
+ } else {
283
+ // Combine weights for both row1 and row2.
284
+ new_rhs_indices.push_back(col1);
285
+ new_weights.insert(new_weights.end(),
286
+ weights_.data() + rhs_index1 * block_size,
287
+ weights_.data() + (rhs_index1 + 1) * block_size);
288
+ new_weights.insert(new_weights.end(),
289
+ weights_.data() + rhs_index2 * block_size,
290
+ weights_.data() + (rhs_index2 + 1) * block_size);
291
+ ++rhs_index1;
292
+ ++rhs_index2;
293
+ }
294
+ }
295
+ rhs_index1 = rhs_index2;
296
+ new_nnz[r] = new_rhs_indices.size() - start_nnz;
297
+ }
298
+ block_height_ *= 2;
299
+ reduced_rows_ /= 2;
300
+ weights_ = CacheAlignedVector<WeightType>(new_weights);
301
+ rhs_indices_ = CacheAlignedVector<DeltaType>(new_rhs_indices);
302
+ nnz_per_row_ = CacheAlignedVector<int>(new_nnz);
303
+ sparsity_ = 1.0f - static_cast<float>(new_weights.size()) / (rows_ * cols_);
304
+ ComputeColDeltas();
305
+ if (num_threads_ > 0) {
306
+ int num_threads = num_threads_;
307
+ num_threads_ = 0;
308
+ PrepareForThreads(num_threads);
309
+ }
310
+ }
311
+
312
+ // Allocates memory and fills buffer.
313
+ // Caller is responsible for the memory de-allocation.
314
+ // TODO(b/189958858): Both Read and Write need to eventually handle the
315
+ // different possible HalfType and DeltaType values, but punting for now as
316
+ // there is only one supported combination.
317
+ std::size_t WriteToFlatBuffer(std::string* csr_flatbuffer) {
318
+ std::size_t bytes = 0;
319
+ bytes += FixedParameterSize();
320
+ bytes += weights_.size() * sizeof(WeightType);
321
+ bytes += col_deltas_.size() * sizeof(DeltaType);
322
+ bytes += nnz_per_row_.size() * sizeof(int);
323
+
324
+ uint8_t* bytes_ptr_ptr =
325
+ reinterpret_cast<uint8_t*>(CHECK_NOTNULL(malloc(bytes)));
326
+
327
+ int* int_bytes_ptr = reinterpret_cast<int*>(bytes_ptr_ptr);
328
+
329
+ *int_bytes_ptr++ = rows_;
330
+ *int_bytes_ptr++ = cols_;
331
+ *int_bytes_ptr++ = reduced_rows_;
332
+ *int_bytes_ptr++ = reduced_cols_;
333
+ *int_bytes_ptr++ = block_width_;
334
+ *int_bytes_ptr++ = block_height_;
335
+ *int_bytes_ptr++ = col_multiple_;
336
+ *int_bytes_ptr++ = num_threads_;
337
+ *int_bytes_ptr++ = weights_.size();
338
+ *int_bytes_ptr++ = col_deltas_.size();
339
+ *int_bytes_ptr++ = nnz_per_row_.size();
340
+
341
+ float* float_bytes_ptr = reinterpret_cast<float*>(int_bytes_ptr);
342
+ *float_bytes_ptr++ = sparsity_;
343
+
344
+ uint8_t* bytes_ptr = reinterpret_cast<uint8_t*>(float_bytes_ptr);
345
+
346
+ memcpy(bytes_ptr, weights_.data(), weights_.size() * sizeof(WeightType));
347
+ bytes_ptr += weights_.size() * sizeof(WeightType);
348
+
349
+ memcpy(bytes_ptr, col_deltas_.data(),
350
+ col_deltas_.size() * sizeof(DeltaType));
351
+ bytes_ptr += col_deltas_.size() * sizeof(DeltaType);
352
+
353
+ memcpy(bytes_ptr, nnz_per_row_.data(), nnz_per_row_.size() * sizeof(int));
354
+ bytes_ptr += nnz_per_row_.size() * sizeof(int);
355
+
356
+ csr_flatbuffer->resize(bytes);
357
+ csr_flatbuffer->assign(reinterpret_cast<char*>(bytes_ptr_ptr), bytes);
358
+ free(bytes_ptr_ptr);
359
+
360
+ return bytes;
361
+ }
362
+
363
+ void ReadFromFlatBuffer(const uint8_t* const& bytes, const std::size_t& len) {
364
+ CHECK_GE(len, FixedParameterSize());
365
+
366
+ const int* int_bytes_ptr = reinterpret_cast<const int*>(bytes);
367
+ rows_ = *int_bytes_ptr++;
368
+ cols_ = *int_bytes_ptr++;
369
+ reduced_rows_ = *int_bytes_ptr++;
370
+ reduced_cols_ = *int_bytes_ptr++;
371
+ block_width_ = *int_bytes_ptr++;
372
+ block_height_ = *int_bytes_ptr++;
373
+ col_multiple_ = *int_bytes_ptr++;
374
+ int num_threads = *int_bytes_ptr++;
375
+ int32_t weights_size = *int_bytes_ptr++;
376
+ int32_t col_deltas_size = *int_bytes_ptr++;
377
+ int32_t nnz_per_row_size = *int_bytes_ptr++;
378
+
379
+ // Make sure negative sizes don't mess things up.
380
+ weights_size = std::max(0, weights_size);
381
+ col_deltas_size = std::max(0, col_deltas_size);
382
+ nnz_per_row_size = std::max(0, nnz_per_row_size);
383
+
384
+ const float* float_bytes_ptr =
385
+ reinterpret_cast<const float*>(int_bytes_ptr);
386
+ sparsity_ = *float_bytes_ptr++;
387
+
388
+ std::size_t total_bytes =
389
+ FixedParameterSize() + weights_size * sizeof(WeightType) +
390
+ col_deltas_size * sizeof(DeltaType) + nnz_per_row_size * sizeof(int);
391
+
392
+ CHECK_EQ(total_bytes, len)
393
+ << "total bytes: " << total_bytes << ", actual len given: " << len;
394
+
395
+ const uint8_t* bytes_ptr =
396
+ reinterpret_cast<const uint8_t*>(float_bytes_ptr);
397
+ std::vector<WeightType> weights_raw(weights_size);
398
+ memcpy(weights_raw.data(), bytes_ptr, weights_size * sizeof(WeightType));
399
+ weights_ = CacheAlignedVector<WeightType>(weights_raw);
400
+ bytes_ptr += weights_size * sizeof(WeightType);
401
+
402
+ std::vector<DeltaType> deltas_raw(col_deltas_size);
403
+ memcpy(deltas_raw.data(), bytes_ptr, col_deltas_size * sizeof(DeltaType));
404
+ col_deltas_ = CacheAlignedVector<DeltaType>(deltas_raw);
405
+ bytes_ptr += col_deltas_size * sizeof(DeltaType);
406
+
407
+ std::vector<int> nnz_raw(nnz_per_row_size);
408
+ memcpy(nnz_raw.data(), bytes_ptr, nnz_per_row_size * sizeof(int));
409
+ nnz_per_row_ = CacheAlignedVector<int>(nnz_raw);
410
+ num_threads_ = 0;
411
+ PrepareForThreads(num_threads);
412
+ }
413
+
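
As a usage note, the serialization round trip looks roughly like the following sketch, which mirrors the FlatBufferSerialization test added later in this commit (the template arguments and the raw-bytes constructor are taken from that test; the MaskedSparseMatrix input is assumed to be built elsewhere):

#include <cstddef>
#include <cstdint>
#include <string>

#include "sparse_matmul/layers/csr_blocksparse_matrix.h"

// Serialize a block-sparse matrix and rebuild it from the raw bytes.
void RoundTripSketch(
    const csrblocksparse::MaskedSparseMatrix<float>& masked_matrix) {
  csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float,
                                       int16_t>
      sparse(masked_matrix);
  std::string buffer;
  const std::size_t num_bytes = sparse.WriteToFlatBuffer(&buffer);
  // The (bytes, length) constructor restores the matrix from the buffer.
  csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float,
                                       int16_t>
      restored(reinterpret_cast<const uint8_t*>(buffer.c_str()), num_bytes);
  CHECK_EQ(restored.rows(), sparse.rows());
}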
414
+ // Multiply a Sparse matrix by a possibly dense matrix. Often the matrix is
415
+ // a vector with a small number of columns, hence the term "fat vector".
416
+ // 1x1 and 4x4 have specializations for output columns (i.e. fatness) >= 5,
417
+ // and often achieve twice as many GFlops when multiplying a right hand side
418
+ // that has 5 or more columns. (Best is a multiple of 5).
419
+ // 16x1 doesn't have enough registers and just loops over the width 1 kernel.
420
+ //
421
+ // |rhs| and |out| are COLUMN MAJOR.
422
+
423
+ // Fast Tuples WeightType, BiasType, RhsType, OutType are:
424
+ // (float, float, float, float)
425
+ // (bfloat16, float, float, float)
426
+ // and only on ARM64. All other cases use a slow generic implementation.
427
+ template <typename RhsClass, typename BiasClass, typename OutClass,
428
+ typename BiasType = typename BiasClass::value_type,
429
+ typename OutType = typename OutClass::value_type>
430
+ void SpMM_bias(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
431
+ bool relu = false, int tid = 0,
432
+ SpinBarrier* barrier = nullptr) const {
433
+ static_assert(std::is_same<typename RhsClass::value_type, RhsType>::value,
434
+ "Rhs types must match");
435
+ CHECK_LT(tid, num_threads_);
436
+ CHECK_EQ(rhs.cols(), out->cols());
437
+ CHECK_EQ(rhs.rows(), cols_);
438
+ CHECK_GE(out->rows(), rows_);
439
+ int cols_to_go = out->cols();
440
+ int rhs_index = *thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid);
441
+ const RhsType* rhs_ptr = rhs.data() + rhs_index * block_height_;
442
+ OutType* out_ptr = thread_bounds_.OffsetOutput(out->data(), tid);
443
+ const WeightType* weights_ptr =
444
+ thread_bounds_.OffsetWeights(weights_.data(), tid);
445
+ const DeltaType* delta_ptr =
446
+ thread_bounds_.OffsetRhsIndices(col_deltas_.data(), tid);
447
+ int offset = *delta_ptr / sizeof(RhsType);
448
+ rhs_ptr -= offset;
449
+ const int* nnz_ptr = nnz_per_row_.data() + thread_bounds_.StartRow(tid);
450
+ int assigned_rows =
451
+ thread_bounds_.StartRow(tid + 1) - thread_bounds_.StartRow(tid);
452
+ const BiasType* bias_ptr = thread_bounds_.OffsetBias(bias.data(), tid);
453
+
454
+ while (cols_to_go > 0) {
455
+ if (block_width_ == 4 && block_height_ == 4) {
456
+ if (cols_to_go >= 5) {
457
+ detail::SpMM5_4x4<WeightType, RhsType, OutType>(
458
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
459
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
460
+ } else {
461
+ detail::SpMV_4x4<WeightType, RhsType, OutType>(
462
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
463
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
464
+ }
465
+ } else {
466
+ if (cols_to_go >= 5) {
467
+ detail::SpMM5_1x1<WeightType, RhsType, OutType>(
468
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
469
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
470
+ } else {
471
+ detail::SpMV_1x1<WeightType, RhsType, OutType>(
472
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
473
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
474
+ }
475
+ }
476
+
477
+ if (cols_to_go >= 5) {
478
+ cols_to_go -= 5;
479
+ rhs_ptr += rhs.col_stride() * 5;
480
+ out_ptr += out->col_stride() * 5;
481
+ } else {
482
+ cols_to_go--;
483
+ rhs_ptr += rhs.col_stride();
484
+ out_ptr += out->col_stride();
485
+ }
486
+ if (barrier) barrier->barrier();
487
+ }
488
+ }
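
A minimal single-threaded calling sketch for SpMM_bias (vector types and fill helpers as used by the unit tests in this commit; the MaskedSparseMatrix input is assumed to be built elsewhere):

// Computes out = sparse * rhs + bias on the calling thread.
void SpMMBiasSketch(const csrblocksparse::MaskedSparseMatrix<float>& masked) {
  csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float>
      sparse(masked);
  csrblocksparse::CacheAlignedVector<float> rhs(sparse.cols());
  csrblocksparse::CacheAlignedVector<float> bias(sparse.rows());
  csrblocksparse::CacheAlignedVector<float> out(sparse.rows());
  rhs.FillOnes();
  bias.FillZero();
  out.FillZero();
  // Default arguments: no relu, tid 0, no barrier.
  sparse.SpMM_bias(rhs, bias, &out);
}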
489
+ template <typename MVRhsType, typename MVBiasType, typename OutType>
490
+ void MatVec(const MVRhsType* rhs, const MVBiasType* bias, bool relu, int tid,
491
+ int replicas, int output_stride, OutType* output) {
492
+ CHECK_LT(tid, num_threads_);
493
+ CHECK_EQ(block_width_, 4) << "Block width must be 4!";
494
+ if (block_height_ == 8) {
495
+ matmul_.MatVec8x4(
496
+ thread_bounds_.OffsetWeights(weights_.cast_data(), tid), rhs,
497
+ thread_bounds_.OffsetBias(bias, tid), nnz_per_row_.data(),
498
+ thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid),
499
+ thread_bounds_.StartRow(tid), thread_bounds_.StartRow(tid + 1), relu,
500
+ replicas, output_stride, thread_bounds_.OffsetOutput(output, tid));
501
+ } else {
502
+ CHECK_EQ(block_height_, 4) << "Block height must be 4 or 8!";
503
+ matmul_.MatVec4x4(
504
+ thread_bounds_.OffsetWeights(weights_.cast_data(), tid), rhs,
505
+ thread_bounds_.OffsetBias(bias, tid), nnz_per_row_.data(),
506
+ thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid),
507
+ thread_bounds_.StartRow(tid), thread_bounds_.StartRow(tid + 1), relu,
508
+ replicas, output_stride, thread_bounds_.OffsetOutput(output, tid));
509
+ }
510
+ }
511
+
512
+ int rows() const { return rows_; }
513
+ int cols() const { return cols_; }
514
+ int block_height() const { return block_height_; }
515
+ int block_width() const { return block_width_; }
516
+ float sparsity() const { return sparsity_; }
517
+ int num_threads() const { return num_threads_; }
518
+ const ThreadBounds& thread_bounds() const { return thread_bounds_; }
519
+ const CacheAlignedVector<DeltaType>& rhs_indices() const {
520
+ return rhs_indices_;
521
+ }
522
+ const std::string& name() const { return name_; }
523
+ void set_name(const std::string& name) { name_ = name; }
524
+ const std::vector<int>& split_points() const {
525
+ return thread_bounds_.row_starts();
526
+ }
527
+
528
+ std::size_t bytes() const {
529
+ return weights_.size() * sizeof(WeightType) +
530
+ col_deltas_.size() * sizeof(DeltaType) +
531
+ nnz_per_row_.size() * sizeof(int);
532
+ }
533
+
534
+ // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
535
+ // and then samples from the output (softmax distribution) layer.
536
+ template <typename RhsClass, typename BiasClass, typename OutClass,
537
+ typename BiasType = typename BiasClass::value_type,
538
+ typename OutType = typename OutClass::value_type>
539
+ typename std::enable_if<!IsFixed32Type<OutType>::value, int>::type
540
+ SpMM_bias_Sample(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
541
+ float temperature, int tid, SpinBarrier* barrier,
542
+ std::minstd_rand* gen,
543
+ CacheAlignedVector<float>* scratch) const {
544
+ SpMM_bias(rhs, bias, out, /*relu=*/false, tid, barrier);
545
+ return out->Sample(temperature, gen, scratch);
546
+ }
547
+ // Fixed32 version.
548
+ template <typename RhsClass, typename BiasClass, typename OutClass,
549
+ typename BiasType = typename BiasClass::value_type,
550
+ typename OutType = typename OutClass::value_type>
551
+ typename std::enable_if<IsFixed32Type<OutType>::value, int>::type
552
+ SpMM_bias_Sample(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
553
+ float temperature, int tid, SpinBarrier* barrier,
554
+ std::minstd_rand* gen,
555
+ CacheAlignedVector<float>* scratch) const {
556
+ // We don't pass the barrier on, as we have more work to do.
557
+ SpMM_bias(rhs, bias, out, /*relu=*/false, tid);
558
+ return out->ReducingSample(gen, scratch, tid, temperature, barrier);
559
+ }
560
+
561
+ void Print() const {
562
+ std::cout << "Weights\n";
563
+ weights_.Print();
564
+ std::cout << std::endl;
565
+ std::cout << "Deltas\n";
566
+ col_deltas_.Print();
567
+ std::cout << std::endl;
568
+ std::cout << "nnz\n";
569
+ nnz_per_row_.Print();
570
+ std::cout << std::endl;
571
+ }
572
+
573
+ // Splits the computation amongst threads by rows, based on the number of
574
+ // non-zeros plus a constant to account for the work of the bias and the
575
+ // horizontal add at the end. It also guarantees that each
576
+ // thread writes only whole cache lines, based on the size of OutType.
577
+ // The |cache_line_size| arg is used only for testing. Normally it is provided
578
+ // through the architecture #defines.
579
+ // Each thread gets a contiguous row range (|split_points|).
580
+ // Thread t does rows [ split_points[t], split_points[t + 1] )
581
+ // Each thread also needs to know how many non zeros were before it to skip
582
+ // (|nnz_to_skip|). And finally it also needs to know what the offset into
583
+ // the rhs vector would have been at the split point (|rhs_to_skip|).
584
+ //
585
+ // Some tricky corner cases where the number of non-zeros doesn't split
586
+ // nicely amongst the number of requested threads are not handled and default
587
+ // to one thread; these cases are only going to happen in tests and not in
588
+ // the matrices that occur in real models.
589
+ //
590
+ // Returns the maximum number of threads that can be used; <= |num_threads|.
591
+ template <typename OutType = int32_t>
592
+ int PrepareForThreads(int num_threads, int cache_line_size = -1) {
593
+ CHECK_GT(num_threads, 0);
594
+ // We've already prepared for this number of threads; nothing to do.
595
+ if (num_threads == num_threads_) return num_threads_;
596
+
597
+ num_threads_ = num_threads;
598
+ thread_bounds_.PrepareForThreads(
599
+ block_width_, block_height_, num_threads_,
600
+ ReducedRowsPerCacheLine<OutType>(cache_line_size), reduced_rows_,
601
+ nnz_per_row_.data());
602
+ return num_threads_;
603
+ }
604
+
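
For illustration, a sketch of the intended multi-threaded calling pattern (run serially here for brevity; the tests in this commit drive each tid on its own thread via csrblocksparse::LaunchOnThreadsWithBarrier):

// PrepareForThreads() partitions the rows and may return fewer threads than
// requested; each worker then computes only its own row range.
void ThreadedSpMMSketch(
    csrblocksparse::CsrBlockSparseMatrix<float, float>* sparse,
    const csrblocksparse::CacheAlignedVector<float>& rhs,
    const csrblocksparse::CacheAlignedVector<float>& bias,
    csrblocksparse::CacheAlignedVector<float>* out, int requested_threads) {
  const int num_threads = sparse->PrepareForThreads(requested_threads);
  for (int tid = 0; tid < num_threads; ++tid) {
    // In real use each tid runs concurrently on its own thread.
    sparse->SpMM_bias(rhs, bias, out, /*relu=*/false, tid);
  }
}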
605
+ // Computes and stores the |rhs_indices_| from the |col_deltas_|.
606
+ void ComputeRHSIndices() {
607
+ std::vector<int> cumulative_deltas = CumulativeColDeltas();
608
+ std::vector<DeltaType> rhs_indices(cumulative_deltas.size() +
609
+ reduced_rows_);
610
+ int total_indices = 0;
611
+ int delta_index = 0;
612
+ for (int r = 0; r < reduced_rows_; ++r) {
613
+ for (int n = 0; n < nnz_per_row_[r]; ++n, ++delta_index) {
614
+ rhs_indices[total_indices++] =
615
+ cumulative_deltas[delta_index] / block_width_;
616
+ }
617
+ }
618
+ rhs_indices_ = CacheAlignedVector<DeltaType>(rhs_indices);
619
+ }
620
+
621
+ // Computes and stores the |col_deltas_| from the |rhs_indices_|.
622
+ void ComputeColDeltas() {
623
+ std::vector<int> col_deltas(rhs_indices_.size());
624
+ int prev_index = 0;
625
+ for (int i = 0; i < rhs_indices_.size(); ++i) {
626
+ int offset = rhs_indices_[i] - prev_index;
627
+ prev_index = rhs_indices_[i];
628
+ col_deltas[i] = offset * block_width_ * sizeof(RhsType);
629
+ }
630
+ col_deltas_ = CacheAlignedVector<DeltaType>(col_deltas);
631
+ }
632
+
633
+ // Computes and returns the inclusive prefix sum of the deltas, i.e. absolute
634
+ // positions.
635
+ std::vector<int> CumulativeColDeltas() const {
636
+ std::vector<int> cum_col_deltas(col_deltas_.size());
637
+ for (int i = 0; i < col_deltas_.size(); ++i) {
638
+ cum_col_deltas[i] = col_deltas_[i] / sizeof(RhsType);
639
+ if (i > 0) cum_col_deltas[i] += cum_col_deltas[i - 1];
640
+ }
641
+ return cum_col_deltas;
642
+ }
643
+
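
The two encodings are interchangeable, as the RhsIndicesDeltasRoundTrip test later in this commit verifies. A small sketch of the forward direction (hypothetical helper; plain vectors, with RhsType assumed to be float):

#include <cstddef>
#include <vector>

// Converts absolute block-column indices into byte deltas between
// consecutive blocks, mirroring ComputeColDeltas() above.
std::vector<int> IndicesToByteDeltas(const std::vector<int>& rhs_indices,
                                     int block_width) {
  std::vector<int> deltas(rhs_indices.size());
  int prev = 0;
  for (std::size_t i = 0; i < rhs_indices.size(); ++i) {
    deltas[i] = (rhs_indices[i] - prev) * block_width *
                static_cast<int>(sizeof(float));
    prev = rhs_indices[i];
  }
  // e.g. indices {0, 3, 5} with block_width 4 -> deltas {0, 48, 32} bytes.
  return deltas;
}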
644
+ private:
645
+ constexpr std::size_t FixedParameterSize() const {
646
+ return sizeof(int) // rows
647
+ + sizeof(int) // cols
648
+ + sizeof(int) // reduced_rows
649
+ + sizeof(int) // reduced_cols
650
+ + sizeof(int) // block_width
651
+ + sizeof(int) // block_height
652
+ + sizeof(float) // sparsity
653
+ + sizeof(int) // col_multiple
654
+ + sizeof(int) // num_threads_
655
+ + sizeof(int) // weights_.size()
656
+ + sizeof(int) // col_deltas_.size()
657
+ + sizeof(int); // nnz_per_row_.size()
658
+ }
659
+ // Possible block sizes are only those that are supported by the computation;
660
+ // the default is 1x1, and the other options are 4x4 and 16x1.
661
+ template <typename InputType>
662
+ void DetermineBlockSize(const MaskedSparseMatrix<InputType>& masked_matrix) {
663
+ const std::vector<std::pair<int, int>> kPreferredOrder = {{4, 4}};
664
+ int rows = masked_matrix.rows();
665
+ int cols = masked_matrix.cols();
666
+
667
+ for (const auto& block_size : kPreferredOrder) {
668
+ int block_height, block_width;
669
+ std::tie(block_height, block_width) = block_size;
670
+ if (cols % block_width != 0) continue;
671
+
672
+ int reduced_rows = (rows + block_height - 1) / block_height;
673
+ int reduced_cols = cols / block_width;
674
+
675
+ // For each possible block, confirm that it is either all 0s or all 1s.
676
+ bool all_same = true;
677
+ const auto& mask = masked_matrix.mask();
678
+ for (int r = 0; r < reduced_rows; ++r) {
679
+ for (int c = 0; c < reduced_cols; ++c) {
680
+ int val = mask[r * block_height * cols + c * block_width];
681
+ for (int i = 0; i < block_height; ++i) {
682
+ for (int j = 0; j < block_width; ++j) {
683
+ int index = (r * block_height + i) * cols + c * block_width + j;
684
+ if (index < masked_matrix.mask().size()) {
685
+ all_same &= (masked_matrix.mask()[index] == val);
686
+ }
687
+ }
688
+ }
689
+ }
690
+ }
691
+
692
+ // If this block configuration is possible, accept it.
693
+ if (all_same) {
694
+ block_height_ = block_height;
695
+ block_width_ = block_width;
696
+ return;
697
+ }
698
+ }
699
+
700
+ // No large blocks were found, default to 1x1.
701
+ block_height_ = 1;
702
+ block_width_ = 1;
703
+ }
704
+
705
+ // CSR descriptors are for the reduced matrix; the weights are for the full matrix.
706
+ template <typename InputType>
707
+ void MakeColumnsMultiple(const std::vector<int>& row_offsets,
708
+ std::vector<int>* reduced_mask,
709
+ std::vector<InputType>* weights) {
710
+ if (col_multiple_ > 0) {
711
+ // Make sure each row has a number of columns that is a multiple of
712
+ // |col_multiple|.
713
+ for (int r = 1; r < row_offsets.size(); ++r) {
714
+ int num_row = row_offsets[r] - row_offsets[r - 1];
715
+ int num_needed = col_multiple_ - num_row % col_multiple_;
716
+ if (num_needed < col_multiple_) {
717
+ // Find gaps in the columns where we can insert a column of 0 weights.
718
+ int num_added = 0;
719
+ for (int c = 0; c < reduced_cols_; ++c) {
720
+ if ((*reduced_mask)[(r - 1) * reduced_cols_ + c] == 0) {
721
+ (*reduced_mask)[(r - 1) * reduced_cols_ + c] = 1;
722
+
723
+ // Zero out the weights that correspond to this block.
724
+ for (int i = 0; i < block_height_; ++i) {
725
+ for (int j = 0; j < block_width_; ++j) {
726
+ (*weights)[((r - 1) * block_height_ + i) * cols_ +
727
+ block_width_ * c + j] = InputType(0.f);
728
+ }
729
+ }
730
+ num_added++;
731
+ }
732
+
733
+ if (num_added == num_needed) break;
734
+ }
735
+ }
736
+ }
737
+ }
738
+ }
739
+
740
+ // Given the final dense mask and weights, convert to the compressed
741
+ // block CSR representation.
742
+ template <typename InputType>
743
+ void MaskAndWeightsToCsr(const std::vector<int>& mask,
744
+ const std::vector<InputType>& weights,
745
+ std::vector<int>* nnz_per_row,
746
+ std::vector<int>* col_indices,
747
+ std::vector<WeightType>* weights_csr) {
748
+ std::vector<int> row_offsets = {0};
749
+ int nnz = 0;
750
+ // Standard CSR format.
751
+ if (block_width_ == 1 && block_height_ == 1) {
752
+ for (int r = 0; r < rows_; ++r) {
753
+ for (int c = 0; c < cols_; ++c) {
754
+ if (mask[r * cols_ + c] == 1) {
755
+ nnz++;
756
+ col_indices->push_back(c);
757
+ weights_csr->push_back(WeightType(weights[r * cols_ + c]));
758
+ }
759
+ }
760
+ row_offsets.push_back(nnz);
761
+ }
762
+ } else if (block_width_ == 4 && block_height_ == 4) {
763
+ // Weights are stored contiguously for each block in this case.
764
+ for (int r = 0; r < reduced_rows_; ++r) {
765
+ for (int c = 0; c < reduced_cols_; ++c) {
766
+ if (mask[r * reduced_cols_ + c] == 1) {
767
+ col_indices->push_back(c);
768
+ nnz++;
769
+ for (int i = 0; i < block_height_; ++i) {
770
+ for (int j = 0; j < block_width_; ++j) {
771
+ int row_index = (block_height_ * r + i) * cols_;
772
+ int w_index = row_index + block_width_ * c + j;
773
+ WeightType weight = w_index < weights.size()
774
+ ? WeightType(weights[w_index])
775
+ : WeightType(0.0f);
776
+ weights_csr->push_back(weight);
777
+ }
778
+ }
779
+ }
780
+ }
781
+ row_offsets.push_back(nnz);
782
+ }
783
+ }
784
+ for (int i = 1; i < row_offsets.size(); ++i)
785
+ nnz_per_row->push_back(row_offsets[i] - row_offsets[i - 1]);
786
+ }
787
+
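
As a worked example (using the 8x8 block-diagonal mask from the FlatBufferSerialization test later in this commit, with 4x4 blocks): the reduced mask is 2x2 with one block per reduced row, so this routine produces nnz_per_row = {1, 1}, col_indices = {0, 1}, and weights_csr holding 2 * 16 values, each 4x4 block stored contiguously in row-major order.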
788
+ // Returns the number of block rows per cache line. This is the minimum unit
789
+ // into which the calculation is broken for threads.
790
+ template <typename OutType>
791
+ int ReducedRowsPerCacheLine(int override_cache_line_size = -1) const {
792
+ int line_size = kCacheLineSize;
793
+ if (override_cache_line_size >= 1) line_size = override_cache_line_size;
794
+ return std::max<int>(line_size / (block_height_ * sizeof(OutType)), 1);
795
+ }
796
+
797
+ int col_multiple_;
798
+ int rows_;
799
+ int cols_;
800
+ int reduced_rows_;
801
+ int reduced_cols_;
802
+ float sparsity_;
803
+ int block_width_;
804
+ int block_height_;
805
+ int num_threads_;
806
+ std::string name_;
807
+
808
+ CacheAlignedVector<WeightType> weights_;
809
+ CacheAlignedVector<DeltaType> col_deltas_;
810
+ CacheAlignedVector<int> nnz_per_row_;
811
+ // |thread_bounds_| and |rhs_indices_| don't need to be serialized as they are
812
+ // always recalculated from serialized data.
813
+ CacheAlignedVector<DeltaType> rhs_indices_;
814
+ Matmul<WeightType, RhsType> matmul_;
815
+ ThreadBounds thread_bounds_;
816
+ static constexpr int kCacheLineSize = 64;
817
+ };
818
+
819
+ // Converts a sparse matrix represented with (|mask|, |weights|, |size|) into
820
+ // the CSR format, and returns that as a serialized string.
821
+ template <typename MaskType>
822
+ std::string ConvertDenseToSparseRepresentation_Int16Deltas(
823
+ const std::vector<MaskType>& mask, const std::vector<float>& weights,
824
+ const int rows, const int cols) {
825
+ MaskedSparseMatrix<float> masked_weights(rows, cols, mask.data(),
826
+ weights.data());
827
+ CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
828
+ sparse_masked_weights(masked_weights);
829
+ std::string buffer;
830
+ sparse_masked_weights.WriteToFlatBuffer(&buffer);
831
+ return buffer;
832
+ }
833
+
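
A brief usage sketch for this helper (hypothetical dense inputs; in practice the mask and weights come from a pruned model checkpoint):

#include <string>
#include <vector>

std::string SerializeDenseSketch() {
  const int kRows = 8;
  const int kCols = 8;
  std::vector<int> mask(kRows * kCols, 1);          // fully dense mask
  std::vector<float> weights(kRows * kCols, 0.5f);  // arbitrary values
  return csrblocksparse::ConvertDenseToSparseRepresentation_Int16Deltas(
      mask, weights, kRows, kCols);
}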
834
+ } // namespace csrblocksparse
835
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
sparse_matmul/layers/csrblocksparse_test.cc ADDED
@@ -0,0 +1,977 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include <array>
16
+ #include <cstdint>
17
+ #include <tuple>
18
+ #include <vector>
19
+
20
+ // Placeholder for get runfiles header.
21
+ #include "absl/status/status.h"
22
+ #include "absl/strings/str_cat.h"
23
+ #include "absl/strings/string_view.h"
24
+ #include "absl/types/span.h"
25
+ #include "gtest/gtest.h"
26
+ #include "include/ghc/filesystem.hpp"
27
+ #include "sparse_matmul/compute/matmul.h"
28
+ #include "sparse_matmul/layers/utils.h"
29
+ #include "sparse_matmul/numerics/test_utils.h"
30
+ #include "sparse_matmul/os/coop_threads.h"
31
+
32
+ namespace csrblocksparse {
33
+ namespace {
34
+
35
+ inline constexpr absl::string_view kTestdataPath = "layers/testdata";
36
+
37
+ TEST(CSRBlockSparseMatrix, FlatBufferSerialization) {
38
+ const int kRows = 8;
39
+ const int kCols = 8;
40
+ std::vector<int> mask = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
41
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
42
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
43
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
44
+ std::vector<float> values(kRows * kCols, 1.f);
45
+ values[1] = 2.f;
46
+ values[3] = 3.f;
47
+ values[36] = -1.f;
48
+ values[45] = -2.f;
49
+
50
+ csrblocksparse::CacheAlignedVector<float> bias(kRows);
51
+ csrblocksparse::CacheAlignedVector<float> rhs(kCols);
52
+ csrblocksparse::CacheAlignedVector<float> out_ref(kRows);
53
+ csrblocksparse::CacheAlignedVector<float> out_test(kRows);
54
+
55
+ bias.FillZero();
56
+ rhs.FillOnes();
57
+
58
+ csrblocksparse::MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(),
59
+ values.data());
60
+
61
+ matrix.SpMM_bias(rhs, bias, &out_ref);
62
+
63
+ csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
64
+ block_sparse_matrix(matrix);
65
+
66
+ std::string buffer;
67
+ std::size_t num_bytes = block_sparse_matrix.WriteToFlatBuffer(&buffer);
68
+
69
+ csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
70
+ new_block_sparse_matrix(reinterpret_cast<const uint8_t*>(buffer.c_str()),
71
+ num_bytes);
72
+
73
+ new_block_sparse_matrix.SpMM_bias(rhs, bias, &out_test);
74
+
75
+ CheckResult(out_ref, out_test, kCols);
76
+ }
77
+
78
+ template <typename ComputeType, typename RhsType, typename OutType>
79
+ void CorrectnessCheckBlockSpMM(int rows, int cols, int block_height,
80
+ int block_width, float sparsity,
81
+ bool use_relu = false, int num_threads = 1,
82
+ int fatness = 1, bool test_matmul = false) {
83
+ using BiasType = typename TypeOfProduct<ComputeType, RhsType>::type;
84
+ MaskedSparseMatrix<float> matrix(rows, cols, sparsity, block_height,
85
+ block_width);
86
+ matrix.CastWeights<ComputeType>();
87
+ FatCacheAlignedVector<RhsType> rhs(cols, fatness);
88
+ CacheAlignedVector<BiasType> bias(rows);
89
+ FatCacheAlignedVector<OutType> out(rows, fatness);
90
+
91
+ bias.FillRandom();
92
+ rhs.FillRandom();
93
+ out.FillZero();
94
+ FatCacheAlignedVector<OutType> out_reference = out;
95
+
96
+ matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
97
+
98
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
99
+
100
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
101
+ std::move(sparse_matrix), std::move(bias));
102
+ num_threads = sparse_linear_layer.PrepareForThreads(num_threads);
103
+
104
+ // Checks that the result of applying each thread's portion serially is
105
+ // correct.
106
+ for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
107
+ sparse_linear_layer.SpMM_bias(rhs, &out, use_relu, thread_id);
108
+ }
109
+
110
+ CheckResult(out_reference, out, sparse_linear_layer.cols());
111
+
112
+ if (test_matmul) {
113
+ for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
114
+ sparse_linear_layer.MatVec(rhs, use_relu, thread_id,
115
+ /*replicas=*/1, /*output_stride=*/0, &out);
116
+ }
117
+
118
+ CheckResult(out_reference, out, sparse_linear_layer.cols());
119
+ }
120
+ }
121
+
122
+ // Does:
123
+ // y = Ax + b;
124
+ // x = Ay + b;
125
+ // y = Ax + b;
126
+ //
127
+ // to make sure that dependent multiplies are correct.
128
+ template <typename ComputeType, typename RhsType, typename OutType>
129
+ void ThreadBody(
130
+ SpinBarrier* spin_barrier, int tid,
131
+ const SparseLinearLayer<ComputeType, RhsType>& sparse_linear_layer,
132
+ FatCacheAlignedVector<RhsType>* rhs, FatCacheAlignedVector<OutType>* out,
133
+ bool use_relu) {
134
+ sparse_linear_layer.SpMM_bias(*rhs, out, use_relu, tid);
135
+ spin_barrier->barrier();
136
+ sparse_linear_layer.SpMM_bias(*out, rhs, use_relu, tid);
137
+ spin_barrier->barrier();
138
+ sparse_linear_layer.SpMM_bias(*rhs, out, use_relu, tid);
139
+ }
140
+
141
+ template <typename ComputeType, typename RhsType, typename OutType>
142
+ void CorrectnessCheckBlockSpMM_MultiThread(int rows, int cols, int block_height,
143
+ int block_width, float sparsity,
144
+ bool use_relu = false,
145
+ int num_threads = 1,
146
+ int fatness = 1) {
147
+ typedef typename TypeOfProduct<ComputeType, RhsType>::type BiasType;
148
+ CHECK(rows == cols);
149
+ MaskedSparseMatrix<float> matrix(rows, cols, sparsity, block_height,
150
+ block_width);
151
+ matrix.CastWeights<ComputeType>();
152
+ FatCacheAlignedVector<RhsType> rhs(cols, fatness);
153
+ FatCacheAlignedVector<RhsType> rhs_mt(cols, fatness);
154
+ CacheAlignedVector<BiasType> bias(rows);
155
+ FatCacheAlignedVector<OutType> out(rows, fatness);
156
+
157
+ bias.FillOnes();
158
+ rhs.FillOnes();
159
+ rhs_mt.FillOnes();
160
+ out.FillZero();
161
+ FatCacheAlignedVector<OutType> out_reference = out;
162
+
163
+ matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
164
+ matrix.SpMM_bias(out_reference, bias, &rhs, use_relu);
165
+ matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
166
+
167
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
168
+
169
+ num_threads = sparse_matrix.PrepareForThreads(num_threads,
170
+ /*cache_line_size=*/1);
171
+
172
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
173
+ std::move(sparse_matrix), std::move(bias));
174
+
175
+ csrblocksparse::LaunchOnThreadsWithBarrier(
176
+ num_threads, ThreadBody<ComputeType, RhsType, OutType>,
177
+ sparse_linear_layer, &rhs_mt, &out, use_relu);
178
+
179
+ CheckResult(out_reference, out, cols);
180
+ }
181
+
182
+ } // namespace
183
+
184
+ TEST(MaskedSparseCorrectness, HandCoded) {
185
+ const int kRows = 8;
186
+ const int kCols = 8;
187
+ // clang-format off
188
+ std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
189
+ 0, 1, 0, 1, 0, 1, 0, 1,
190
+ 1, 0, 0, 1, 1, 1, 1, 0,
191
+ 0, 0, 0, 0, 0, 0, 0, 0,
192
+ 1, 1, 1, 1, 1, 1, 1, 1,
193
+ 0, 0, 0, 0, 1, 1, 0, 0,
194
+ 1, 1, 0, 0, 1, 1, 0, 0,
195
+ 1, 0, 0, 0, 0, 1, 0, 1};
196
+ // clang-format on
197
+ std::vector<float> values(kRows * kCols, 1.f);
198
+
199
+ std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
200
+
201
+ MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
202
+ CacheAlignedVector<float> rhs(kCols);
203
+ CacheAlignedVector<float> bias(kRows);
204
+ CacheAlignedVector<float> out(kRows);
205
+
206
+ bias.FillOnes();
207
+ rhs.FillOnes();
208
+ out.FillZero();
209
+
210
+ MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
211
+ std::move(bias));
212
+
213
+ masked_linear_layer.SpMM_bias(rhs, &out);
214
+
215
+ for (int i = 0; i < kRows; ++i) {
216
+ EXPECT_EQ(answer[i], out[i]);
217
+ }
218
+ }
219
+
220
+ TEST(MaskedSparseCorrectness, HandCodedFatVector) {
221
+ const int kRows = 8;
222
+ const int kCols = 8;
223
+ // clang-format off
224
+ std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
225
+ 0, 1, 0, 1, 0, 1, 0, 1,
226
+ 1, 0, 0, 1, 1, 1, 1, 0,
227
+ 0, 0, 0, 0, 0, 0, 0, 0,
228
+ 1, 1, 1, 1, 1, 1, 1, 1,
229
+ 0, 0, 0, 0, 1, 1, 0, 0,
230
+ 1, 1, 0, 0, 1, 1, 0, 0,
231
+ 1, 0, 0, 0, 0, 1, 0, 1};
232
+ // clang-format on
233
+
234
+ std::vector<float> values(kRows * kCols, 1.f);
235
+ std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
236
+
237
+ MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
238
+ const int kMaxWidth = 5;
239
+ for (int width = 5; width <= kMaxWidth; ++width) {
240
+ FatCacheAlignedVector<float> rhs(kCols, width);
241
+ CacheAlignedVector<float> bias(kRows);
242
+ FatCacheAlignedVector<float> out(kRows, width);
243
+
244
+ bias.FillOnes();
245
+ rhs.FillOnes();
246
+ out.FillZero();
247
+
248
+ MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
249
+ std::move(bias));
250
+
251
+ masked_linear_layer.SpMM_bias(rhs, &out);
252
+
253
+ for (int i = 0; i < kRows; ++i) {
254
+ for (int width = 0; width < kMaxWidth; ++width) {
255
+ EXPECT_EQ(answer[i], out[i + width * kRows]);
256
+ }
257
+ }
258
+ }
259
+ }
260
+
261
+ TEST(CsrBlockSparseMatrix, HandCodedMultiThread) {
262
+ const int kRows = 8;
263
+ const int kCols = 8;
264
+ // clang-format off
265
+ std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
266
+ 0, 1, 0, 1, 0, 1, 0, 1,
267
+ 1, 0, 0, 1, 1, 1, 1, 0,
268
+ 0, 0, 0, 0, 0, 0, 0, 0,
269
+ 1, 1, 1, 1, 1, 1, 1, 1,
270
+ 0, 0, 0, 0, 1, 1, 0, 0,
271
+ 1, 1, 0, 0, 1, 1, 0, 0,
272
+ 1, 0, 0, 0, 0, 1, 0, 1};
273
+ // clang-format on
274
+ std::vector<float> values(kRows * kCols, 1.f);
275
+
276
+ std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
277
+
278
+ MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
279
+ CacheAlignedVector<float> rhs(kCols);
280
+ CacheAlignedVector<float> bias(kRows);
281
+ CacheAlignedVector<float> out(kRows);
282
+
283
+ bias.FillOnes();
284
+ rhs.FillOnes();
285
+ out.FillZero();
286
+
287
+ CacheAlignedVector<float> bias_csr = bias;
288
+
289
+ CsrBlockSparseMatrix<bfloat16, float> sparse_matrix(matrix);
290
+
291
+ MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
292
+ std::move(bias));
293
+
294
+ masked_linear_layer.SpMM_bias(rhs, &out);
295
+
296
+ SparseLinearLayer<bfloat16, float> sparse_linear_layer(
297
+ std::move(sparse_matrix), std::move(bias_csr));
298
+ sparse_linear_layer.PrepareForThreads(2, /*cache_line_size=*/1);
299
+
300
+ CacheAlignedVector<float> out_tmp(kRows);
301
+ const bool kUseRelu = false;
302
+ sparse_linear_layer.SpMM_bias(rhs, &out_tmp, kUseRelu, /*tid=*/0);
303
+ sparse_linear_layer.SpMM_bias(rhs, &out_tmp, kUseRelu, /*tid=*/1);
304
+
305
+ for (int i = 0; i < kRows; ++i) {
306
+ EXPECT_EQ(answer[i], out_tmp[i]);
307
+ }
308
+ }
309
+
310
+ TEST(TestCasts, TestBfloat16) {
311
+ const int kRows = 1000;
312
+ const int kCols = 100;
313
+ const float kSparsity = 0.f;
314
+
315
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
316
+ MaskedSparseMatrix<float> matrix_bfloat16(kRows, kCols, matrix.mask().data(),
317
+ matrix.values().data());
318
+
319
+ matrix_bfloat16.CastWeights<bfloat16>();
320
+
321
+ CheckResult(matrix.values(), matrix_bfloat16.values(), kCols);
322
+ }
323
+
324
+ TEST(TestCasts, TestFP16) {
325
+ const int kRows = 1000;
326
+ const int kCols = 100;
327
+ const float kSparsity = 0.f;
328
+
329
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
330
+ #if !defined __arm__ && !defined __aarch64__
331
+ // Conversion doesn't handle denormals, so flush denormals to zero first.
332
+ for (int i = 0; i < matrix.values().size(); ++i) {
333
+ if (matrix.data()[i] < 1. / static_cast<float>(1 << 14))
334
+ matrix.data()[i] = 0.f;
335
+ }
336
+ #endif
337
+ MaskedSparseMatrix<float> matrix_fp16(kRows, kCols, matrix.mask().data(),
338
+ matrix.values().data());
339
+
340
+ matrix_fp16.CastWeights<csrblocksparse::fp16>();
341
+
342
+ CheckResult(matrix.values(), matrix_fp16.values(), kCols);
343
+ }
344
+
345
+ TEST(TestCasts, TestFixed16) {
346
+ const int kRows = 100000;
347
+ const int kCols = 1;
348
+ const float kSparsity = 0.f;
349
+
350
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
351
+
352
+ // Relative error for fixed point is high near 0.
353
+ for (int i = 0; i < matrix.values().size(); ++i) {
354
+ // 1.1e-3 is based on the max error of .013 and a grid spacing of 1 / 2**16
355
+ // == 3e-5. 3e-5 / .013 / 2 = 1.1e-3.
356
+ if (std::abs(matrix.data()[i]) < 1.1e-3) {
357
+ matrix.data()[i] = 0.f;
358
+ }
359
+ }
360
+
361
+ MaskedSparseMatrix<float> matrix_fixed16 = matrix;
362
+
363
+ matrix_fixed16.CastWeights<csrblocksparse::fixed16</*ExponentBits=*/0>>();
364
+
365
+ CheckResult(matrix.values(), matrix_fixed16.values(), kCols);
366
+ }
367
+
368
+ TEST(TestCasts, TestFixed32) {
369
+ const int kRows = 100000;
370
+ const int kCols = 1;
371
+ const float kSparsity = 0.f;
372
+
373
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
374
+ MaskedSparseMatrix<float> matrix_fixed32 = matrix;
375
+
376
+ matrix_fixed32.CastWeights<csrblocksparse::fixed32</*ExponentBits=*/0>>();
377
+
378
+ CheckResult(matrix.values(), matrix_fixed32.values(), kCols);
379
+ }
380
+
381
+ template <typename ComputeType, typename RhsType, typename OutType>
382
+ void TestSpMM(int block_width, int block_height, int fatness,
383
+ bool test_matmul = false) {
384
+ std::array<bool, 2> use_relu = {false, true};
385
+ std::vector<float> sparsity_levels = {.5, .8, .9, .95, .98};
386
+ std::vector<std::pair<int, int>> sizes = {{8, 8}, {128, 128}, {128, 64},
387
+ {256, 192}, {512, 512}, {1024, 512},
388
+ {384, 384}, {512, 384}};
389
+ for (int num_threads = 1; num_threads < 2 + test_matmul; ++num_threads) {
390
+ for (const auto& relu : use_relu) {
391
+ for (const auto& sparsity : sparsity_levels) {
392
+ for (const auto& size : sizes) {
393
+ int rows, cols;
394
+ std::tie(rows, cols) = size;
395
+ CorrectnessCheckBlockSpMM<ComputeType, RhsType, OutType>(
396
+ rows, cols, block_height, block_width, sparsity, relu,
397
+ num_threads, fatness, test_matmul);
398
+ }
399
+ }
400
+ }
401
+ }
402
+ }
403
+
404
+ template <typename ComputeType, typename RhsType, typename OutType>
405
+ void TestSpMM_MultiThread(int block_width, int block_height, int fatness) {
406
+ std::array<bool, 2> use_relu = {false, true};
407
+ std::vector<float> sparsity_levels = {.5, .8, .9, .95, .98};
408
+ std::vector<std::pair<int, int>> sizes = {
409
+ {48, 48}, {128, 128}, {512, 512}, {384, 384}};
410
+ for (int num_threads = 1; num_threads < 5; ++num_threads) {
411
+ for (const auto& relu : use_relu) {
412
+ for (const auto& sparsity : sparsity_levels) {
413
+ for (const auto& size : sizes) {
414
+ int rows, cols;
415
+ std::tie(rows, cols) = size;
416
+ CorrectnessCheckBlockSpMM_MultiThread<ComputeType, RhsType, OutType>(
417
+ rows, cols, block_height, block_width, sparsity, relu,
418
+ num_threads, fatness);
419
+ }
420
+ }
421
+ }
422
+ }
423
+ }
424
+
425
+ template <typename DataType>
426
+ void TestSumVectors(int start = 0, int end = -1, int size = 6) {
427
+ std::vector<DataType> values;
428
+ std::vector<DataType> answer;
429
+
430
+ for (int i = 1; i < size + 1; ++i) {
431
+ const float x = static_cast<float>(i);
432
+ values.push_back(static_cast<DataType>(x));
433
+ answer.push_back(static_cast<DataType>(x * 2));
434
+ }
435
+
436
+ if (end == -1) {
437
+ end = values.size();
438
+ }
439
+
440
+ csrblocksparse::CacheAlignedVector<DataType> result(values.size());
441
+ csrblocksparse::CacheAlignedVector<DataType> values_aligned(values);
442
+ detail::SumVectors(start, end, values_aligned.data(), values_aligned.data(),
443
+ result.data());
444
+ for (int i = start; i < end; ++i) {
445
+ EXPECT_EQ(static_cast<float>(answer[i]), static_cast<float>(result[i]));
446
+ }
447
+ }
448
+
449
+ TEST(CsrBlockSparseMatrix, SumVectors_Generic) {
450
+ TestSumVectors<float>();
451
+ TestSumVectors<float>(1);
452
+ TestSumVectors<float>(1, 4);
453
+ }
454
+
455
+ TEST(CsrBlockSparseMatrix, SumVectors_Bfloat16) {
456
+ TestSumVectors<csrblocksparse::bfloat16>();
457
+ TestSumVectors<csrblocksparse::bfloat16>(1);
458
+ TestSumVectors<csrblocksparse::bfloat16>(1, 4);
459
+ }
460
+
461
+ // For SIMD-optimized SumVectors, the memory of the vector should be at least
462
+ // |kSIMDWidth * sizeof(float)| long, and the start position has to be an
463
+ // aligned memory location. So we set |size| to 100 to be safe and
464
+ // |start| to 0 (|start| == 1 is not aligned).
465
+ TEST(CsrBlockSparseMatrix, SumVectors_Fixed16) {
466
+ TestSumVectors<csrblocksparse::fixed16<8>>(0, -1, 100);
467
+ TestSumVectors<csrblocksparse::fixed16<8>>(0, 4, 100);
468
+ }
469
+
470
+ TEST(CsrBlockSparseMatrix, SumVectors_Fixed32) {
471
+ TestSumVectors<csrblocksparse::fixed32<11>>(0, -1, 100);
472
+ TestSumVectors<csrblocksparse::fixed32<11>>(0, 4, 100);
473
+ }
474
+
475
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_Bfloat16) {
476
+ TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/4,
477
+ /*block_height=*/4,
478
+ /*fatness=*/7);
479
+ }
480
+
481
+ // This actually uses multiple threads, and uses the output as the input for
482
+ // multiple steps to test that synchronization and memory visibility are
483
+ // working correctly. Requires square matrices.
484
+ TEST(CsrBlockSparseMatrix, SpMV_4x4MultiThreading_Bfloat16) {
485
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
486
+ /*block_width=*/4,
487
+ /*block_height=*/4,
488
+ /*fatness=*/1);
489
+ }
490
+
491
+ TEST(CsrBlockSparseMatrix, SpMM_4x4MultiThreading_Bfloat16) {
492
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
493
+ /*block_width=*/4,
494
+ /*block_height=*/4,
495
+ /*fatness=*/7);
496
+ }
497
+
498
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_Bfloat16) {
499
+ TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/1,
500
+ /*block_height=*/1,
501
+ /*fatness=*/1);
502
+ }
503
+
504
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_Bfloat16) {
505
+ TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/1,
506
+ /*block_height=*/1,
507
+ /*fatness=*/7);
508
+ }
509
+
510
+ // This actually uses multiple threads, and uses the output as the input for
511
+ // multiple steps to test that synchronization and memory visibility are
512
+ // working correctly. Requires square matrices.
513
+ TEST(CsrBlockSparseMatrix, SpMV_1x1MultiThreading_Bfloat16) {
514
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
515
+ /*block_width=*/1,
516
+ /*block_height=*/1,
517
+ /*fatness=*/1);
518
+ }
519
+
520
+ TEST(CsrBlockSparseMatrix, SpMM_1x1MultiThreading_Bfloat16) {
521
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
522
+ /*block_width=*/1,
523
+ /*block_height=*/1,
524
+ /*fatness=*/7);
525
+ }
526
+
527
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_float) {
528
+ TestSpMM<float, float, float>(/*block_width=*/4,
529
+ /*block_height=*/4,
530
+ /*fatness=*/1,
531
+ /*test_matmul=*/true);
532
+ }
533
+
534
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_float) {
535
+ TestSpMM<float, float, float>(/*block_width=*/4,
536
+ /*block_height=*/4,
537
+ /*fatness=*/7);
538
+ }
539
+
540
+ // This actually uses multiple threads, and uses the output as the input for
541
+ // multiple steps to test that synchronization and memory visibility are
542
+ // working correctly. Requires square matrices.
543
+ TEST(CsrBlockSparseMatrix, SpMV_4x4MultiThreading_float) {
544
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/4,
545
+ /*block_height=*/4,
546
+ /*fatness=*/1);
547
+ }
548
+
549
+ TEST(CsrBlockSparseMatrix, SpMM_4x4MultiThreading_float) {
550
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/4,
551
+ /*block_height=*/4,
552
+ /*fatness=*/7);
553
+ }
554
+
555
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_float) {
556
+ TestSpMM<float, float, float>(/*block_width=*/1,
557
+ /*block_height=*/1,
558
+ /*fatness=*/1);
559
+ }
560
+
561
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_float) {
562
+ TestSpMM<float, float, float>(/*block_width=*/1,
563
+ /*block_height=*/1,
564
+ /*fatness=*/7);
565
+ }
566
+
567
+ // This actually uses multiple threads, and uses the output as the input for
568
+ // multiple steps to test that synchronization and memory visibility are
569
+ // working correctly. Requires square matrices.
570
+ TEST(CsrBlockSparseMatrix, SpMV_1x1MultiThreading_float) {
571
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/1,
572
+ /*block_height=*/1,
573
+ /*fatness=*/1);
574
+ }
575
+
576
+ TEST(CsrBlockSparseMatrix, SpMM_1x1MultiThreading_float) {
577
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/1,
578
+ /*block_height=*/1,
579
+ /*fatness=*/7);
580
+ }
581
+
582
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_32) {
583
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
584
+ typename csrblocksparse::TypeOfProduct<
585
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
586
+ /*block_width=*/4,
587
+ /*block_height=*/4,
588
+ /*fatness=*/1,
589
+ /*test_matmul=*/true);
590
+ }
591
+
592
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_32) {
593
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
594
+ typename csrblocksparse::TypeOfProduct<
595
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
596
+ /*block_width=*/4,
597
+ /*block_height=*/4,
598
+ /*fatness=*/7);
599
+ }
600
+
601
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_32) {
602
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
603
+ typename csrblocksparse::TypeOfProduct<
604
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
605
+ /*block_width=*/1,
606
+ /*block_height=*/1,
607
+ /*fatness=*/1);
608
+ }
609
+
610
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_32) {
611
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
612
+ typename csrblocksparse::TypeOfProduct<
613
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
614
+ /*block_width=*/1,
615
+ /*block_height=*/1,
616
+ /*fatness=*/7);
617
+ }
618
+
619
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_16) {
620
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
621
+ csrblocksparse::fixed16<8>>(
622
+ /*block_width=*/4,
623
+ /*block_height=*/4,
624
+ /*fatness=*/1,
625
+ /*test_matmul=*/true);
626
+ }
627
+
628
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_16) {
629
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
630
+ csrblocksparse::fixed16<8>>(
631
+ /*block_width=*/4,
632
+ /*block_height=*/4,
633
+ /*fatness=*/7);
634
+ }
635
+
636
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_16) {
637
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
638
+ csrblocksparse::fixed16<8>>(
639
+ /*block_width=*/1,
640
+ /*block_height=*/1,
641
+ /*fatness=*/1);
642
+ }
643
+
644
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_16) {
645
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
646
+ csrblocksparse::fixed16<8>>(
647
+ /*block_width=*/1,
648
+ /*block_height=*/1,
649
+ /*fatness=*/7);
650
+ }
651
+
652
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_32_unmatched) {
653
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
654
+ csrblocksparse::fixed32<13>>(
655
+ /*block_width=*/4,
656
+ /*block_height=*/4,
657
+ /*fatness=*/1,
658
+ /*test_matmul=*/true);
659
+ }
660
+
661
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_32_unmatched) {
662
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
663
+ csrblocksparse::fixed32<13>>(
664
+ /*block_width=*/4,
665
+ /*block_height=*/4,
666
+ /*fatness=*/7);
667
+ }
668
+
669
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_32_unmatched) {
670
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
671
+ csrblocksparse::fixed32<13>>(
672
+ /*block_width=*/1,
673
+ /*block_height=*/1,
674
+ /*fatness=*/1);
675
+ }
676
+
677
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_32_unmatched) {
678
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
679
+ csrblocksparse::fixed32<13>>(
680
+ /*block_width=*/1,
681
+ /*block_height=*/1,
682
+ /*fatness=*/7);
683
+ }
684
+
685
+ TEST(CsrBlockSparseMatrix, RhsIndicesDeltasRoundTrip) {
686
+ MaskedSparseMatrix<float> matrix(/*rows=*/256, /*cols=*/256,
687
+ /*sparsity=*/0.9, /*block_height=*/4,
688
+ /*block_width=*/4);
689
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
690
+ CacheAlignedVector<int16_t> copy_indices = sparse_matrix.rhs_indices();
691
+ sparse_matrix.ComputeColDeltas();
692
+ sparse_matrix.ComputeRHSIndices();
693
+ // They get padded when created, so the newer one could be bigger.
694
+ EXPECT_LE(copy_indices.size(), sparse_matrix.rhs_indices().size());
695
+ for (int i = 0; i < copy_indices.size(); ++i) {
696
+ EXPECT_EQ(copy_indices[i], sparse_matrix.rhs_indices()[i]) << "i=" << i;
697
+ }
698
+ }
699
+
700
+ // Tests that a Layer that is split into 2 by columns (inputs) computes the same
701
+ // result as the original layer.
702
+ TEST(CsrBlockSparseMatrix, SplitByCol) {
703
+ int kRows = 1024;
704
+ int kCols = 1024;
705
+ MaskedSparseMatrix<float> matrix(kRows, kCols, 0.95, /*block_height=*/4,
706
+ /*block_width=*/4);
707
+ FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
708
+ CacheAlignedVector<float> bias(kRows);
709
+ FatCacheAlignedVector<float> out1(kRows, /*cols=*/1);
710
+ FatCacheAlignedVector<float> out2(kRows, /*cols=*/1);
711
+
712
+ bias.FillRandom();
713
+ rhs.FillRandom();
714
+ out1.FillZero();
715
+ out2.FillZero();
716
+ FatCacheAlignedVector<float> out_reference = out1;
717
+
718
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
719
+
720
+ SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
721
+ std::move(bias));
722
+ sparse_linear_layer.PrepareForThreads(1);
723
+ sparse_linear_layer.SpMM_bias(rhs, &out_reference, /*relu=*/false,
724
+ /*tid=*/0);
725
+ // Split the layer into 2 parts.
726
+ SparseLinearLayer<float, float> part1, part2;
727
+ sparse_linear_layer.SplitInputs(&part1, &part2);
728
+ part1.PrepareForThreads(1);
729
+ part2.PrepareForThreads(1);
730
+ EXPECT_EQ(kRows, part1.rows());
731
+ EXPECT_EQ(kCols / 2, part1.cols());
732
+ EXPECT_EQ(kRows, part2.rows());
733
+ EXPECT_EQ(kCols / 2, part2.cols());
734
+ MutableVectorView<float> rhs1(&rhs, 0, kCols / 2);
735
+ MutableVectorView<float> rhs2(&rhs, kCols / 2, kCols / 2);
736
+ for (int i = 0; i < kCols / 2; ++i) {
737
+ EXPECT_FLOAT_EQ(rhs[i], rhs1.data()[i]);
738
+ EXPECT_FLOAT_EQ(rhs[i + kCols / 2], rhs2.data()[i]);
739
+ }
740
+ part1.SpMM_bias(rhs1, &out1, /*relu=*/false, /*tid=*/0);
741
+ part2.SpMM_bias(rhs2, &out2, /*relu=*/false, /*tid=*/0);
742
+ // Check that out1 + out2 = out_reference.
743
+ for (int i = 0; i < kRows; ++i) {
744
+ EXPECT_NEAR(out_reference[i], out1[i] + out2[i], 2e-5)
745
+ << " i=" << i << " out1=" << out1[i] << " out2=" << out2[i];
746
+ }
747
+ }
748
+ // Tests that a Layer that is split into 2 by rows (outputs) computes the same
749
+ // result as the original layer.
750
+ TEST(CsrBlockSparseMatrix, SplitByRow) {
751
+ int kRows = 1024;
752
+ int kCols = 1024;
753
+ MaskedSparseMatrix<float> matrix(kRows, kCols, 0.95, /*block_height=*/4,
754
+ /*block_width=*/4);
755
+ FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
756
+ CacheAlignedVector<float> bias(kRows);
757
+ FatCacheAlignedVector<float> out1(kRows, /*cols=*/1);
758
+ FatCacheAlignedVector<float> out2(kRows, /*cols=*/1);
759
+
760
+ bias.FillRandom();
761
+ rhs.FillRandom();
762
+ out1.FillZero();
763
+ out2.FillZero();
764
+ FatCacheAlignedVector<float> out_reference = out1;
765
+
766
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
767
+
768
+ SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
769
+ std::move(bias));
770
+ sparse_linear_layer.PrepareForThreads(1);
771
+ sparse_linear_layer.SpMM_bias(rhs, &out_reference, /*relu=*/false,
772
+ /*tid=*/0);
773
+ // Split the layer into 2 parts.
774
+ SparseLinearLayer<float, float> part1, part2;
775
+ sparse_linear_layer.SplitOutputs(&part1, &part2);
776
+ part1.PrepareForThreads(1);
777
+ part2.PrepareForThreads(1);
778
+ EXPECT_EQ(kRows / 2, part1.rows());
779
+ EXPECT_EQ(kCols, part1.cols());
780
+ EXPECT_EQ(kRows / 2, part2.rows());
781
+ EXPECT_EQ(kCols, part2.cols());
782
+ MutableVectorView<float> out2a(&out2, 0, kRows / 2);
783
+ MutableVectorView<float> out2b(&out2, kRows / 2, kRows / 2);
784
+ part1.SpMM_bias(rhs, &out2a, /*relu=*/false, /*tid=*/0);
785
+ part2.SpMM_bias(rhs, &out2b, /*relu=*/false, /*tid=*/0);
786
+ // Check that out2 = out_reference.
787
+ for (int i = 0; i < kRows; ++i) {
788
+ EXPECT_NEAR(out_reference[i], out2[i], 2e-5)
789
+ << " i=" << i << " out1=" << out_reference[i] << " out2=" << out2[i];
790
+ }
791
+ }
792
+
793
+ TEST(CsrBlockSparseMatrix, MutableVectorView) {
794
+ const int kRows = 1024;
795
+ const int kCols = 1024;
796
+ const int kFatness = 2;
797
+
798
+ std::vector<float> values(kRows * kCols, 1.f);
799
+ std::vector<int> mask(kRows * kCols);
800
+ for (int i = 0; i < mask.size(); ++i) mask[i] = i % 2;
801
+
802
+ auto masked_matrix =
803
+ MaskedSparseMatrix<float>(kRows, kCols, mask.data(), values.data());
804
+ auto sparse_matrix = CsrBlockSparseMatrix<bfloat16, float>(masked_matrix);
805
+ FatCacheAlignedVector<float> x(kCols, kFatness);
806
+ x.FillOnes();
807
+
808
+ CacheAlignedVector<float> bias(kRows);
809
+ bias.FillZero();
810
+
811
+ // First check that we can use spans as output. Split a multiplication
812
+ // into upper and lower halves times the full vector:
813
+ // --------------- x t
814
+ // | | x t
815
+ // | | x t
816
+ // --------------- =
817
+ // | | x b
818
+ // | | x b
819
+ // --------------- x b
820
+
821
+ FatCacheAlignedVector<float> out(kRows, kFatness);
822
+ FatCacheAlignedVector<float> out_view(kRows, kFatness);
823
+
824
+ MutableVectorView<float> out_view_top(&out_view, 0, kRows / 2);
825
+ MutableVectorView<float> out_view_bottom(&out_view, kRows / 2, kRows / 2);
826
+
827
+ sparse_matrix.SpMM_bias(x, bias, &out);
828
+
829
+ auto masked_matrix_top =
830
+ MaskedSparseMatrix<float>(kRows / 2, kCols, mask.data(), values.data());
831
+ auto masked_matrix_bottom = MaskedSparseMatrix<float>(
832
+ kRows / 2, kCols, mask.data() + kRows * kCols / 2,
833
+ values.data() + kRows * kCols / 2);
834
+ auto sparse_matrix_top =
835
+ CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_top);
836
+ auto sparse_matrix_bottom =
837
+ CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_bottom);
838
+
839
+ sparse_matrix_top.SpMM_bias(x, bias, &out_view_top);
840
+ sparse_matrix_bottom.SpMM_bias(x, bias, &out_view_bottom);
841
+
842
+ CheckResult(out, out_view, kCols);
843
+
844
+ // Check that we can use a span as an input vector. Multiply upper left
845
+ // portion of the matrix by the top half of the vector.
846
+ // ---------------
847
+ // |oooooo | x q
848
+ // |oooooo | x q
849
+ // | | =
850
+ // | |
851
+ // ---------------
852
+
853
+ auto masked_matrix_quarter = MaskedSparseMatrix<float>(
854
+ kRows / 2, kCols / 2, mask.data(), values.data());
855
+ auto sparse_matrix_quarter =
856
+ CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_quarter);
857
+
858
+ MutableVectorView<float> x_top(&x, 0, kCols / 2);
859
+ FatCacheAlignedVector<float> out_correct(kRows / 2, /*cols=*/2);
860
+
861
+ for (int i = 0; i < kFatness * (kRows / 2); ++i) out_correct[i] = 256.f;
862
+
863
+ MutableVectorView<float> bias_top(&bias, 0, kRows / 2);
864
+ FatCacheAlignedVector<float> out_quarter(kRows / 2, kFatness);
865
+
866
+ sparse_matrix_quarter.SpMM_bias(x_top, bias_top, &out_quarter);
867
+
868
+ CheckResult(out_correct, out_quarter, kCols / 2);
869
+ }
870
+
871
+ namespace {
872
+
873
+ bool skip_test(const absl::Status& status, absl::string_view msg) {
874
+ if (!status.ok()) {
875
+ LOG(INFO) << "Couldn't load " << msg << ", skipping test " << status;
876
+ return true;
877
+ }
878
+
879
+ return false;
880
+ }
881
+
882
+ } // namespace
883
+
884
+ TEST(CsrBlockSparseMatrix, ModelMatrices_Bfloat16) {
885
+ std::vector<std::string> names = {
886
+ "768_512_95_4x4_wavernn_gru_", "768_512_95_4x4_coarseproj_",
887
+ "768_512_95_4x4_coarselogit_", "768_512_95_4x4_fineproj_",
888
+ "768_512_95_4x4_finelogit_", "lyra_conv1d_"};
889
+ const std::string kPath =
890
+ #if defined __arm__ || defined __aarch64__
891
+ "/data/local/tmp/";
892
+ #else
893
+ (ghc::filesystem::current_path() / kTestdataPath).string();
894
+ #endif
895
+ for (auto& layer_name : names) {
896
+ SparseLinearLayer<bfloat16, float> sparse_linear_layer;
897
+ auto status = LoadSparseLayer<bfloat16, float>(layer_name, /*zipped=*/true,
898
+ &sparse_linear_layer, kPath);
899
+ // If the files don't exist on the device we're running on, just skip this
900
+ // test and log that it was skipped.
901
+ if (skip_test(status, layer_name)) return;
902
+
903
+ int rows = sparse_linear_layer.rows();
904
+ int cols = sparse_linear_layer.cols();
905
+
906
+ MaskedLinearLayer<float> masked_linear_layer;
907
+ status = LoadMaskedLayer<float>(layer_name, /*zipped=*/true,
908
+ &masked_linear_layer, kPath);
909
+ if (skip_test(status, layer_name)) return;
910
+ masked_linear_layer.CastWeights<csrblocksparse::bfloat16>();
911
+
912
+ CacheAlignedVector<float> rhs(cols);
913
+ CacheAlignedVector<float> out_ref(rows);
914
+ CacheAlignedVector<float> out_spmv(rows);
915
+
916
+ rhs.FillRandom();
917
+ out_ref.FillZero();
918
+ out_spmv.FillZero();
919
+
920
+ std::array<bool, 2> use_relus = {false, true};
921
+ for (bool use_relu : use_relus) {
922
+ masked_linear_layer.SpMM_bias(rhs, &out_ref, use_relu);
923
+ sparse_linear_layer.SpMM_bias(rhs, &out_spmv, use_relu);
924
+
925
+ CheckResult(out_ref, out_spmv, cols);
926
+ }
927
+ }
928
+ }
929
+
930
+ TEST(CsrBlockSparseMatrix, ModelMatrices_float) {
931
+ std::vector<std::string> names = {
932
+ "768_512_95_4x4_wavernn_gru_", "768_512_95_4x4_coarseproj_",
933
+ "768_512_95_4x4_coarselogit_", "768_512_95_4x4_fineproj_",
934
+ "768_512_95_4x4_finelogit_", "lyra_conv1d_"};
935
+ const std::string kPath =
936
+ #if defined __arm__ || defined __aarch64__
937
+ "/data/local/tmp/";
938
+ #else
939
+ (ghc::filesystem::current_path() / kTestdataPath).string();
940
+ #endif
941
+ for (auto& layer_name : names) {
942
+ SparseLinearLayer<float, float> sparse_linear_layer;
943
+ auto status = LoadSparseLayer<float, float>(layer_name, /*zipped=*/true,
944
+ &sparse_linear_layer, kPath);
945
+ // If the files don't exist on the device we're running on, just skip this
946
+ // test and log that it was skipped.
947
+ if (skip_test(status, layer_name)) return;
948
+
949
+ int rows = sparse_linear_layer.rows();
950
+ int cols = sparse_linear_layer.cols();
951
+
952
+ MaskedLinearLayer<float> masked_linear_layer;
953
+ status = LoadMaskedLayer<float>(layer_name, /*zipped=*/true,
954
+ &masked_linear_layer, kPath);
955
+ if (skip_test(status, layer_name)) return;
956
+
957
+ CacheAlignedVector<float> rhs(cols);
958
+ CacheAlignedVector<float> out_ref(rows);
959
+ CacheAlignedVector<float> out_spmv(rows);
960
+
961
+ rhs.FillRandom();
962
+ out_ref.FillZero();
963
+ out_spmv.FillZero();
964
+
965
+ std::array<bool, 2> use_relus = {false, true};
966
+ for (bool use_relu : use_relus) {
967
+ masked_linear_layer.SpMM_bias(rhs, &out_ref, use_relu);
968
+ sparse_linear_layer.SpMM_bias(rhs, &out_spmv, use_relu);
969
+
970
+ CheckResult(out_ref, out_spmv, cols);
971
+ }
972
+ }
973
+ }
974
+
975
+ #undef SKIP_TEST
976
+
977
+ } // namespace csrblocksparse
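
For reference, every comparison in the tests above funnels through CheckResult from sparse_matmul/numerics/test_utils.h. The sketch below is not that implementation, only an illustration of the kind of element-wise relative-error check these tests rely on; the name OutputsMatch and the tolerance value are made up.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Illustrative stand-in for a relative-error comparison between a reference
// output and an optimized kernel's output. Both containers only need an
// operator[] returning something convertible to float.
template <typename VectorA, typename VectorB>
bool OutputsMatch(const VectorA& a, const VectorB& b, int size,
                  float tolerance = 1e-5f) {
  for (int i = 0; i < size; ++i) {
    const float expected = static_cast<float>(a[i]);
    const float actual = static_cast<float>(b[i]);
    const float denom = std::max(std::fabs(expected), 1e-6f);
    if (std::fabs(expected - actual) / denom > tolerance) {
      std::printf("Mismatch at %d: %f vs %f\n", i, expected, actual);
      return false;
    }
  }
  return true;
}
```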
sparse_matmul/layers/errno_mapping.cc ADDED
@@ -0,0 +1,195 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/layers/errno_mapping.h"
16
+
17
+ #include <cerrno>
+ #include <cstring>
+ #include <string>
18
+
19
+ #include "absl/base/attributes.h"
+ #include "absl/base/optimization.h"
+ #include "absl/strings/str_cat.h"
20
+
21
+ namespace csrblocksparse {
22
+
23
+ namespace {
24
+
25
+ absl::StatusCode ErrnoToCode(int error_number) {
26
+ switch (error_number) {
27
+ case 0:
28
+ return absl::StatusCode::kOk;
29
+ case EINVAL: // Invalid argument
30
+ case ENAMETOOLONG: // Filename too long
31
+ case E2BIG: // Argument list too long
32
+ case EDESTADDRREQ: // Destination address required
33
+ case EDOM: // Mathematics argument out of domain of function
34
+ case EFAULT: // Bad address
35
+ case EILSEQ: // Illegal byte sequence
36
+ case ENOPROTOOPT: // Protocol not available
37
+ case ENOSTR: // Not a STREAM
38
+ case ENOTSOCK: // Not a socket
39
+ case ENOTTY: // Inappropriate I/O control operation
40
+ case EPROTOTYPE: // Protocol wrong type for socket
41
+ case ESPIPE: // Invalid seek
42
+ return absl::StatusCode::kInvalidArgument;
43
+ case ETIMEDOUT: // Connection timed out
44
+ case ETIME: // Timer expired
45
+ return absl::StatusCode::kDeadlineExceeded;
46
+ case ENODEV: // No such device
47
+ case ENOENT: // No such file or directory
48
+ #ifdef ENOMEDIUM
49
+ case ENOMEDIUM: // No medium found
50
+ #endif
51
+ case ENXIO: // No such device or address
52
+ case ESRCH: // No such process
53
+ return absl::StatusCode::kNotFound;
54
+ case EEXIST: // File exists
55
+ case EADDRNOTAVAIL: // Address not available
56
+ case EALREADY: // Connection already in progress
57
+ #ifdef ENOTUNIQ
58
+ case ENOTUNIQ: // Name not unique on network
59
+ #endif
60
+ return absl::StatusCode::kAlreadyExists;
61
+ case EPERM: // Operation not permitted
62
+ case EACCES: // Permission denied
63
+ #ifdef ENOKEY
64
+ case ENOKEY: // Required key not available
65
+ #endif
66
+ case EROFS: // Read only file system
67
+ return absl::StatusCode::kPermissionDenied;
68
+ case ENOTEMPTY: // Directory not empty
69
+ case EISDIR: // Is a directory
70
+ case ENOTDIR: // Not a directory
71
+ case EADDRINUSE: // Address already in use
72
+ case EBADF: // Invalid file descriptor
73
+ #ifdef EBADFD
74
+ case EBADFD: // File descriptor in bad state
75
+ #endif
76
+ case EBUSY: // Device or resource busy
77
+ case ECHILD: // No child processes
78
+ case EISCONN: // Socket is connected
79
+ #ifdef EISNAM
80
+ case EISNAM: // Is a named type file
81
+ #endif
82
+ #ifdef ENOTBLK
83
+ case ENOTBLK: // Block device required
84
+ #endif
85
+ case ENOTCONN: // The socket is not connected
86
+ case EPIPE: // Broken pipe
87
+ #ifdef ESHUTDOWN
88
+ case ESHUTDOWN: // Cannot send after transport endpoint shutdown
89
+ #endif
90
+ case ETXTBSY: // Text file busy
91
+ #ifdef EUNATCH
92
+ case EUNATCH: // Protocol driver not attached
93
+ #endif
94
+ return absl::StatusCode::kFailedPrecondition;
95
+ case ENOSPC: // No space left on device
96
+ #ifdef EDQUOT
97
+ case EDQUOT: // Disk quota exceeded
98
+ #endif
99
+ case EMFILE: // Too many open files
100
+ case EMLINK: // Too many links
101
+ case ENFILE: // Too many open files in system
102
+ case ENOBUFS: // No buffer space available
103
+ case ENODATA: // No message is available on the STREAM read queue
104
+ case ENOMEM: // Not enough space
105
+ case ENOSR: // No STREAM resources
106
+ #ifdef EUSERS
107
+ case EUSERS: // Too many users
108
+ #endif
109
+ return absl::StatusCode::kResourceExhausted;
110
+ #ifdef ECHRNG
111
+ case ECHRNG: // Channel number out of range
112
+ #endif
113
+ case EFBIG: // File too large
114
+ case EOVERFLOW: // Value too large to be stored in data type
115
+ case ERANGE: // Result too large
116
+ return absl::StatusCode::kOutOfRange;
117
+ #ifdef ENOPKG
118
+ case ENOPKG: // Package not installed
119
+ #endif
120
+ case ENOSYS: // Function not implemented
121
+ case ENOTSUP: // Operation not supported
122
+ case EAFNOSUPPORT: // Address family not supported
123
+ #ifdef EPFNOSUPPORT
124
+ case EPFNOSUPPORT: // Protocol family not supported
125
+ #endif
126
+ case EPROTONOSUPPORT: // Protocol not supported
127
+ #ifdef ESOCKTNOSUPPORT
128
+ case ESOCKTNOSUPPORT: // Socket type not supported
129
+ #endif
130
+ case EXDEV: // Improper link
131
+ return absl::StatusCode::kUnimplemented;
132
+ case EAGAIN: // Resource temporarily unavailable
133
+ #ifdef ECOMM
134
+ case ECOMM: // Communication error on send
135
+ #endif
136
+ case ECONNREFUSED: // Connection refused
137
+ case ECONNABORTED: // Connection aborted
138
+ case ECONNRESET: // Connection reset
139
+ case EINTR: // Interrupted function call
140
+ #ifdef EHOSTDOWN
141
+ case EHOSTDOWN: // Host is down
142
+ #endif
143
+ case EHOSTUNREACH: // Host is unreachable
144
+ case ENETDOWN: // Network is down
145
+ case ENETRESET: // Connection aborted by network
146
+ case ENETUNREACH: // Network unreachable
147
+ case ENOLCK: // No locks available
148
+ case ENOLINK: // Link has been severed
149
+ #ifdef ENONET
150
+ case ENONET: // Machine is not on the network
151
+ #endif
152
+ return absl::StatusCode::kUnavailable;
153
+ case EDEADLK: // Resource deadlock avoided
154
+ #ifdef ESTALE
155
+ case ESTALE: // Stale file handle
156
+ #endif
157
+ return absl::StatusCode::kAborted;
158
+ case ECANCELED: // Operation cancelled
159
+ return absl::StatusCode::kCancelled;
160
+ default:
161
+ return absl::StatusCode::kUnknown;
162
+ }
163
+ }
164
+
165
+ // POSIX `strerror_r()` returns `int`.
166
+ ABSL_ATTRIBUTE_UNUSED std::string StrErrorResult(int result, const char* buffer,
167
+ int error_code) {
168
+ if (ABSL_PREDICT_FALSE(result != 0)) {
169
+ return absl::StrCat("Unknown error ", error_code);
170
+ }
171
+ return buffer;
172
+ }
173
+
174
+ // GNU `strerror_r()` returns `char*`.
175
+ ABSL_ATTRIBUTE_UNUSED std::string StrErrorResult(char* result,
176
+ const char* buffer,
177
+ int error_code) {
178
+ return result;
179
+ }
180
+
181
+ std::string StrError(int error_code) {
182
+ char message[256];
183
+ return StrErrorResult(strerror_r(error_code, message, sizeof(message)),
184
+ message, error_code);
185
+ }
186
+
187
+ } // namespace
188
+
189
+ absl::Status ErrnoToCanonicalStatus(int error_number,
190
+ absl::string_view message) {
191
+ return absl::Status(ErrnoToCode(error_number),
192
+ absl::StrCat(message, ": ", StrError(error_number)));
193
+ }
194
+
195
+ } // namespace csrblocksparse
sparse_matmul/layers/errno_mapping.h ADDED
@@ -0,0 +1,29 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_ERRNO_MAPPING_H_
16
+ #define THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_ERRNO_MAPPING_H_
17
+
18
+ #include "absl/status/status.h"
19
+ #include "absl/strings/string_view.h"
20
+
21
+ namespace csrblocksparse {
22
+
23
+ // Converts |error_number| value to absl::Status.
24
+ absl::Status ErrnoToCanonicalStatus(int error_number,
25
+ absl::string_view message);
26
+
27
+ } // namespace csrblocksparse
28
+
29
+ #endif // THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_ERRNO_MAPPING_H_
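
A minimal caller sketch for the helper declared above; OpenForRead and its arguments are hypothetical, and only the ErrnoToCanonicalStatus call is taken from this header.

```cpp
#include <cerrno>
#include <cstdio>

#include "absl/status/status.h"
#include "sparse_matmul/layers/errno_mapping.h"

// Converts a failed fopen() into a canonical absl::Status.
absl::Status OpenForRead(const char* path, std::FILE** file) {
  *file = std::fopen(path, "rb");
  if (*file == nullptr) {
    // errno was set by fopen(); the mapping above picks the matching
    // StatusCode (e.g. kNotFound for ENOENT) and appends the strerror text.
    return csrblocksparse::ErrnoToCanonicalStatus(errno, "Error opening file");
  }
  return absl::OkStatus();
}
```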
sparse_matmul/layers/masked_sparse_matrix.h ADDED
@@ -0,0 +1,206 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_MASKED_SPARSE_MATRIX_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_MASKED_SPARSE_MATRIX_H_
19
+
20
+ #include <algorithm>
21
+ #include <cstdio>
22
+ #include <numeric>
23
+ #include <vector>
24
+
25
+ #include "absl/strings/str_format.h"
26
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
27
+
28
+ namespace csrblocksparse {
29
+
30
+ // MaskedSparseMatrix serves two purposes:
31
+ // 1) It is useful as a reference implementation of SpMV for correctness
32
+ // checking the much more complicated implementations in CsrBlockSparseMatrix.
33
+ // 2) This is the format in which sparse matrices are represented after pruning
34
+ // in TF. This class provides a bridge to getting these parameters into
35
+ // a compressed form suitable for computation and serialization.
36
+ //
37
+ // MaskedSparseMatrix<float> matrix(rows, cols, mask_from_tf, values_from_tf);
38
+ // CsrBlockSparseMatrix<bfloat16, float> csr_matrix(matrix);
39
+ // csr_matrix.SpMM_bias(rhs, bias, &out);
40
+ template <typename T>
41
+ class MaskedSparseMatrix {
42
+ public:
43
+ MaskedSparseMatrix() {}
44
+
45
+ // Construct a MaskedSparseMatrix of the given size, sparsity and block size.
46
+ // This is mainly useful for testing.
47
+ MaskedSparseMatrix(int rows, int cols, float sparsity, int block_height = 1,
48
+ int block_width = 1, float constant = 1.f,
49
+ bool random = true)
50
+ : rows_(rows), cols_(cols), sparsity_(sparsity) {
51
+ CHECK_EQ(rows % block_height, 0);
52
+ CHECK_EQ(cols % block_width, 0);
53
+
54
+ init(sparsity, block_height, block_width, constant, random);
55
+ }
56
+
57
+ // Construct from an existing mask and values (most likely from a TF model).
58
+ template <typename MaskType>
59
+ MaskedSparseMatrix(int rows, int cols, const MaskType* mask, const T* values)
60
+ : rows_(rows), cols_(cols) {
61
+ mask_.resize(rows * cols);
62
+ values_.resize(rows * cols);
63
+ std::copy_n(mask, rows * cols, mask_.begin());
64
+ std::copy_n(values, rows * cols, values_.begin());
65
+ sparsity_ =
66
+ 1.f - std::accumulate(mask_.begin(), mask_.end(), 0.f) / mask_.size();
67
+ }
68
+
69
+ const std::vector<int>& mask() const { return mask_; }
70
+ const std::vector<T>& values() const { return values_; }
71
+ T* data() { return values_.data(); }
72
+ const T* data() const { return values_.data(); }
73
+
74
+ int rows() const { return rows_; }
75
+ int cols() const { return cols_; }
76
+ float sparsity() const { return sparsity_; }
77
+
78
+ void Print() const {
79
+ absl::PrintF("-------Values---------\n");
80
+ for (int r = 0; r < rows_; ++r) {
81
+ for (int c = 0; c < cols_; ++c) {
82
+ absl::PrintF("%+6.3f ", static_cast<float>(values_[r * cols_ + c]));
83
+ }
84
+ absl::PrintF("\n");
85
+ }
86
+ absl::PrintF("-------Mask---------\n");
87
+ for (int r = 0; r < rows_; ++r) {
88
+ for (int c = 0; c < cols_; ++c) {
89
+ printf("%2d ", mask_[r * cols_ + c]);
90
+ }
91
+ absl::PrintF("\n");
92
+ }
93
+ }
94
+
95
+ // This routine is useful for rounding the possibly higher precision values
96
+ // stored in this class to a lower precision, so that correctness checks
97
+ // between this class and CSRBlockSparseMatrix can have a tighter tolerance.
98
+ template <typename U>
99
+ void CastWeights() {
100
+ for (int i = 0; i < values_.size(); ++i) {
101
+ values_[i] = static_cast<T>(U(values_[i]));
102
+ }
103
+ }
104
+
105
+ // Only meant for correctness checking.
106
+ // RhsClassType is meant to be either CacheAlignedVector OR
107
+ // FatCacheAlignedVector.
108
+ // The weight matrix is ROW MAJOR and RhsClassType is COLUMN MAJOR.
109
+ // |bias| is broadcast if |rhs| has more than one column.
110
+ template <typename RhsClassType, typename BiasType, typename OutClassType,
111
+ typename RhsType = typename RhsClassType::value_type,
112
+ typename OutType = typename OutClassType::value_type>
113
+ void SpMM_bias(const RhsClassType& rhs,
114
+ const CacheAlignedVector<BiasType>& bias, OutClassType* out,
115
+ bool relu = false) {
116
+ for (int r = 0; r < rows_; ++r) {
117
+ for (int n = 0; n < rhs.cols(); ++n) {
118
+ float sum = 0.f;
119
+ const RhsType* rhs_ptr = rhs.data() + n * rhs.rows();
120
+ OutType* out_ptr = out->data() + n * out->rows();
121
+ const int* mask_ptr = mask_.data() + r * cols_;
122
+ const T* value_ptr = values_.data() + r * cols_;
123
+ for (int c = 0; c < cols_; ++c) {
124
+ sum += mask_ptr[c] * static_cast<float>(value_ptr[c]) *
125
+ static_cast<float>(rhs_ptr[c]);
126
+ }
127
+ out_ptr[r] = static_cast<OutType>(
128
+ relu ? std::max(sum + static_cast<float>(bias[r]), 0.f)
129
+ : sum + static_cast<float>(bias[r]));
130
+ }
131
+ }
132
+ }
133
+
134
+ private:
135
+ // Generate a random matrix with the specified sparsity.
136
+ // Useful for testing.
137
+ void init(float sparsity, int block_height, int block_width, float constant,
138
+ bool random = true) {
139
+ int reduced_rows = rows_ / block_height;
140
+ int reduced_cols = cols_ / block_width;
141
+ mask_.resize(rows_ * cols_, 0);
142
+
143
+ // Fill with non-zero value to make sure masking works.
144
+ values_.resize(rows_ * cols_, static_cast<T>(2.f));
145
+
146
+ std::mt19937 generator(0);
147
+ std::uniform_real_distribution<float> dist_sparsity;
148
+ std::uniform_real_distribution<float> dist_value(-1.f, 1.f);
149
+ int nnz = 0;
150
+ while (nnz == 0) {
151
+ for (int r = 0; r < reduced_rows; ++r) {
152
+ for (int c = 0; c < reduced_cols; ++c) {
153
+ if (dist_sparsity(generator) > sparsity) {
154
+ nnz++;
155
+ for (int i = 0; i < block_height; ++i) {
156
+ for (int j = 0; j < block_width; ++j) {
157
+ mask_[(r * block_height + i) * cols_ + block_width * c + j] = 1;
158
+ values_[(r * block_height + i) * cols_ + block_width * c + j] =
159
+ static_cast<T>(random ? dist_value(generator) : constant);
160
+ }
161
+ }
162
+ }
163
+ }
164
+ }
165
+ }
166
+ }
167
+
168
+ std::vector<int> mask_;
169
+ std::vector<T> values_;
170
+ int rows_;
171
+ int cols_;
172
+ float sparsity_;
173
+ };
174
+
175
+ template <typename T>
176
+ class MaskedLinearLayer {
177
+ public:
178
+ MaskedLinearLayer(MaskedSparseMatrix<T>&& weights,
179
+ CacheAlignedVector<T>&& bias)
180
+ : weights_(std::move(weights)), bias_(std::move(bias)) {}
181
+
182
+ MaskedLinearLayer() {}
183
+
184
+ template <typename U>
185
+ void CastWeights() {
186
+ weights_.template CastWeights<U>();
187
+ }
188
+
189
+ // Does Ax + b where A is a masked sparse ROW MAJOR matrix and
190
+ // x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
191
+ // broadcast if |rhs| has more than one column.
192
+ template <typename FatVector>
193
+ void SpMM_bias(const FatVector& rhs, FatVector* out, bool relu = false) {
194
+ static_assert(std::is_same<typename FatVector::value_type, T>::value,
195
+ "FatVector value_type must match masked_linear_layer type");
196
+ weights_.SpMM_bias(rhs, bias_, out, relu);
197
+ }
198
+
199
+ private:
200
+ MaskedSparseMatrix<T> weights_;
201
+ CacheAlignedVector<T> bias_;
202
+ };
203
+
204
+ } // namespace csrblocksparse
205
+
206
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_MASKED_SPARSE_MATRIX_H_
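
To make the intended workflow above concrete, here is a sketch that pits the masked reference against the optimized CSR block-sparse kernel, using only constructors and helpers that appear elsewhere in this diff; the sizes, sparsity, and function name are arbitrary, and the final comparison would go through CheckResult as in the tests.

```cpp
#include "sparse_matmul/layers/csr_blocksparse_matrix.h"
#include "sparse_matmul/layers/masked_sparse_matrix.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"

void CompareReferenceAndCsr() {
  using csrblocksparse::CacheAlignedVector;
  constexpr int kRows = 64, kCols = 64;
  // Random 64x64 matrix, 90% sparse, 4x4 blocks.
  csrblocksparse::MaskedSparseMatrix<float> reference(
      kRows, kCols, /*sparsity=*/0.9f, /*block_height=*/4, /*block_width=*/4);
  csrblocksparse::CsrBlockSparseMatrix<float, float> csr(reference);
  csr.PrepareForThreads(1);

  CacheAlignedVector<float> rhs(kCols), bias(kRows);
  CacheAlignedVector<float> out_ref(kRows), out_csr(kRows);
  rhs.FillRandom();
  bias.FillRandom();
  out_ref.FillZero();
  out_csr.FillZero();

  reference.SpMM_bias(rhs, bias, &out_ref);  // Dense masked reference SpMV.
  csr.SpMM_bias(rhs, bias, &out_csr);        // Optimized block-sparse kernel.
  // out_ref and out_csr should agree to within numeric tolerance.
}
```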
sparse_matmul/layers/read_array_ifstream.h ADDED
@@ -0,0 +1,66 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ // Low-level array reading function using std::ifstream.
18
+
19
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_READ_ARRAY_IFSTREAM_H_
20
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_READ_ARRAY_IFSTREAM_H_
21
+
22
+ #include <algorithm>
+ #include <cstdint>
23
+ #include <fstream>
24
+ #include <sstream>
25
+ #include <string>
+ #include <vector>
26
+
27
+ #include "absl/status/status.h"
28
+ #include "absl/strings/substitute.h"
29
+ #include "glog/logging.h"
+ #include "include/ghc/filesystem.hpp"
30
+
31
+ namespace csrblocksparse {
32
+ namespace detail {
33
+
34
+ template <typename T>
35
+ absl::Status ReadArrayIfstream(const std::string& file_name,
36
+ const std::string& path, std::vector<T>* array,
37
+ int64_t* length) {
38
+ ghc::filesystem::path complete_path(path);
39
+ complete_path /= file_name;
40
+ std::ifstream in_stream(complete_path.u8string(), std::ios::binary);
41
+ if (!in_stream.is_open()) {
42
+ return absl::UnknownError(
43
+ absl::Substitute("Error opening $0", complete_path.string()));
44
+ }
45
+
46
+ std::stringstream buffer;
47
+ buffer << in_stream.rdbuf();
48
+ if (buffer.str().empty()) {
49
+ LOG(ERROR) << "File " << complete_path << " was empty.";
50
+ return absl::UnknownError(
51
+ absl::Substitute("File $0 was empty", complete_path.string()));
52
+ }
53
+ std::string contents = buffer.str();
54
+ *length = contents.length();
55
+ int64_t elem = (*length + sizeof(T) - 1) / sizeof(T);
56
+ array->resize(elem);
57
+ std::move(contents.begin(), contents.end(),
58
+ reinterpret_cast<char*>(array->data()));
59
+
60
+ return absl::OkStatus();
61
+ }
62
+
63
+ } // namespace detail
64
+ } // namespace csrblocksparse
65
+
66
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_READ_ARRAY_IFSTREAM_H_
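
A short usage sketch, assuming only the detail::ReadArrayIfstream template above; the file name and directory here are hypothetical placeholders.

```cpp
#include <cstdint>
#include <vector>

#include "absl/status/status.h"
#include "sparse_matmul/layers/read_array_ifstream.h"

absl::Status LoadRawFloats(std::vector<float>* values) {
  int64_t byte_length = 0;
  // Reads the raw bytes of "example_weights.raw" from the given directory and
  // reinterprets them as a float array; |byte_length| receives the file size.
  return csrblocksparse::detail::ReadArrayIfstream<float>(
      "example_weights.raw", "/tmp/testdata", values, &byte_length);
}
```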
sparse_matmul/layers/sparse_linear_layer.h ADDED
@@ -0,0 +1,365 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
19
+
20
+ #include <cstdint>
+ #include <random>
+ #include <vector>
21
+
22
+ #include "absl/memory/memory.h"
23
+ #include "glog/logging.h"
24
+ #include "sparse_matmul/layers/csr_blocksparse_matrix.h"
25
+ #include "sparse_matmul/layers/masked_sparse_matrix.h"
26
+ #include "sparse_matmul/numerics/type_utils.h"
27
+ #include "sparse_matmul/os/coop_threads.h"
28
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
29
+
30
+ namespace csrblocksparse {
31
+
32
+ template <typename WeightType, typename RhsType,
33
+ typename BiasType = typename TypeOfProduct<WeightType, RhsType>::type,
34
+ typename DeltaType = int16_t>
35
+ class SparseLinearLayer {
36
+ public:
37
+ SparseLinearLayer() {}
38
+
39
+ SparseLinearLayer(CsrBlockSparseMatrix<WeightType, RhsType>&& sparse_matrix,
40
+ CacheAlignedVector<BiasType>&& bias)
41
+ : sparse_matrix_(std::move(sparse_matrix)), full_bias_(std::move(bias)) {
42
+ CHECK_EQ(sparse_matrix_.rows(), full_bias_.size());
43
+ // Some kernels expect that the bias is divided by 4, so we store a second
44
+ // copy of a quarter of the bias.
45
+ // TODO(b/189958858): Remove the quartered bias if it can be done without
46
+ // loss of speed, and rename the |full_bias_| member back to |bias_|.
47
+ bias_ = full_bias_;
48
+ for (int i = 0; i < bias_.size(); ++i) {
49
+ bias_[i] = static_cast<BiasType>(.25f * static_cast<float>(bias_[i]));
50
+ }
51
+ }
52
+ SparseLinearLayer(
53
+ const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
54
+ *this = src;
55
+ }
56
+ SparseLinearLayer& operator=(
57
+ const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
58
+ sparse_matrix_ = src.sparse_matrix_;
59
+ bias_ = src.bias_;
60
+ full_bias_ = src.full_bias_;
61
+ mid_output_ = src.mid_output_;
62
+ thread_layers_ = src.thread_layers_;
63
+ num_threads_ = src.num_threads_;
64
+ if (src.split_pc_) {
65
+ split_pc_ = absl::make_unique<ProducerConsumer>(
66
+ src.split_pc_->num_producers(), src.split_pc_->num_consumers());
67
+ }
68
+ return *this;
69
+ }
70
+
71
+ // Does Ax + b where A is a block sparse compressed sparse row matrix and
72
+ // x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
73
+ // broadcast if rhs has more than one column.
74
+ template <typename RhsClassType, typename OutType>
75
+ void SpMM_bias(const RhsClassType& rhs, OutType* out, bool relu = false,
76
+ int tid = 0, SpinBarrier* barrier = nullptr) const {
77
+ static_assert(
78
+ std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
79
+ sparse_matrix_.SpMM_bias(rhs, bias_, out, relu, tid, barrier);
80
+ }
81
+ // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
82
+ // and then samples from the output (softmax distribution) layer.
83
+ template <typename RhsClassType, typename OutType>
84
+ int SpMM_bias_Sample(const RhsClassType& rhs, OutType* out, float temperature,
85
+ int tid, SpinBarrier* barrier, std::minstd_rand* gen,
86
+ CacheAlignedVector<float>* scratch) const {
87
+ static_assert(
88
+ std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
89
+ return sparse_matrix_.SpMM_bias_Sample(rhs, bias_, out, temperature, tid,
90
+ barrier, gen, scratch);
91
+ }
92
+ template <typename RhsClassType, typename OutType>
93
+ void MatVec(const RhsClassType& rhs, bool relu, int tid, int replicas,
94
+ int output_stride, OutType* output,
95
+ SpinBarrier* barrier = nullptr) {
96
+ static_assert(
97
+ std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
98
+ #ifdef __AVX2__
99
+ if (block_width() == 4 && (block_height() == 4 || block_height() == 8) &&
100
+ !IsCustomFloatType<WeightType>::value) {
101
+ if (!IsSplit()) {
102
+ sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu,
103
+ tid, replicas, output_stride, output->data());
104
+ if (barrier != nullptr) barrier->barrier();
105
+ return;
106
+ }
107
+ // NOTE: Until the quartered bias is removed it is a bad idea to split
108
+ // for ARM in the same way, as we would have to quarter the output of
109
+ // the first part of the split before running the second part.
110
+ // Signal completion of the previous MatVec.
111
+ split_pc_->produce();
112
+ PartLinearLayer& thread_part = thread_layers_[tid];
113
+ auto offset_output =
114
+ sparse_matrix_.thread_bounds().OffsetOutput(output->data(), tid);
115
+ auto mid_output =
116
+ sparse_matrix_.thread_bounds().OffsetOutput(mid_output_.data(), tid);
117
+ auto offset_bias = sparse_matrix_.thread_bounds().OffsetOutput(
118
+ mid_output_.cast_data(), tid);
119
+ // We can continue to consume the data that this thread produced and
120
+ // compute just the |self_matrix| part.
121
+ // No |relu| or |replicas|, as this is only a partial matmul.
122
+ // |tid| is always zero because the matrix has been split by tid.
123
+ thread_part.self_matrix.MatVec(
124
+ rhs.cast_data(), thread_part.full_bias.cast_data(), /*relu=*/false,
125
+ /*tid=*/0, /*replicas=*/1, output_stride, mid_output);
126
+ // We have to wait for the other threads to finish working on the previous
127
+ // MatMul before consuming the rest of |rhs|.
128
+ split_pc_->consume();
129
+ thread_part.other_matrix.MatVec(rhs.cast_data(), offset_bias, relu,
130
+ /*tid=*/0, replicas, output_stride,
131
+ offset_output);
132
+ return;
133
+ }
134
+ #endif
135
+ DCHECK_EQ(replicas, 1) << "Must have single replica for SpMM API";
136
+ if (IsSplit()) {
137
+ // Generics aren't setup to use a split matrix. This will be inefficient.
138
+ split_pc_->produce();
139
+ split_pc_->consume();
140
+ }
141
+ if (block_height() == 8) {
142
+ // We are currently forced to use MatVec generics for this case.
143
+ LOG(WARNING) << "Need to implement MatVec for 8x4 for non-AVX2 targets!!";
144
+ sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu, tid,
145
+ replicas, output_stride, output->data());
146
+ if (barrier != nullptr) barrier->barrier();
147
+ } else {
148
+ sparse_matrix_.SpMM_bias(rhs, bias_, output, relu, tid, barrier);
149
+ }
150
+ }
151
+
152
+ int rows() const { return sparse_matrix_.rows(); }
153
+ int cols() const { return sparse_matrix_.cols(); }
154
+ float sparsity() const { return sparse_matrix_.sparsity(); }
155
+ int block_width() const { return sparse_matrix_.block_width(); }
156
+ int block_height() const { return sparse_matrix_.block_height(); }
157
+ int num_threads() const { return sparse_matrix_.num_threads(); }
158
+ const CacheAlignedVector<BiasType>& bias() const { return bias_; }
159
+ const std::vector<int>& split_points() const {
160
+ return sparse_matrix_.split_points();
161
+ }
162
+ bool IsSplit() const {
163
+ return !thread_layers_.empty() && split_pc_ != nullptr;
164
+ }
165
+
166
+ std::size_t bytes() const { return sparse_matrix_.bytes() + bias_.bytes(); }
167
+ void Print() const {
168
+ printf("Matrix\n");
169
+ sparse_matrix_.Print();
170
+ printf("Bias\n");
171
+ bias_.Print();
172
+ }
173
+
174
+ // Combines adjacent row blocks, doubling the block height.
175
+ // This necessarily involves adding zero weights where the blocks don't align
176
+ // across adjacent pairs of rows, so use with caution, as the resulting matrix
177
+ // is most likely to run slower if very sparse to begin with.
178
+ // In the few cases where the blocks do mostly align, the resulting matmul
179
+ // could be much faster, as the number of reads of the rhs will be halved.
180
+ void DoubleBlockHeight() { sparse_matrix_.DoubleBlockHeight(); }
181
+
182
+ // Cache_line_size is provided only for testing. Normally uses a value for
183
+ // the current architecture.
184
+ int PrepareForThreads(int num_threads, int cache_line_size = -1) {
185
+ num_threads_ = num_threads;
186
+ if (num_threads_ > 1) {
187
+ split_pc_ =
188
+ absl::make_unique<ProducerConsumer>(num_threads_, num_threads_);
189
+ } else {
190
+ split_pc_.reset(nullptr);
191
+ }
192
+ return sparse_matrix_.PrepareForThreads(num_threads, cache_line_size);
193
+ }
194
+
195
+ // Partitions the matrix into pieces by thread.
196
+ // In this matrix, we can go ahead and calculate the part that only depends
197
+ // on rhs inputs that were generated by this thread in the previous matvec,
198
+ // without having to use any thread synchronization, and only after that do we
199
+ // have to wait for the other threads to finish the previous matvec.
200
+ // So we split the matrix using the |split_points| from the previous matrix
201
+ // into 2 * |num_threads_| pieces: self and other for each thread, being the
202
+ // parts that can be calculated before and after the other threads have
203
+ // completed their calculation of the previous matvec.
204
+ // We then have to use a ProducerConsumer lock instead of a SpinBarrier to
205
+ // synchronize the data produced by the other threads.
206
+ void SliceForThreads(const std::vector<int>& split_points) {
207
+ thread_layers_.clear();
208
+ thread_layers_.reserve(num_threads_);
209
+ LOG(INFO) << "Slicing " << rows() << "x" << cols() << " matrix for "
210
+ << num_threads_ << " threads";
211
+ for (int tid = 0; tid < num_threads_; ++tid) {
212
+ thread_layers_.emplace_back(
213
+ sparse_matrix_, full_bias_, bias_, tid,
214
+ split_points[tid] * sparse_matrix_.block_height(),
215
+ split_points[tid + 1] * sparse_matrix_.block_height());
216
+ }
217
+ mid_output_ =
218
+ std::move(csrblocksparse::CacheAlignedVector<BiasType>(rows()));
219
+ mid_output_.FillZero();
220
+ }
221
+
222
+ // Splits the layer by inputs into 2 equal pieces. Each of the resulting
223
+ // layers should be computed independently on the first and second halves of
224
+ // the inputs respectively and the results added to achieve the same effect
225
+ // as the original layer.
226
+ void SplitInputs(
227
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
228
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
229
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
230
+ sparse_matrix_.SplitByColumn(0, sparse_matrix_.cols() / 2));
231
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
232
+ sparse_matrix_.SplitByColumn(sparse_matrix_.cols() / 2,
233
+ sparse_matrix_.cols()));
234
+ *part1 =
235
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
236
+ std::move(matrix1),
237
+ std::move(CacheAlignedVector<BiasType>(full_bias_))));
238
+ CacheAlignedVector<BiasType> bias2(sparse_matrix_.rows());
239
+ bias2.FillZero();
240
+ *part2 =
241
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
242
+ std::move(matrix2), std::move(bias2)));
243
+ }
244
+
245
+ // Splits the layer by outputs into 2 equal pieces. Each of the resulting
246
+ // layers should be computed independently on the full inputs and the results
247
+ // concatenated to achieve the same effect as the original layer.
248
+ void SplitOutputs(
249
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
250
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
251
+ LOG(INFO) << "input rows=" << sparse_matrix_.rows()
252
+ << ", cols=" << sparse_matrix_.cols();
253
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
254
+ sparse_matrix_.SplitByRow(0, sparse_matrix_.rows() / 2));
255
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix2(sparse_matrix_.SplitByRow(
256
+ sparse_matrix_.rows() / 2, sparse_matrix_.rows()));
257
+ CacheAlignedVector<BiasType> bias1(full_bias_, 0, full_bias_.size() / 2);
258
+ *part1 =
259
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
260
+ std::move(matrix1), std::move(bias1)));
261
+ CacheAlignedVector<BiasType> bias2(full_bias_, full_bias_.size() / 2,
262
+ full_bias_.size());
263
+ *part2 =
264
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
265
+ std::move(matrix2), std::move(bias2)));
266
+ }
267
+
268
+ private:
269
+ // Simple struct to hold a partitioned layer.
270
+ struct PartLinearLayer {
271
+ // The original matrix is first split by row to generate only the outputs
272
+ // for the given tid. The |row_sub_matrix| is then split by column into two
273
+ // partitions:
274
+ // self is the part for which the rhs elements in [|start_col|, |end_col|)
275
+ // were generated by this thread in some previous matmul.
276
+ // |other| is the rest of the columns that require rhs elements from other
277
+ // threads.
278
+ // NOTE that |start_col|, |end_col| are in raw columns, not blocks.
279
+ PartLinearLayer(const CsrBlockSparseMatrix<WeightType, RhsType>& matrix,
280
+ const CacheAlignedVector<BiasType>& bias,
281
+ const CacheAlignedVector<BiasType>& bias_4, int tid,
282
+ int start_col, int end_col) {
283
+ int block_height = matrix.block_height();
284
+ // Split the input matrix by row, selecting only the rows relevant to
285
+ // thread tid.
286
+ int start_row = matrix.split_points()[tid] * block_height;
287
+ int end_row = matrix.split_points()[tid + 1] * block_height;
288
+ LOG(INFO) << "input cols [" << start_col << "," << end_col << ") rows ["
289
+ << start_row << "," << end_row << ")";
290
+ CsrBlockSparseMatrix<WeightType, RhsType> row_sub_matrix =
291
+ matrix.SplitByRow(start_row, end_row);
292
+ // Partition into the columns that use rhs elements that thread tid
293
+ // produced in a previous matmul, and the other rhs elements.
294
+ // NOTE that we pass |keep_rhs_size|=true so that each matrix can operate on
295
+ // the same rhs input vector. The self matrix just guarantees not to
296
+ // access any of the elements that are generated by another thread.
297
+ self_matrix = std::move(row_sub_matrix.SplitByColumn(
298
+ start_col, end_col, /*keep_rhs_size=*/true));
299
+ self_matrix.PrepareForThreads(1);
300
+ // The reversed start and end slice out the complement of [start, end).
301
+ other_matrix = std::move(row_sub_matrix.SplitByColumn(
302
+ end_col, start_col, /*keep_rhs_size=*/true));
303
+ other_matrix.PrepareForThreads(1);
304
+ full_bias =
305
+ std::move(CacheAlignedVector<BiasType>(bias, start_row, end_row));
306
+ // TODO(b/189958858): Eliminate the quarter bias from all the code.
307
+ quarter_bias =
308
+ std::move(CacheAlignedVector<BiasType>(bias_4, start_row, end_row));
309
+ }
310
+ // The part of the matrix that only depends on this thread for rhs inputs.
311
+ CsrBlockSparseMatrix<WeightType, RhsType> self_matrix;
312
+ CacheAlignedVector<BiasType> full_bias;
313
+ CacheAlignedVector<BiasType> quarter_bias;
314
+ // The part of the matrix that uses rhs inputs from other threads.
315
+ CsrBlockSparseMatrix<WeightType, RhsType> other_matrix;
316
+ };
317
+ CsrBlockSparseMatrix<WeightType, RhsType, DeltaType> sparse_matrix_;
318
+ CacheAlignedVector<BiasType> bias_;
319
+ CacheAlignedVector<BiasType> full_bias_;
320
+ // Output from the self_matrix that will be given to |other_matrix| as bias.
321
+ CacheAlignedVector<BiasType> mid_output_;
322
+ // One partitioned pair of matrices for each thread.
323
+ std::vector<PartLinearLayer> thread_layers_;
324
+ // Producer-consumer lock used to wait between computing |self_matrix| and
325
+ // |other_matrix| for the other threads to finish the *previous* matvec.
326
+ std::unique_ptr<ProducerConsumer> split_pc_;
327
+ int num_threads_ = 0;
328
+ };
329
+
330
+ template <typename WeightType, typename RhsType>
331
+ SparseLinearLayer<WeightType, RhsType> CreateRandomLayer(int rows, int cols,
332
+ float sparsity,
333
+ int block_height = 1,
334
+ int block_width = 1) {
335
+ typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
336
+ CacheAlignedVector<BiasType> bias(rows);
337
+ bias.FillRandom();
338
+
339
+ auto masked_matrix = MaskedSparseMatrix<float>(rows, cols, sparsity,
340
+ block_height, block_width);
341
+ auto sparse_matrix = CsrBlockSparseMatrix<WeightType, RhsType>(masked_matrix);
342
+
343
+ return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
344
+ std::move(bias));
345
+ }
346
+
347
+ template <typename WeightType, typename RhsType>
348
+ SparseLinearLayer<WeightType, RhsType> CreateConstantLayer(
349
+ int rows, int cols, float sparsity, float constant = 1.f) {
350
+ typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
351
+ CacheAlignedVector<BiasType> bias(rows);
352
+ bias.FillOnes();
353
+
354
+ MaskedSparseMatrix<float> masked_matrix(rows, cols, sparsity,
355
+ /*block_height=*/1, /*block_width=*/1,
356
+ constant, /*random=*/false);
357
+ CsrBlockSparseMatrix<WeightType, RhsType> sparse_matrix(masked_matrix);
358
+
359
+ return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
360
+ std::move(bias));
361
+ }
362
+
363
+ } // namespace csrblocksparse
364
+
365
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
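
As a quick orientation for the layer API above, a single-threaded sketch built on the CreateRandomLayer helper from this header; the sizes, sparsity, and function name are arbitrary.

```cpp
#include "sparse_matmul/layers/sparse_linear_layer.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"

void RunRandomLayerOnce() {
  using csrblocksparse::CacheAlignedVector;
  // 256x256 layer, 95% sparse, 4x4 blocks, float weights and float inputs.
  auto layer = csrblocksparse::CreateRandomLayer<float, float>(
      /*rows=*/256, /*cols=*/256, /*sparsity=*/0.95f,
      /*block_height=*/4, /*block_width=*/4);
  layer.PrepareForThreads(1);

  CacheAlignedVector<float> rhs(layer.cols());
  CacheAlignedVector<float> out(layer.rows());
  rhs.FillRandom();
  out.FillZero();
  // out = relu(A * rhs + bias), computed on thread 0.
  layer.SpMM_bias(rhs, &out, /*relu=*/true, /*tid=*/0);
}
```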
sparse_matmul/layers/sparse_linear_layer_test.cc ADDED
@@ -0,0 +1,187 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/layers/sparse_linear_layer.h"
16
+
17
+ #include "gmock/gmock.h"
18
+ #include "gtest/gtest.h"
19
+ #include "sparse_matmul/numerics/test_utils.h"
20
+
21
+ namespace csrblocksparse {
22
+ namespace {
23
+
24
+ constexpr int kBlockSize = 4;
25
+ constexpr int kSize = 256;
26
+ constexpr int kNumThreads = 4;
27
+ constexpr int kCols = 1;
28
+
29
+ void SlicedThreadBody(SpinBarrier* spin_barrier, int tid,
30
+ const FatCacheAlignedVector<float>& rhs,
31
+ SparseLinearLayer<float, float>* sparse_linear_layer,
32
+ FatCacheAlignedVector<float>* out, bool use_relu) {
33
+ sparse_linear_layer->MatVec(rhs, use_relu, tid, /*replicas=*/1,
34
+ /*output_stride=*/0, out);
35
+ spin_barrier->barrier();
36
+ }
37
+
38
+ // Tests that a Layer that has been SliceForThreads computes the same result as
39
+ // the original layer. This is a basic test that all the slicing didn't mess up
40
+ // any of the computations.
41
+ TEST(CsrBlockSparseMatrix, SliceForThreads) {
42
+ MaskedSparseMatrix<float> matrix(kSize, kSize, 0.95, kBlockSize, kBlockSize);
43
+ FatCacheAlignedVector<float> rhs(kSize, kCols);
44
+ CacheAlignedVector<float> bias(kSize);
45
+ FatCacheAlignedVector<float> out1(kSize, kCols);
46
+
47
+ bias.FillRandom();
48
+ rhs.FillRandom();
49
+ out1.FillZero();
50
+ FatCacheAlignedVector<float> out_reference = out1;
51
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
52
+ SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
53
+ std::move(bias));
54
+ sparse_linear_layer.PrepareForThreads(1);
55
+ sparse_linear_layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
56
+ /*output_stride=*/0, &out_reference);
57
+ std::vector<int> fake_split_points = {0, 48 / kBlockSize, 128 / kBlockSize,
58
+ 208 / kBlockSize, kSize / kBlockSize};
59
+ sparse_linear_layer.PrepareForThreads(kNumThreads);
60
+ sparse_linear_layer.SliceForThreads(fake_split_points);
61
+ csrblocksparse::LaunchOnThreadsWithBarrier(kNumThreads, SlicedThreadBody, rhs,
62
+ &sparse_linear_layer, &out1,
63
+ /*relu=*/true);
64
+
65
+ CheckResult(out_reference, out1, kCols);
66
+ }
67
+
68
+ void LayersThreadBody(SpinBarrier* spin_barrier, int tid,
69
+ const FatCacheAlignedVector<float>& rhs,
70
+ SparseLinearLayer<float, float>* sparse_linear_layer1,
71
+ SparseLinearLayer<float, float>* sparse_linear_layer2,
72
+ FatCacheAlignedVector<float>* out1,
73
+ FatCacheAlignedVector<float>* out2, bool use_relu) {
74
+ sparse_linear_layer1->MatVec(rhs, use_relu, tid, /*replicas=*/1,
75
+ /*output_stride=*/0, out1);
76
+ // NOTE no barrier here!
77
+ sparse_linear_layer2->MatVec(*out1, use_relu, tid, /*replicas=*/1,
78
+ /*output_stride=*/0, out2);
79
+ spin_barrier->barrier();
80
+ }
81
+
82
+ // Tests that a pair of layers computes the same result whether or not the
83
+ // second layer has been SliceForThreads. This is a more critical test that
84
+ // the replacement of barriers with producer-consumer locks works.
85
+ // Must be run with tsan to really test it properly.
86
+ TEST(CsrBlockSparseMatrix, SliceForThreadsLayers) {
87
+ MaskedSparseMatrix<float> matrix1(kSize, kSize, 0.95, kBlockSize, kBlockSize);
88
+ FatCacheAlignedVector<float> rhs(kSize, kCols);
89
+ CacheAlignedVector<float> bias1(kSize);
90
+ FatCacheAlignedVector<float> out1(kSize, kCols);
91
+ MaskedSparseMatrix<float> matrix2(kSize, kSize, 0.95, kBlockSize, kBlockSize);
92
+ CacheAlignedVector<float> bias2(kSize);
93
+ FatCacheAlignedVector<float> out2(kSize, kCols);
94
+
95
+ bias1.FillRandom();
96
+ rhs.FillRandom();
97
+ bias2.FillRandom();
98
+ out1.FillZero();
99
+ out2.FillZero();
100
+ FatCacheAlignedVector<float> out_reference = out2;
101
+ CsrBlockSparseMatrix<float, float> sparse_matrix1(matrix1);
102
+ SparseLinearLayer<float, float> layer1(std::move(sparse_matrix1),
103
+ std::move(bias1));
104
+ CsrBlockSparseMatrix<float, float> sparse_matrix2(matrix2);
105
+ SparseLinearLayer<float, float> layer2(std::move(sparse_matrix2),
106
+ std::move(bias2));
107
+ layer1.PrepareForThreads(1);
108
+ layer2.PrepareForThreads(1);
109
+ layer1.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
110
+ /*output_stride=*/0, &out1);
111
+ layer2.MatVec(out1, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
112
+ /*output_stride=*/0, &out_reference);
113
+ layer1.PrepareForThreads(kNumThreads);
114
+ layer2.PrepareForThreads(kNumThreads);
115
+ layer2.SliceForThreads(layer1.split_points());
116
+ csrblocksparse::LaunchOnThreadsWithBarrier(kNumThreads, LayersThreadBody, rhs,
117
+ &layer1, &layer2, &out1, &out2,
118
+ /*relu=*/true);
119
+
120
+ CheckResult(out_reference, out2, kCols);
121
+ }
122
+
123
+ // Tests that a Layer that has been DoubleBlockHeight()-ed computes the same
124
+ // result as original layer. (Float compute type).
125
+ TEST(CsrBlockSparseMatrix, Float8x4) {
126
+ using ComputeType = float;
127
+ using RhsType = float;
128
+ using BiasType = float;
129
+ MaskedSparseMatrix<float> matrix(kSize, kSize, 0.95, kBlockSize, kBlockSize);
130
+ matrix.CastWeights<ComputeType>();
131
+ FatCacheAlignedVector<RhsType> rhs(kSize, kCols);
132
+ CacheAlignedVector<BiasType> bias(kSize);
133
+ FatCacheAlignedVector<BiasType> out1(kSize, kCols);
134
+
135
+ bias.FillRandom();
136
+ rhs.FillRandom();
137
+ out1.FillZero();
138
+ FatCacheAlignedVector<BiasType> out_reference = out1;
139
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
140
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
141
+ std::move(sparse_matrix), std::move(bias));
142
+ sparse_linear_layer.PrepareForThreads(1);
143
+ sparse_linear_layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
144
+ /*output_stride=*/0, &out_reference);
145
+ sparse_linear_layer.DoubleBlockHeight();
146
+ sparse_linear_layer.PrepareForThreads(1);
147
+ sparse_linear_layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
148
+ /*output_stride=*/0, &out1);
149
+ CheckResult(out_reference, out1, kCols);
150
+ }
151
+
152
+ // Tests that a Layer that has been DoubleBlockHeight()-ed computes the same
153
+ // result as original layer. (Fixed16 compute type).
154
+ TEST(CsrBlockSparseMatrix, Fixed8x4) {
155
+ using ComputeType = csrblocksparse::fixed16<4>;
156
+ using RhsType = csrblocksparse::fixed16<4>;
157
+ using BiasType = typename TypeOfProduct<ComputeType, RhsType>::type;
158
+ MaskedSparseMatrix<float> matrix(kSize, kSize, 0.95, kBlockSize, kBlockSize);
159
+ matrix.CastWeights<ComputeType>();
160
+ FatCacheAlignedVector<RhsType> rhs(kSize, kCols);
161
+ CacheAlignedVector<BiasType> bias(kSize);
162
+ FatCacheAlignedVector<BiasType> out1(kSize, kCols);
163
+
164
+ bias.FillRandom();
165
+ rhs.FillRandom();
166
+ out1.FillZero();
167
+ FatCacheAlignedVector<BiasType> out_reference = out1;
168
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
169
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
170
+ std::move(sparse_matrix), std::move(bias));
171
+ sparse_linear_layer.PrepareForThreads(1);
172
+ sparse_linear_layer.MatVec(rhs, /*relu=*/false, /*tid=*/0, /*replicas=*/1,
173
+ /*output_stride=*/0, &out_reference);
174
+ sparse_linear_layer.DoubleBlockHeight();
175
+ sparse_linear_layer.PrepareForThreads(1);
176
+ sparse_linear_layer.MatVec(rhs, /*relu=*/false, /*tid=*/0, /*replicas=*/1,
177
+ /*output_stride=*/0, &out1);
178
+ CheckResult(out_reference, out1, kCols);
179
+ }
180
+
181
+ TEST(SparseLinearLayerTest, PrintCompiles) {
182
+ SparseLinearLayer<float, float> sparse_linear_layer;
183
+ sparse_linear_layer.Print();
184
+ }
185
+
186
+ } // namespace
187
+ } // namespace csrblocksparse
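
One detail worth calling out from the tests and from PartLinearLayer in the header: split_points are expressed in block rows, so raw row boundaries are obtained by multiplying by the block height. The small helper below (RawRowRanges is not part of the library, just a hypothetical illustration) makes the arithmetic explicit.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Converts block-granular split points into [start_row, end_row) raw row
// ranges, mirroring the arithmetic inside PartLinearLayer.
std::vector<std::pair<int, int>> RawRowRanges(
    const std::vector<int>& split_points, int block_height) {
  std::vector<std::pair<int, int>> ranges;
  for (size_t tid = 0; tid + 1 < split_points.size(); ++tid) {
    ranges.emplace_back(split_points[tid] * block_height,
                        split_points[tid + 1] * block_height);
  }
  return ranges;
}
// With the fake_split_points used in the SliceForThreads test above
// ({0, 12, 32, 52, 64} blocks of height 4) this yields the raw row ranges
// [0,48), [48,128), [128,208), [208,256).
```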
sparse_matmul/layers/status_macros.h ADDED
@@ -0,0 +1,34 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_STATUS_MACROS_H_
16
+ #define THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_STATUS_MACROS_H_
17
+
18
+ #include "absl/status/status.h"
19
+ #include "absl/status/statusor.h"
20
+
21
+ #define SPARSE_MATMUL_RETURN_IF_ERROR(expr) \
22
+ do { \
23
+ const absl::Status _status = (expr); \
24
+ if (!_status.ok()) return _status; \
25
+ } while (0)
26
+ template <typename T>
27
+ absl::Status DoAssignOrReturn(T& lhs, absl::StatusOr<T> result) {
28
+ if (result.ok()) {
29
+ lhs = result.value();
30
+ }
31
+ return result.status();
32
+ }
33
+
34
+ #endif // THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_STATUS_MACROS_H_
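
A short usage sketch for the two helpers above; CheckHeader and ReadPayloadSize are hypothetical stand-ins for any absl::Status / absl::StatusOr-returning calls.

```cpp
#include <cstdint>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "sparse_matmul/layers/status_macros.h"

// Hypothetical status-returning helpers.
absl::Status CheckHeader();
absl::StatusOr<int64_t> ReadPayloadSize();

absl::Status LoadEverything() {
  // Propagates any non-OK status to the caller.
  SPARSE_MATMUL_RETURN_IF_ERROR(CheckHeader());
  // DoAssignOrReturn unwraps the StatusOr into |payload_size| when it is OK
  // and otherwise hands its non-OK status to the macro, which returns it.
  int64_t payload_size = 0;
  SPARSE_MATMUL_RETURN_IF_ERROR(
      DoAssignOrReturn(payload_size, ReadPayloadSize()));
  return absl::OkStatus();
}
```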
sparse_matmul/layers/testdata/768_512_95_4x4_QRhat_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50f861af29b1f767830d74ef83874944b18d80157b6b0256fdc4c14fa79ec936
3
+ size 20852
sparse_matmul/layers/testdata/768_512_95_4x4_What_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d534bde2caf6e59990a46b4b1907088b8144c53d62d97de7e2b4bdc956da68
3
+ size 5133
sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11399f9d0e8f8dfbef6eb37e0c096f858658bc650f728a08f3135ccca44f0a5a
3
+ size 1062
sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d971e067a6df985d68beac26bcf4e9a6cc13ff328599e84d50a0fc9a7c103b
3
+ size 2382
sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1376ef7a360699dae24a49f40a254990d4a70b844dadcdbe9dcbf1a306999a8
3
+ size 55829
sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffcc8ccf086fccfacc928877aa29ef03ce51cce0f0b7d2aacf81782b7b527089
3
+ size 2003
sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a16f98ba6f09031ea9fefb79fdc9ba90e44f0046ab70dab014ac971ca7f7186
3
+ size 4684
sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1b91304f5b6f7b53651ec7f9c827d4a2447366d1f990032adff46b18377741f
3
+ size 113777
sparse_matmul/layers/testdata/768_512_95_4x4_finelogit_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ebb84ab4e16408f898b41a28c0d2c611f6735c8d9ad96a6805947c57cb547c7
3
+ size 1055
sparse_matmul/layers/testdata/768_512_95_4x4_finelogit_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:071159e5397eff604ff3f1fca3ba90980a1ff9ae12838022179709d2c50e4627
3
+ size 2322
sparse_matmul/layers/testdata/768_512_95_4x4_finelogit_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fdd0cbc0e79ea0a0dc1fc2ce8b10c5f25387fb4fd2ca019b66ac7ad7f44d219
3
+ size 51615
sparse_matmul/layers/testdata/768_512_95_4x4_fineproj_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abd83a1795fd5e7044200029eae3ce6406b84095b7128288ac0dda1de5746b59
3
+ size 2001
sparse_matmul/layers/testdata/768_512_95_4x4_fineproj_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:455e1c142dd29bc4a4bb5a15c1f88ef3e0fbb580425620ef6f923b6e04faab01
3
+ size 4459
sparse_matmul/layers/testdata/768_512_95_4x4_fineproj_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:171d1e86e04fbefeca7dcce59817ad82d30556a110b4552cd5757a9348405d1c
3
+ size 111636
sparse_matmul/layers/testdata/768_512_95_4x4_wavernn_gru_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba804daa5c3c4d5c87ca1ff4060d118c33f8e2201077e6faa233822c5f0c511
3
+ size 10706
sparse_matmul/layers/testdata/768_512_95_4x4_wavernn_gru_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62c03b31f5f58eb67773dcc5b0bae5b4790a26dca1934d79802342b4175e7a74
3
+ size 50978
sparse_matmul/layers/testdata/768_512_95_4x4_wavernn_gru_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:679c5bd2d5ca6abaae96225e8bab2ce9f9d57170027471465c85fc220c0c44a8
3
+ size 1361746