juancopi81 committed
Commit b100e1c (1 parent: 8f8dcb6)

Add t5x and mt3 models

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set of files.
LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
README.md CHANGED
@@ -10,5 +10,3 @@ app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
  ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
mt3/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Base module for MT3."""
16
+
17
+ from mt3 import datasets
18
+ from mt3 import event_codec
19
+ from mt3 import inference
20
+ from mt3 import layers
21
+ from mt3 import metrics
22
+ from mt3 import metrics_utils
23
+ from mt3 import models
24
+ from mt3 import network
25
+ from mt3 import note_sequences
26
+ from mt3 import preprocessors
27
+ from mt3 import run_length_encoding
28
+ from mt3 import spectrograms
29
+ from mt3 import summaries
30
+ from mt3 import tasks
31
+ from mt3 import vocabularies
32
+
33
+ from mt3.version import __version__
mt3/datasets.py ADDED
@@ -0,0 +1,325 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Dataset configurations."""
16
+
17
+ import dataclasses
18
+ from typing import Mapping, Sequence, Union
19
+
20
+ from mt3 import note_sequences
21
+ import tensorflow as tf
22
+
23
+
24
+
25
+ @dataclasses.dataclass
26
+ class InferEvalSplit:
27
+ # key in dictionary containing all dataset splits
28
+ name: str
29
+ # task name suffix (each eval split is a separate task)
30
+ suffix: str
31
+ # whether or not to include in the mixture of all eval tasks
32
+ include_in_mixture: bool = True
33
+
34
+
35
+ @dataclasses.dataclass
36
+ class DatasetConfig:
37
+ """Configuration for a transcription dataset."""
38
+ # dataset name
39
+ name: str
40
+ # mapping from split name to path
41
+ paths: Mapping[str, str]
42
+ # mapping from feature name to feature
43
+ features: Mapping[str, Union[tf.io.FixedLenFeature,
44
+ tf.io.FixedLenSequenceFeature]]
45
+ # training split name
46
+ train_split: str
47
+ # training eval split name
48
+ train_eval_split: str
49
+ # list of infer eval split specs
50
+ infer_eval_splits: Sequence[InferEvalSplit]
51
+ # list of track specs to be used for metrics
52
+ track_specs: Sequence[note_sequences.TrackSpec] = dataclasses.field(
53
+ default_factory=list)
54
+
55
+ MAESTROV1_CONFIG = DatasetConfig(
56
+ name='maestrov1',
57
+ paths={
58
+ 'train':
59
+ 'gs://magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0_ns_wav_train.tfrecord-?????-of-00010',
60
+ 'train_subset':
61
+ 'gs://magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0_ns_wav_train.tfrecord-00002-of-00010',
62
+ 'validation':
63
+ 'gs://magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0_ns_wav_validation.tfrecord-?????-of-00010',
64
+ 'validation_subset':
65
+ 'gs://magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0_ns_wav_validation.tfrecord-0000[06]-of-00010',
66
+ 'test':
67
+ 'gs://magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0_ns_wav_test.tfrecord-?????-of-00010'
68
+ },
69
+ features={
70
+ 'audio': tf.io.FixedLenFeature([], dtype=tf.string),
71
+ 'sequence': tf.io.FixedLenFeature([], dtype=tf.string),
72
+ 'id': tf.io.FixedLenFeature([], dtype=tf.string)
73
+ },
74
+ train_split='train',
75
+ train_eval_split='validation_subset',
76
+ infer_eval_splits=[
77
+ InferEvalSplit(name='train', suffix='eval_train_full',
78
+ include_in_mixture=False),
79
+ InferEvalSplit(name='train_subset', suffix='eval_train'),
80
+ InferEvalSplit(name='validation', suffix='validation_full',
81
+ include_in_mixture=False),
82
+ InferEvalSplit(name='validation_subset', suffix='validation'),
83
+ InferEvalSplit(name='test', suffix='test', include_in_mixture=False)
84
+ ])
85
+
86
+
87
+ MAESTROV3_CONFIG = DatasetConfig(
88
+ name='maestrov3',
89
+ paths={
90
+ 'train':
91
+ 'gs://magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0_ns_wav_train.tfrecord-?????-of-00025',
92
+ 'train_subset':
93
+ 'gs://magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0_ns_wav_train.tfrecord-00004-of-00025',
94
+ 'validation':
95
+ 'gs://magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0_ns_wav_validation.tfrecord-?????-of-00025',
96
+ 'validation_subset':
97
+ 'gs://magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0_ns_wav_validation.tfrecord-0002?-of-00025',
98
+ 'test':
99
+ 'gs://magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0_ns_wav_test.tfrecord-?????-of-00025'
100
+ },
101
+ features={
102
+ 'audio': tf.io.FixedLenFeature([], dtype=tf.string),
103
+ 'sequence': tf.io.FixedLenFeature([], dtype=tf.string),
104
+ 'id': tf.io.FixedLenFeature([], dtype=tf.string)
105
+ },
106
+ train_split='train',
107
+ train_eval_split='validation_subset',
108
+ infer_eval_splits=[
109
+ InferEvalSplit(name='train', suffix='eval_train_full',
110
+ include_in_mixture=False),
111
+ InferEvalSplit(name='train_subset', suffix='eval_train'),
112
+ InferEvalSplit(name='validation', suffix='validation_full',
113
+ include_in_mixture=False),
114
+ InferEvalSplit(name='validation_subset', suffix='validation'),
115
+ InferEvalSplit(name='test', suffix='test', include_in_mixture=False)
116
+ ])
117
+
118
+
119
+ GUITARSET_CONFIG = DatasetConfig(
120
+ name='guitarset',
121
+ paths={
122
+ 'train':
123
+ 'gs://mt3/data/datasets/guitarset/train.tfrecord-?????-of-00019',
124
+ 'validation':
125
+ 'gs://mt3/data/datasets/guitarset/validation.tfrecord-?????-of-00006',
126
+ },
127
+ features={
128
+ 'sequence': tf.io.FixedLenFeature([], dtype=tf.string),
129
+ 'audio': tf.io.FixedLenFeature([], dtype=tf.string),
130
+ 'velocity_range': tf.io.FixedLenFeature([], dtype=tf.string),
131
+ 'id': tf.io.FixedLenFeature([], dtype=tf.string),
132
+ },
133
+ train_split='train',
134
+ train_eval_split='validation',
135
+ infer_eval_splits=[
136
+ InferEvalSplit(name='train', suffix='eval_train'),
137
+ InferEvalSplit(name='validation', suffix='validation'),
138
+ ])
139
+
140
+
141
+ URMP_CONFIG = DatasetConfig(
142
+ name='urmp',
143
+ paths={
144
+ 'train': 'gs://mt3/data/datasets/urmp/train.tfrecord',
145
+ 'validation': 'gs://mt3/data/datasets/urmp/validation.tfrecord',
146
+ },
147
+ features={
148
+ 'id': tf.io.FixedLenFeature([], dtype=tf.string),
149
+ 'tracks': tf.io.FixedLenSequenceFeature(
150
+ [], dtype=tf.int64, allow_missing=True),
151
+ 'inst_names': tf.io.FixedLenSequenceFeature(
152
+ [], dtype=tf.string, allow_missing=True),
153
+ 'audio': tf.io.FixedLenFeature([], dtype=tf.string),
154
+ 'sequence': tf.io.FixedLenFeature([], dtype=tf.string),
155
+ 'instrument_sequences': tf.io.FixedLenSequenceFeature(
156
+ [], dtype=tf.string, allow_missing=True),
157
+ },
158
+ train_split='train',
159
+ train_eval_split='validation',
160
+ infer_eval_splits=[
161
+ InferEvalSplit(name='train', suffix='eval_train'),
162
+ InferEvalSplit(name='validation', suffix='validation')
163
+ ])
164
+
165
+
166
+ MUSICNET_CONFIG = DatasetConfig(
167
+ name='musicnet',
168
+ paths={
169
+ 'train':
170
+ 'gs://mt3/data/datasets/musicnet/musicnet-train.tfrecord-?????-of-00036',
171
+ 'validation':
172
+ 'gs://mt3/data/datasets/musicnet/musicnet-validation.tfrecord-?????-of-00005',
173
+ 'test':
174
+ 'gs://mt3/data/datasets/musicnet/musicnet-test.tfrecord-?????-of-00003'
175
+ },
176
+ features={
177
+ 'id': tf.io.FixedLenFeature([], dtype=tf.string),
178
+ 'sample_rate': tf.io.FixedLenFeature([], dtype=tf.float32),
179
+ 'audio': tf.io.FixedLenSequenceFeature(
180
+ [], dtype=tf.float32, allow_missing=True),
181
+ 'sequence': tf.io.FixedLenFeature([], dtype=tf.string)
182
+ },
183
+ train_split='train',
184
+ train_eval_split='validation',
185
+ infer_eval_splits=[
186
+ InferEvalSplit(name='train', suffix='eval_train'),
187
+ InferEvalSplit(name='validation', suffix='validation'),
188
+ InferEvalSplit(name='test', suffix='test', include_in_mixture=False)
189
+ ])
190
+
191
+
192
+ MUSICNET_EM_CONFIG = DatasetConfig(
193
+ name='musicnet_em',
194
+ paths={
195
+ 'train':
196
+ 'gs://mt3/data/datasets/musicnet_em/train.tfrecord-?????-of-00103',
197
+ 'validation':
198
+ 'gs://mt3/data/datasets/musicnet_em/validation.tfrecord-?????-of-00005',
199
+ 'test':
200
+ 'gs://mt3/data/datasets/musicnet_em/test.tfrecord-?????-of-00006'
201
+ },
202
+ features={
203
+ 'id': tf.io.FixedLenFeature([], dtype=tf.string),
204
+ 'sample_rate': tf.io.FixedLenFeature([], dtype=tf.float32),
205
+ 'audio': tf.io.FixedLenSequenceFeature(
206
+ [], dtype=tf.float32, allow_missing=True),
207
+ 'sequence': tf.io.FixedLenFeature([], dtype=tf.string)
208
+ },
209
+ train_split='train',
210
+ train_eval_split='validation',
211
+ infer_eval_splits=[
212
+ InferEvalSplit(name='train', suffix='eval_train'),
213
+ InferEvalSplit(name='validation', suffix='validation'),
214
+ InferEvalSplit(name='test', suffix='test', include_in_mixture=False)
215
+ ])
216
+
217
+
218
+ CERBERUS4_CONFIG = DatasetConfig(
219
+ name='cerberus4',
220
+ paths={
221
+ 'train':
222
+ 'gs://mt3/data/datasets/cerberus4/slakh_multi_cerberus_train_bass:drums:guitar:piano.tfrecord-?????-of-00286',
223
+ 'train_subset':
224
+ 'gs://mt3/data/datasets/cerberus4/slakh_multi_cerberus_train_bass:drums:guitar:piano.tfrecord-00000-of-00286',
225
+ 'validation':
226
+ 'gs://mt3/data/datasets/cerberus4/slakh_multi_cerberus_validation_bass:drums:guitar:piano.tfrecord-?????-of-00212',
227
+ 'validation_subset':
228
+ 'gs://mt3/data/datasets/cerberus4/slakh_multi_cerberus_validation_bass:drums:guitar:piano.tfrecord-0000?-of-00212',
229
+ 'test':
230
+ 'gs://mt3/data/datasets/cerberus4/slakh_multi_cerberus_test_bass:drums:guitar:piano.tfrecord-?????-of-00106'
231
+ },
232
+ features={
233
+ 'audio_sample_rate': tf.io.FixedLenFeature([], dtype=tf.int64),
234
+ 'inst_names': tf.io.FixedLenSequenceFeature(
235
+ [], dtype=tf.string, allow_missing=True),
236
+ 'midi_class': tf.io.FixedLenSequenceFeature(
237
+ [], dtype=tf.int64, allow_missing=True),
238
+ 'mix': tf.io.FixedLenSequenceFeature(
239
+ [], dtype=tf.float32, allow_missing=True),
240
+ 'note_sequences': tf.io.FixedLenSequenceFeature(
241
+ [], dtype=tf.string, allow_missing=True),
242
+ 'plugin_name': tf.io.FixedLenSequenceFeature(
243
+ [], dtype=tf.int64, allow_missing=True),
244
+ 'program_num': tf.io.FixedLenSequenceFeature(
245
+ [], dtype=tf.int64, allow_missing=True),
246
+ 'slakh_class': tf.io.FixedLenSequenceFeature(
247
+ [], dtype=tf.int64, allow_missing=True),
248
+ 'src_ids': tf.io.FixedLenSequenceFeature(
249
+ [], dtype=tf.string, allow_missing=True),
250
+ 'stems': tf.io.FixedLenSequenceFeature(
251
+ [], dtype=tf.float32, allow_missing=True),
252
+ 'stems_shape': tf.io.FixedLenFeature([2], dtype=tf.int64),
253
+ 'target_type': tf.io.FixedLenFeature([], dtype=tf.string),
254
+ 'track_id': tf.io.FixedLenFeature([], dtype=tf.string),
255
+ },
256
+ train_split='train',
257
+ train_eval_split='validation_subset',
258
+ infer_eval_splits=[
259
+ InferEvalSplit(name='train', suffix='eval_train_full',
260
+ include_in_mixture=False),
261
+ InferEvalSplit(name='train_subset', suffix='eval_train'),
262
+ InferEvalSplit(name='validation', suffix='validation_full',
263
+ include_in_mixture=False),
264
+ InferEvalSplit(name='validation_subset', suffix='validation'),
265
+ InferEvalSplit(name='test', suffix='test', include_in_mixture=False)
266
+ ],
267
+ track_specs=[
268
+ note_sequences.TrackSpec('bass', program=32),
269
+ note_sequences.TrackSpec('drums', is_drum=True),
270
+ note_sequences.TrackSpec('guitar', program=24),
271
+ note_sequences.TrackSpec('piano', program=0)
272
+ ])
273
+
274
+
275
+ SLAKH_CONFIG = DatasetConfig(
276
+ name='slakh',
277
+ paths={
278
+ 'train':
279
+ 'gs://mt3/data/datasets/slakh/slakh_multi_full_subsets_10_train_all_inst.tfrecord-?????-of-02307',
280
+ 'train_subset':
281
+ 'gs://mt3/data/datasets/slakh/slakh_multi_full_subsets_10_train_all_inst.tfrecord-00000-of-02307',
282
+ 'validation':
283
+ 'gs://mt3/data/datasets/slakh/slakh_multi_full_validation_all_inst.tfrecord-?????-of-00168',
284
+ 'validation_subset':
285
+ 'gs://mt3/data/datasets/slakh/slakh_multi_full_validation_all_inst.tfrecord-0000?-of-00168',
286
+ 'test':
287
+ 'gs://mt3/data/datasets/slakh/slakh_multi_full_test_all_inst.tfrecord-?????-of-00109'
288
+ },
289
+ features={
290
+ 'audio_sample_rate': tf.io.FixedLenFeature([], dtype=tf.int64),
291
+ 'inst_names': tf.io.FixedLenSequenceFeature([], dtype=tf.string,
292
+ allow_missing=True),
293
+ 'midi_class': tf.io.FixedLenSequenceFeature([], dtype=tf.int64,
294
+ allow_missing=True),
295
+ 'mix': tf.io.FixedLenSequenceFeature([], dtype=tf.float32,
296
+ allow_missing=True),
297
+ 'note_sequences': tf.io.FixedLenSequenceFeature([], dtype=tf.string,
298
+ allow_missing=True),
299
+ 'plugin_name': tf.io.FixedLenSequenceFeature([], dtype=tf.int64,
300
+ allow_missing=True),
301
+ 'program_num': tf.io.FixedLenSequenceFeature([], dtype=tf.int64,
302
+ allow_missing=True),
303
+ 'slakh_class': tf.io.FixedLenSequenceFeature([], dtype=tf.int64,
304
+ allow_missing=True),
305
+ 'src_ids': tf.io.FixedLenSequenceFeature([], dtype=tf.string,
306
+ allow_missing=True),
307
+ 'stems': tf.io.FixedLenSequenceFeature([], dtype=tf.float32,
308
+ allow_missing=True),
309
+ 'stems_shape': tf.io.FixedLenFeature([2], dtype=tf.int64),
310
+ 'target_type': tf.io.FixedLenFeature([], dtype=tf.string),
311
+ 'track_id': tf.io.FixedLenFeature([], dtype=tf.string),
312
+ },
313
+ train_split='train',
314
+ train_eval_split='validation_subset',
315
+ infer_eval_splits=[
316
+ InferEvalSplit(name='train', suffix='eval_train_full',
317
+ include_in_mixture=False),
318
+ InferEvalSplit(name='train_subset', suffix='eval_train'),
319
+ InferEvalSplit(name='validation', suffix='validation_full',
320
+ include_in_mixture=False),
321
+ InferEvalSplit(name='validation_subset', suffix='validation'),
322
+ InferEvalSplit(name='test', suffix='test', include_in_mixture=False)
323
+ ])
324
+
325
+
mt3/event_codec.py ADDED
@@ -0,0 +1,112 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Encode and decode events."""
16
+
17
+ import dataclasses
18
+ from typing import List, Tuple
19
+
20
+
21
+ @dataclasses.dataclass
22
+ class EventRange:
23
+ type: str
24
+ min_value: int
25
+ max_value: int
26
+
27
+
28
+ @dataclasses.dataclass
29
+ class Event:
30
+ type: str
31
+ value: int
32
+
33
+
34
+ class Codec:
35
+ """Encode and decode events.
36
+
37
+ Useful for declaring what certain ranges of a vocabulary should be used for.
38
+ This is intended to be used from Python before encoding or after decoding with
39
+ GenericTokenVocabulary. This class is more lightweight and does not include
40
+ things like EOS or UNK token handling.
41
+
42
+ To ensure that 'shift' events are always the first block of the vocab and
43
+ start at 0, that event type is required and specified separately.
44
+ """
45
+
46
+ def __init__(self, max_shift_steps: int, steps_per_second: float,
47
+ event_ranges: List[EventRange]):
48
+ """Define Codec.
49
+
50
+ Args:
51
+ max_shift_steps: Maximum number of shift steps that can be encoded.
52
+ steps_per_second: Shift steps will be interpreted as having a duration of
53
+ 1 / steps_per_second.
54
+ event_ranges: Other supported event types and their ranges.
55
+ """
56
+ self.steps_per_second = steps_per_second
57
+ self._shift_range = EventRange(
58
+ type='shift', min_value=0, max_value=max_shift_steps)
59
+ self._event_ranges = [self._shift_range] + event_ranges
60
+ # Ensure all event types have unique names.
61
+ assert len(self._event_ranges) == len(
62
+ set([er.type for er in self._event_ranges]))
63
+
64
+ @property
65
+ def num_classes(self) -> int:
66
+ return sum(er.max_value - er.min_value + 1 for er in self._event_ranges)
67
+
68
+ # The next couple methods are simplified special case methods just for shift
69
+ # events that are intended to be used from within autograph functions.
70
+
71
+ def is_shift_event_index(self, index: int) -> bool:
72
+ return (self._shift_range.min_value <= index) and (
73
+ index <= self._shift_range.max_value)
74
+
75
+ @property
76
+ def max_shift_steps(self) -> int:
77
+ return self._shift_range.max_value
78
+
79
+ def encode_event(self, event: Event) -> int:
80
+ """Encode an event to an index."""
81
+ offset = 0
82
+ for er in self._event_ranges:
83
+ if event.type == er.type:
84
+ if not er.min_value <= event.value <= er.max_value:
85
+ raise ValueError(
86
+ f'Event value {event.value} is not within valid range '
87
+ f'[{er.min_value}, {er.max_value}] for type {event.type}')
88
+ return offset + event.value - er.min_value
89
+ offset += er.max_value - er.min_value + 1
90
+
91
+ raise ValueError(f'Unknown event type: {event.type}')
92
+
93
+ def event_type_range(self, event_type: str) -> Tuple[int, int]:
94
+ """Return [min_id, max_id] for an event type."""
95
+ offset = 0
96
+ for er in self._event_ranges:
97
+ if event_type == er.type:
98
+ return offset, offset + (er.max_value - er.min_value)
99
+ offset += er.max_value - er.min_value + 1
100
+
101
+ raise ValueError(f'Unknown event type: {event_type}')
102
+
103
+ def decode_event_index(self, index: int) -> Event:
104
+ """Decode an event index to an Event."""
105
+ offset = 0
106
+ for er in self._event_ranges:
107
+ if offset <= index <= offset + er.max_value - er.min_value:
108
+ return Event(
109
+ type=er.type, value=er.min_value + index - offset)
110
+ offset += er.max_value - er.min_value + 1
111
+
112
+ raise ValueError(f'Unknown event index: {index}')
mt3/event_codec_test.py ADDED
@@ -0,0 +1,55 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for event_codec."""
16
+
17
+ from absl.testing import absltest
18
+ from mt3 import event_codec
19
+
20
+ Event = event_codec.Event
21
+ EventRange = event_codec.EventRange
22
+
23
+
24
+ class EventCodecTest(absltest.TestCase):
25
+
26
+ def test_encode_decode(self):
27
+ ec = event_codec.Codec(
28
+ max_shift_steps=100,
29
+ steps_per_second=100,
30
+ event_ranges=[EventRange('pitch', min_value=0, max_value=127)])
31
+ events = [
32
+ Event(type='pitch', value=60),
33
+ Event(type='shift', value=5),
34
+ Event(type='pitch', value=62),
35
+ ]
36
+ encoded = [ec.encode_event(e) for e in events]
37
+ self.assertSequenceEqual([161, 5, 163], encoded)
38
+
39
+ decoded = [ec.decode_event_index(idx) for idx in encoded]
40
+ self.assertSequenceEqual(events, decoded)
41
+
42
+ def test_shift_steps(self):
43
+ ec = event_codec.Codec(
44
+ max_shift_steps=100,
45
+ steps_per_second=100,
46
+ event_ranges=[EventRange('pitch', min_value=0, max_value=127)])
47
+
48
+ self.assertEqual(100, ec.max_shift_steps)
49
+ self.assertFalse(ec.is_shift_event_index(-1))
50
+ self.assertTrue(ec.is_shift_event_index(0))
51
+ self.assertTrue(ec.is_shift_event_index(100))
52
+ self.assertFalse(ec.is_shift_event_index(101))
53
+
54
+ if __name__ == '__main__':
55
+ absltest.main()
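As a quick aside on the vocabulary layout the Codec above implements (a minimal sketch, assuming the mt3 package added in this commit is importable): shift events always occupy the first block of ids, 0 through max_shift_steps, and each additional EventRange is appended as the next contiguous block, which is why pitch value 60 encodes to index 161 in the test (101 shift ids, then 60 into the pitch block).

from mt3 import event_codec

# Same codec as in the test: 101 shift ids (0..100) followed by 128 pitch ids.
codec = event_codec.Codec(
    max_shift_steps=100,
    steps_per_second=100,
    event_ranges=[event_codec.EventRange('pitch', min_value=0, max_value=127)])

# Encoding offsets the pitch value by the size of the shift block: 101 + 60 = 161.
assert codec.encode_event(event_codec.Event(type='pitch', value=60)) == 161
# Decoding reverses the offset, recovering the original Event dataclass.
assert codec.decode_event_index(161) == event_codec.Event(type='pitch', value=60)
# The pitch block spans ids 101..228, so the full vocabulary has 229 classes.
assert codec.event_type_range('pitch') == (101, 228)
assert codec.num_classes == 229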
mt3/gin/eval.gin ADDED
@@ -0,0 +1,72 @@
1
+ # Defaults for eval.py.
2
+ #
3
+ # You must also include a binding for MODEL.
4
+ #
5
+ # Required to be set:
6
+ #
7
+ # - TASK_PREFIX
8
+ # - TASK_FEATURE_LENGTHS
9
+ # - CHECKPOINT_PATH
10
+ # - EVAL_OUTPUT_DIR
11
+ #
12
+ # Commonly overridden options:
13
+ #
14
+ # - DatasetConfig.split
15
+ # - DatasetConfig.batch_size
16
+ # - DatasetConfig.use_cached
17
+ # - RestoreCheckpointConfig.mode
18
+ # - PjitPartitioner.num_partitions
19
+
20
+ from __gin__ import dynamic_registration
21
+
22
+ import __main__ as eval_script
23
+ from mt3 import preprocessors
24
+ from mt3 import tasks
25
+ from mt3 import vocabularies
26
+ from t5x import partitioning
27
+ from t5x import utils
28
+
29
+ # Must be overridden
30
+ TASK_PREFIX = %gin.REQUIRED
31
+ TASK_FEATURE_LENGTHS = %gin.REQUIRED
32
+ CHECKPOINT_PATH = %gin.REQUIRED
33
+ EVAL_OUTPUT_DIR = %gin.REQUIRED
34
+
35
+ # Number of velocity bins: set to 1 (no velocity) or 127
36
+ NUM_VELOCITY_BINS = %gin.REQUIRED
37
+ VOCAB_CONFIG = @vocabularies.VocabularyConfig()
38
+ vocabularies.VocabularyConfig.num_velocity_bins = %NUM_VELOCITY_BINS
39
+
40
+ # Program granularity: set to 'flat', 'midi_class', or 'full'
41
+ PROGRAM_GRANULARITY = %gin.REQUIRED
42
+ preprocessors.map_midi_programs.granularity_type = %PROGRAM_GRANULARITY
43
+
44
+ TASK_SUFFIX = 'test'
45
+ tasks.construct_task_name:
46
+ task_prefix = %TASK_PREFIX
47
+ vocab_config = %VOCAB_CONFIG
48
+ task_suffix = %TASK_SUFFIX
49
+
50
+ eval_script.evaluate:
51
+ model = %MODEL # imported from separate gin file
52
+ dataset_cfg = @utils.DatasetConfig()
53
+ partitioner = @partitioning.PjitPartitioner()
54
+ restore_checkpoint_cfg = @utils.RestoreCheckpointConfig()
55
+ output_dir = %EVAL_OUTPUT_DIR
56
+
57
+ utils.DatasetConfig:
58
+ mixture_or_task_name = @tasks.construct_task_name()
59
+ task_feature_lengths = %TASK_FEATURE_LENGTHS
60
+ split = 'eval'
61
+ batch_size = 32
62
+ shuffle = False
63
+ seed = 42
64
+ use_cached = True
65
+ pack = False
66
+ use_custom_packing_ops = False
67
+
68
+ partitioning.PjitPartitioner.num_partitions = 1
69
+
70
+ utils.RestoreCheckpointConfig:
71
+ path = %CHECKPOINT_PATH
72
+ mode = 'specific'
mt3/gin/infer.gin ADDED
@@ -0,0 +1,92 @@
1
+ # Defaults for infer.py.
2
+ #
3
+ # You must also include a binding for MODEL.
4
+ #
5
+ # Required to be set:
6
+ #
7
+ # - TASK_PREFIX
8
+ # - TASK_FEATURE_LENGTHS
9
+ # - CHECKPOINT_PATH
10
+ # - INFER_OUTPUT_DIR
11
+ #
12
+ # Commonly overridden options:
13
+ #
14
+ # - infer.mode
15
+ # - infer.checkpoint_period
16
+ # - infer.shard_id
17
+ # - infer.num_shards
18
+ # - DatasetConfig.split
19
+ # - DatasetConfig.batch_size
20
+ # - DatasetConfig.use_cached
21
+ # - RestoreCheckpointConfig.is_tensorflow
22
+ # - RestoreCheckpointConfig.mode
23
+ # - PjitPartitioner.num_partitions
24
+
25
+ from __gin__ import dynamic_registration
26
+
27
+ import __main__ as infer_script
28
+ from mt3 import inference
29
+ from mt3 import preprocessors
30
+ from mt3 import tasks
31
+ from mt3 import vocabularies
32
+ from t5x import partitioning
33
+ from t5x import utils
34
+
35
+ # Must be overridden
36
+ TASK_PREFIX = %gin.REQUIRED
37
+ TASK_FEATURE_LENGTHS = %gin.REQUIRED
38
+ CHECKPOINT_PATH = %gin.REQUIRED
39
+ INFER_OUTPUT_DIR = %gin.REQUIRED
40
+
41
+ # Number of velocity bins: set to 1 (no velocity) or 127
42
+ NUM_VELOCITY_BINS = %gin.REQUIRED
43
+ VOCAB_CONFIG = @vocabularies.VocabularyConfig()
44
+ vocabularies.VocabularyConfig.num_velocity_bins = %NUM_VELOCITY_BINS
45
+
46
+ # Program granularity: set to 'flat', 'midi_class', or 'full'
47
+ PROGRAM_GRANULARITY = %gin.REQUIRED
48
+ preprocessors.map_midi_programs.granularity_type = %PROGRAM_GRANULARITY
49
+
50
+ TASK_SUFFIX = 'test'
51
+ tasks.construct_task_name:
52
+ task_prefix = %TASK_PREFIX
53
+ vocab_config = %VOCAB_CONFIG
54
+ task_suffix = %TASK_SUFFIX
55
+
56
+ ONSETS_ONLY = %gin.REQUIRED
57
+ USE_TIES = %gin.REQUIRED
58
+ inference.write_inferences_to_file:
59
+ vocab_config = %VOCAB_CONFIG
60
+ onsets_only = %ONSETS_ONLY
61
+ use_ties = %USE_TIES
62
+
63
+ infer_script.infer:
64
+ mode = 'predict'
65
+ model = %MODEL # imported from separate gin file
66
+ output_dir = %INFER_OUTPUT_DIR
67
+ dataset_cfg = @utils.DatasetConfig()
68
+ partitioner = @partitioning.PjitPartitioner()
69
+ restore_checkpoint_cfg = @utils.RestoreCheckpointConfig()
70
+ # This is a hack, but pass an extremely large value here to make sure the
71
+ # entire dataset fits in a single epoch. Otherwise, segments from a single
72
+ # example may end up in different epochs after splitting.
73
+ checkpoint_period = 1000000
74
+ shard_id = 0
75
+ num_shards = 1
76
+ write_fn = @inference.write_inferences_to_file
77
+
78
+ utils.DatasetConfig:
79
+ mixture_or_task_name = @tasks.construct_task_name()
80
+ task_feature_lengths = %TASK_FEATURE_LENGTHS
81
+ use_cached = True
82
+ split = 'eval'
83
+ batch_size = 32
84
+ shuffle = False
85
+ seed = 0
86
+ pack = False
87
+
88
+ partitioning.PjitPartitioner.num_partitions = 1
89
+
90
+ utils.RestoreCheckpointConfig:
91
+ path = %CHECKPOINT_PATH
92
+ mode = 'specific'
mt3/gin/ismir2021.gin ADDED
@@ -0,0 +1,9 @@
1
+ # Configuration for ISMIR 2021 piano-only model.
2
+
3
+ TASK_PREFIX = 'maestrov3_notes'
4
+ TASK_FEATURE_LENGTHS = {'inputs': 512, 'targets': 1024}
5
+ TRAIN_STEPS = 400000
6
+ NUM_VELOCITY_BINS = 127
7
+ PROGRAM_GRANULARITY = 'flat'
8
+ ONSETS_ONLY = False
9
+ USE_TIES = False
mt3/gin/ismir2022/base.gin ADDED
@@ -0,0 +1,10 @@
1
+ # T5.1.1 Base model.
2
+ include 'model.gin'
3
+
4
+ network.T5Config:
5
+ emb_dim = 768
6
+ num_heads = 12
7
+ num_encoder_layers = 12
8
+ num_decoder_layers = 12
9
+ head_dim = 64
10
+ mlp_dim = 2048
mt3/gin/ismir2022/finetune.gin ADDED
@@ -0,0 +1,25 @@
1
+ from __gin__ import dynamic_registration
2
+
3
+ from mt3 import network
4
+ from t5x import utils
5
+
6
+ include 'train.gin'
7
+
8
+ TASK_PREFIX = 'mega_notes_ties'
9
+ TASK_FEATURE_LENGTHS = {'inputs': 256, 'targets': 1024}
10
+ TRAIN_STEPS = 150000
11
+ BATCH_SIZE = 256
12
+ LABEL_SMOOTHING = 0.0
13
+ NUM_VELOCITY_BINS = 1
14
+ PROGRAM_GRANULARITY = 'full'
15
+ ONSETS_ONLY = False
16
+ USE_TIES = True
17
+ MAX_EXAMPLES_PER_MIX = None
18
+
19
+ network.T5Config.dropout_rate = 0.1
20
+
21
+ CHECKPOINT_PATH = %gin.REQUIRED
22
+ utils.CheckpointConfig.restore = @utils.RestoreCheckpointConfig()
23
+ utils.RestoreCheckpointConfig:
24
+ path = %CHECKPOINT_PATH
25
+ mode = 'specific'
mt3/gin/ismir2022/pretrain.gin ADDED
@@ -0,0 +1,13 @@
1
+ include 'train.gin'
2
+
3
+ TASK_FEATURE_LENGTHS = {'inputs': 256, 'targets': 1024}
4
+ TRAIN_STEPS = 500000
5
+ BATCH_SIZE = 1024
6
+ LABEL_SMOOTHING = 0.1
7
+ NUM_VELOCITY_BINS = 1
8
+ PROGRAM_GRANULARITY = 'full'
9
+ ONSETS_ONLY = False
10
+ USE_TIES = True
11
+ MAX_EXAMPLES_PER_MIX = 8
12
+
13
+ network.T5Config.dropout_rate = 0.0
mt3/gin/ismir2022/small.gin ADDED
@@ -0,0 +1,2 @@
1
+ # T5.1.1 Small model.
2
+ include 'model.gin'
mt3/gin/local_tiny.gin ADDED
@@ -0,0 +1,63 @@
1
+ # A gin file to make the Transformer models tiny for faster local testing.
2
+ #
3
+ # When testing locally with CPU, there are a few things that we need.
4
+ # - tiny model size
5
+ # - small enough batch size
6
+ # - small sequence length
7
+ # - deterministic dataset pipeline
8
+ #
9
+ # This gin file adds such configs. To use this gin file, add it on top of the
10
+ # existing full-scale gin files. The ordering of the gin file matters. So this
11
+ # should be added after all the other files are added to override the same
12
+ # configurables.
13
+
14
+ from __gin__ import dynamic_registration
15
+
16
+ from t5x import partitioning
17
+ from t5x import trainer
18
+ from t5x import utils
19
+ from t5x.examples.t5 import network
20
+
21
+ import __main__ as train_script
22
+
23
+ train_script.train.random_seed = 42 # dropout seed
24
+ train/utils.DatasetConfig.seed = 42 # dataset seed
25
+
26
+ TASK_FEATURE_LENGTHS = {"inputs": 8, "targets": 16}
27
+ LABEL_SMOOTHING = 0.0
28
+
29
+ # Network specification overrides
30
+ network.Transformer.config = @network.T5Config()
31
+ network.T5Config:
32
+ dtype = 'float32'
33
+ emb_dim = 8
34
+ num_heads = 4
35
+ num_encoder_layers = 2
36
+ num_decoder_layers = 2
37
+ head_dim = 3
38
+ mlp_dim = 16
39
+ mlp_activations = ('gelu', 'linear')
40
+ dropout_rate = 0.0
41
+ logits_via_embedding = False
42
+
43
+ TRAIN_STEPS = 3
44
+
45
+ train/utils.DatasetConfig:
46
+ batch_size = 8
47
+ shuffle = False
48
+
49
+ train_eval/utils.DatasetConfig.batch_size = 8
50
+
51
+ train_script.train:
52
+ eval_period = 3
53
+ eval_steps = 3
54
+
55
+ trainer.Trainer.num_microbatches = 0
56
+ partitioning.PjitPartitioner:
57
+ num_partitions = 1
58
+ model_parallel_submesh = None
59
+
60
+ utils.CheckpointConfig:
61
+ restore = None
62
+
63
+ infer_eval/utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS
mt3/gin/model.gin ADDED
@@ -0,0 +1,60 @@
1
+ # T5.1.1 Small model.
2
+ from __gin__ import dynamic_registration
3
+
4
+ from mt3 import models
5
+ from mt3 import network
6
+ from mt3 import spectrograms
7
+ from mt3 import vocabularies
8
+ import seqio
9
+ from t5x import adafactor
10
+
11
+ # ------------------- Loss HParam ----------------------------------------------
12
+ Z_LOSS = 0.0001
13
+ LABEL_SMOOTHING = 0.0
14
+ LOSS_NORMALIZING_FACTOR = None
15
+ models.ContinuousInputsEncoderDecoderModel:
16
+ z_loss = %Z_LOSS
17
+ label_smoothing = %LABEL_SMOOTHING
18
+ loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR
19
+
20
+ # Output vocabulary
21
+ VOCAB_CONFIG = %gin.REQUIRED
22
+ OUTPUT_VOCABULARY = @vocabularies.vocabulary_from_codec()
23
+ vocabularies.vocabulary_from_codec.codec = @vocabularies.build_codec()
24
+ vocabularies.build_codec.vocab_config = %VOCAB_CONFIG
25
+
26
+ # ------------------- Optimizer ------------------------------------------------
27
+ # `learning_rate` is set by `Trainer.learning_rate_fn`.
28
+ OPTIMIZER = @adafactor.Adafactor()
29
+ adafactor.Adafactor:
30
+ decay_rate = 0.8
31
+ step_offset = 0
32
+ logical_factor_rules = @adafactor.standard_logical_factor_rules()
33
+
34
+ # ------------------- Model ----------------------------------------------------
35
+ SPECTROGRAM_CONFIG = @spectrograms.SpectrogramConfig()
36
+ MODEL = @models.ContinuousInputsEncoderDecoderModel()
37
+ models.ContinuousInputsEncoderDecoderModel:
38
+ module = @network.Transformer()
39
+ input_vocabulary = @seqio.vocabularies.PassThroughVocabulary()
40
+ output_vocabulary = %OUTPUT_VOCABULARY
41
+ optimizer_def = %OPTIMIZER
42
+ input_depth = @spectrograms.input_depth()
43
+ seqio.vocabularies.PassThroughVocabulary.size = 0
44
+ spectrograms.input_depth.spectrogram_config = %SPECTROGRAM_CONFIG
45
+
46
+ # ------------------- Network specification ------------------------------------
47
+ network.Transformer.config = @network.T5Config()
48
+ network.T5Config:
49
+ vocab_size = @vocabularies.num_embeddings()
50
+ dtype = 'float32'
51
+ emb_dim = 512
52
+ num_heads = 6
53
+ num_encoder_layers = 8
54
+ num_decoder_layers = 8
55
+ head_dim = 64
56
+ mlp_dim = 1024
57
+ mlp_activations = ('gelu', 'linear')
58
+ dropout_rate = 0.1
59
+ logits_via_embedding = False
60
+ vocabularies.num_embeddings.vocabulary = %OUTPUT_VOCABULARY
mt3/gin/mt3.gin ADDED
@@ -0,0 +1,9 @@
1
+ # Configuration for MT3 multi-task multitrack model.
2
+
3
+ TASK_PREFIX = 'mega_notes_ties'
4
+ TASK_FEATURE_LENGTHS = {'inputs': 256, 'targets': 1024}
5
+ TRAIN_STEPS = 1000000
6
+ NUM_VELOCITY_BINS = 1
7
+ PROGRAM_GRANULARITY = 'full'
8
+ ONSETS_ONLY = False
9
+ USE_TIES = True
mt3/gin/train.gin ADDED
@@ -0,0 +1,148 @@
1
+ # Defaults for training with train.py.
2
+ #
3
+ # You must also include a binding for MODEL.
4
+ #
5
+ # Required to be set:
6
+ #
7
+ # - TASK_PREFIX
8
+ # - TASK_FEATURE_LENGTHS
9
+ # - TRAIN_STEPS
10
+ # - MODEL_DIR
11
+ #
12
+ # Commonly overridden options:
13
+ # - BATCH_SIZE
14
+ # - PjitPartitioner.num_partitions
15
+ # - Trainer.num_microbatches
16
+ # - USE_CACHED_TASKS: Whether to look for preprocessed SeqIO data, or preprocess
17
+ # on the fly.
18
+
19
+ from __gin__ import dynamic_registration
20
+
21
+ import __main__ as train_script
22
+ import seqio
23
+ from mt3 import mixing
24
+ from mt3 import preprocessors
25
+ from mt3 import tasks
26
+ from mt3 import vocabularies
27
+ from t5x import gin_utils
28
+ from t5x import partitioning
29
+ from t5x import utils
30
+ from t5x import trainer
31
+
32
+ # Must be overridden
33
+ TASK_PREFIX = %gin.REQUIRED
34
+ TASK_FEATURE_LENGTHS = %gin.REQUIRED
35
+ TRAIN_STEPS = %gin.REQUIRED
36
+ MODEL_DIR = %gin.REQUIRED
37
+
38
+ # Commonly overridden
39
+ TRAIN_TASK_SUFFIX = 'train'
40
+ EVAL_TASK_SUFFIX = 'eval'
41
+ USE_CACHED_TASKS = True
42
+ BATCH_SIZE = 256
43
+
44
+ # Sometimes overridden
45
+ EVAL_STEPS = 20
46
+
47
+ # Convenience overrides.
48
+ EVALUATOR_USE_MEMORY_CACHE = True
49
+ EVALUATOR_NUM_EXAMPLES = None # Use all examples in the infer_eval dataset.
50
+ JSON_WRITE_N_RESULTS = 0 # Don't write any inferences.
51
+
52
+ # Number of velocity bins: set to 1 (no velocity) or 127
53
+ NUM_VELOCITY_BINS = %gin.REQUIRED
54
+ VOCAB_CONFIG = @vocabularies.VocabularyConfig()
55
+ vocabularies.VocabularyConfig.num_velocity_bins = %NUM_VELOCITY_BINS
56
+
57
+ # Program granularity: set to 'flat', 'midi_class', or 'full'
58
+ PROGRAM_GRANULARITY = %gin.REQUIRED
59
+ preprocessors.map_midi_programs.granularity_type = %PROGRAM_GRANULARITY
60
+
61
+ # Maximum number of examples per mix, or None for no mixing
62
+ MAX_EXAMPLES_PER_MIX = None
63
+ mixing.mix_transcription_examples.max_examples_per_mix = %MAX_EXAMPLES_PER_MIX
64
+
65
+ train/tasks.construct_task_name:
66
+ task_prefix = %TASK_PREFIX
67
+ vocab_config = %VOCAB_CONFIG
68
+ task_suffix = %TRAIN_TASK_SUFFIX
69
+
70
+ eval/tasks.construct_task_name:
71
+ task_prefix = %TASK_PREFIX
72
+ vocab_config = %VOCAB_CONFIG
73
+ task_suffix = %EVAL_TASK_SUFFIX
74
+
75
+ train_script.train:
76
+ model = %MODEL # imported from separate gin file
77
+ model_dir = %MODEL_DIR
78
+ train_dataset_cfg = @train/utils.DatasetConfig()
79
+ train_eval_dataset_cfg = @train_eval/utils.DatasetConfig()
80
+ infer_eval_dataset_cfg = @infer_eval/utils.DatasetConfig()
81
+ checkpoint_cfg = @utils.CheckpointConfig()
82
+ partitioner = @partitioning.PjitPartitioner()
83
+ trainer_cls = @trainer.Trainer
84
+ total_steps = %TRAIN_STEPS
85
+ eval_steps = %EVAL_STEPS
86
+ eval_period = 5000
87
+ random_seed = None # use faster, hardware RNG
88
+ summarize_config_fn = @gin_utils.summarize_gin_config
89
+ inference_evaluator_cls = @seqio.Evaluator
90
+
91
+ seqio.Evaluator:
92
+ logger_cls = [@seqio.PyLoggingLogger, @seqio.TensorBoardLogger, @seqio.JSONLogger]
93
+ num_examples = %EVALUATOR_NUM_EXAMPLES
94
+ use_memory_cache = %EVALUATOR_USE_MEMORY_CACHE
95
+
96
+ seqio.JSONLogger:
97
+ write_n_results = %JSON_WRITE_N_RESULTS
98
+
99
+ train/utils.DatasetConfig:
100
+ mixture_or_task_name = @train/tasks.construct_task_name()
101
+ task_feature_lengths = %TASK_FEATURE_LENGTHS
102
+ split = 'train'
103
+ batch_size = %BATCH_SIZE
104
+ shuffle = True
105
+ seed = None # use a new seed each run/restart
106
+ use_cached = %USE_CACHED_TASKS
107
+ pack = False
108
+
109
+ train_eval/utils.DatasetConfig:
110
+ mixture_or_task_name = @train/tasks.construct_task_name()
111
+ task_feature_lengths = %TASK_FEATURE_LENGTHS
112
+ split = 'eval'
113
+ batch_size = %BATCH_SIZE
114
+ shuffle = False
115
+ seed = 42
116
+ use_cached = %USE_CACHED_TASKS
117
+ pack = False
118
+
119
+ infer_eval/utils.DatasetConfig:
120
+ mixture_or_task_name = @eval/tasks.construct_task_name()
121
+ task_feature_lengths = %TASK_FEATURE_LENGTHS
122
+ split = 'eval'
123
+ batch_size = %BATCH_SIZE
124
+ shuffle = False
125
+ seed = 42
126
+ use_cached = %USE_CACHED_TASKS
127
+ pack = False
128
+
129
+ utils.CheckpointConfig:
130
+ restore = None
131
+ save = @utils.SaveCheckpointConfig()
132
+ utils.SaveCheckpointConfig:
133
+ period = 5000
134
+ dtype = 'float32'
135
+ keep = None # keep all checkpoints
136
+ save_dataset = False # don't checkpoint dataset state
137
+
138
+ partitioning.PjitPartitioner:
139
+ num_partitions = 1
140
+ model_parallel_submesh = None
141
+
142
+ trainer.Trainer:
143
+ num_microbatches = None
144
+ learning_rate_fn = @utils.create_learning_rate_scheduler()
145
+ utils.create_learning_rate_scheduler:
146
+ factors = 'constant'
147
+ base_learning_rate = 0.001
148
+ warmup_steps = 1000
mt3/inference.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for MT3 inference."""
16
+
17
+ import functools
18
+ import json
19
+
20
+ from typing import Any, Optional, Sequence
21
+
22
+ import gin
23
+
24
+ from mt3 import metrics_utils
25
+ from mt3 import note_sequences
26
+ from mt3 import tasks
27
+ from mt3 import vocabularies
28
+
29
+ import note_seq
30
+ import seqio
31
+ import tensorflow as tf
32
+
33
+
34
+ def write_inferences_to_file(
35
+ path: str,
36
+ inferences: Sequence[Any],
37
+ task_ds: tf.data.Dataset,
38
+ mode: str,
39
+ vocabulary: Optional[seqio.Vocabulary] = None,
40
+ vocab_config=gin.REQUIRED,
41
+ onsets_only=gin.REQUIRED,
42
+ use_ties=gin.REQUIRED) -> None:
43
+ """Writes model predictions, ground truth transcriptions, and input audio.
44
+
45
+ For now this only works for transcription tasks with ties.
46
+
47
+ Args:
48
+ path: File path to write to.
49
+ inferences: Model inferences, output of predict_batch.
50
+ task_ds: Original task dataset.
51
+ mode: Prediction mode; must be 'predict' as 'score' is not supported.
52
+ vocabulary: Task output vocabulary.
53
+ vocab_config: Vocabulary config object.
54
+ onsets_only: If True, only predict onsets.
55
+ use_ties: If True, use "tie" representation.
56
+ """
57
+ if mode == 'score':
58
+ raise ValueError('`score` mode currently not supported in MT3')
59
+ if not vocabulary:
60
+ raise ValueError('`vocabulary` parameter required in `predict` mode')
61
+
62
+ if onsets_only and use_ties:
63
+ raise ValueError('ties not compatible with onset-only transcription')
64
+ if onsets_only:
65
+ encoding_spec = note_sequences.NoteOnsetEncodingSpec
66
+ elif not use_ties:
67
+ encoding_spec = note_sequences.NoteEncodingSpec
68
+ else:
69
+ encoding_spec = note_sequences.NoteEncodingWithTiesSpec
70
+
71
+ codec = vocabularies.build_codec(vocab_config)
72
+
73
+ targets = []
74
+ predictions = []
75
+
76
+ for inp, output in zip(task_ds.as_numpy_iterator(), inferences):
77
+ tokens = tasks.trim_eos(vocabulary.decode_tf(output).numpy())
78
+
79
+ start_time = inp['input_times'][0]
80
+ # Round down to nearest symbolic token step.
81
+ start_time -= start_time % (1 / codec.steps_per_second)
82
+
83
+ targets.append({
84
+ 'unique_id': inp['unique_id'][0],
85
+ 'ref_ns': inp['sequence'][0] if inp['sequence'][0] else None,
86
+ })
87
+
88
+ predictions.append({
89
+ 'unique_id': inp['unique_id'][0],
90
+ 'est_tokens': tokens,
91
+ 'start_time': start_time,
92
+ # Input audio is not part of the "prediction" but the below call to
93
+ # metrics_utils.event_predictions_to_ns handles the concatenation.
94
+ 'raw_inputs': inp['raw_inputs']
95
+ })
96
+
97
+ # The first target for each full example contains the NoteSequence; just
98
+ # organize by ID.
99
+ full_targets = {}
100
+ for target in targets:
101
+ if target['ref_ns']:
102
+ full_targets[target['unique_id']] = {
103
+ 'ref_ns': note_seq.NoteSequence.FromString(target['ref_ns'])
104
+ }
105
+
106
+ full_predictions = metrics_utils.combine_predictions_by_id(
107
+ predictions=predictions,
108
+ combine_predictions_fn=functools.partial(
109
+ metrics_utils.event_predictions_to_ns,
110
+ codec=codec,
111
+ encoding_spec=encoding_spec))
112
+
113
+ assert sorted(full_targets.keys()) == sorted(full_predictions.keys())
114
+
115
+ full_target_prediction_pairs = [
116
+ (full_targets[id], full_predictions[id])
117
+ for id in sorted(full_targets.keys())
118
+ ]
119
+
120
+ def note_to_dict(note):
121
+ return {
122
+ 'start_time': note.start_time,
123
+ 'end_time': note.end_time,
124
+ 'pitch': note.pitch,
125
+ 'velocity': note.velocity,
126
+ 'program': note.program,
127
+ 'is_drum': note.is_drum
128
+ }
129
+
130
+ with tf.io.gfile.GFile(path, 'w') as f:
131
+ for target, prediction in full_target_prediction_pairs:
132
+ json_dict = {
133
+ 'id': target['ref_ns'].id,
134
+ 'est_notes':
135
+ [note_to_dict(note) for note in prediction['est_ns'].notes]
136
+ }
137
+ json_str = json.dumps(json_dict, cls=seqio.TensorAndNumpyEncoder)
138
+ f.write(json_str + '\n')
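For reference, each line that write_inferences_to_file emits is one JSON object assembled via note_to_dict, roughly of the following shape (a sketch inferred from the code above; the field values are made up for illustration):

{"id": "<id of the reference NoteSequence>", "est_notes": [{"start_time": 0.0, "end_time": 0.47, "pitch": 60, "velocity": 100, "program": 0, "is_drum": false}, ...]}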
mt3/layers.py ADDED
@@ -0,0 +1,830 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Dense attention classes and mask/weighting functions."""
16
+
17
+ # pylint: disable=attribute-defined-outside-init,g-bare-generic
18
+
19
+ import dataclasses
20
+ import functools
21
+ import operator
22
+ from typing import Any, Callable, Iterable, Optional, Sequence, Tuple, Union
23
+
24
+ from flax import linen as nn
25
+ from flax.linen import partitioning as nn_partitioning
26
+ import jax
27
+ from jax import lax
28
+ from jax import random
29
+ import jax.numpy as jnp
30
+ import numpy as np
31
+
32
+
33
+ # from flax.linen.partitioning import param_with_axes, with_sharding_constraint
34
+ param_with_axes = nn_partitioning.param_with_axes
35
+ with_sharding_constraint = nn_partitioning.with_sharding_constraint
36
+
37
+
38
+ # Type annotations
39
+ Array = jnp.ndarray
40
+ DType = jnp.dtype
41
+ PRNGKey = jnp.ndarray
42
+ Shape = Iterable[int]
43
+ Activation = Callable[..., Array]
44
+ # Parameter initializers.
45
+ Initializer = Callable[[PRNGKey, Shape, DType], Array]
46
+
47
+ default_embed_init = nn.initializers.variance_scaling(
48
+ 1.0, 'fan_in', 'normal', out_axis=0)
49
+
50
+
51
+ def sinusoidal(min_scale: float = 1.0,
52
+ max_scale: float = 10000.0,
53
+ dtype: DType = jnp.float32) -> Initializer:
54
+ """Creates 1D Sinusoidal Position Embedding Initializer.
55
+
56
+ Args:
57
+ min_scale: Minimum frequency-scale in sine grating.
58
+ max_scale: Maximum frequency-scale in sine grating.
59
+ dtype: The DType of the returned values.
60
+
61
+ Returns:
62
+ The sinusoidal initialization function.
63
+ """
64
+
65
+ def init(key: PRNGKey, shape: Shape, dtype: DType = dtype) -> Array:
66
+ """Sinusoidal init."""
67
+ del key
68
+ if dtype != np.float32:
69
+ raise ValueError('The sinusoidal initializer only supports float32.')
70
+ if len(list(shape)) != 2:
71
+ raise ValueError(
72
+ f'Expected a 2D shape (max_len, features), but got {shape}.')
73
+ max_len, features = shape
74
+ pe = np.zeros((max_len, features), dtype=dtype)
75
+ position = np.arange(0, max_len)[:, np.newaxis]
76
+ scale_factor = -np.log(max_scale / min_scale) / (features // 2 - 1)
77
+ div_term = min_scale * np.exp(np.arange(0, features // 2) * scale_factor)
78
+ pe[:, :features // 2] = np.sin(position * div_term)
79
+ pe[:, features // 2:2 * (features // 2)] = np.cos(position * div_term)
80
+ return jnp.array(pe)
81
+
82
+ return init
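A quick sketch of using the initializer above on its own; the (2048, 512) shape is only illustrative, and the key is None because the init is deterministic:

import numpy as np

init_fn = sinusoidal()
table = init_fn(None, (2048, 512), np.float32)
# table has shape (2048, 512); row i is the sinusoidal encoding of position i.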
83
+
84
+
85
+ def dot_product_attention(query: Array,
86
+ key: Array,
87
+ value: Array,
88
+ bias: Optional[Array] = None,
89
+ dropout_rng: Optional[PRNGKey] = None,
90
+ dropout_rate: float = 0.,
91
+ deterministic: bool = False,
92
+ dtype: DType = jnp.float32,
93
+ float32_logits: bool = False):
94
+ """Computes dot-product attention given query, key, and value.
95
+
96
+ This is the core function for applying attention based on
97
+ https://arxiv.org/abs/1706.03762. It calculates the attention weights given
98
+ query and key and combines the values using the attention weights.
99
+
100
+ Args:
101
+ query: queries for calculating attention with shape of `[batch, q_length,
102
+ num_heads, qk_depth_per_head]`.
103
+ key: keys for calculating attention with shape of `[batch, kv_length,
104
+ num_heads, qk_depth_per_head]`.
105
+ value: values to be used in attention with shape of `[batch, kv_length,
106
+ num_heads, v_depth_per_head]`.
107
+ bias: bias for the attention weights. This should be broadcastable to the
108
+ shape `[batch, num_heads, q_length, kv_length]` This can be used for
109
+ incorporating causal masks, padding masks, proximity bias, etc.
110
+ dropout_rng: JAX PRNGKey: to be used for dropout
111
+ dropout_rate: dropout rate
112
+ deterministic: bool, deterministic or not (to apply dropout)
113
+ dtype: the dtype of the computation (default: float32)
114
+ float32_logits: bool, if True then compute logits in float32 to avoid
115
+ numerical issues with bfloat16.
116
+
117
+ Returns:
118
+ Output of shape `[batch, length, num_heads, v_depth_per_head]`.
119
+ """
120
+ assert key.ndim == query.ndim == value.ndim, 'q, k, v must have same rank.'
121
+ assert query.shape[:-3] == key.shape[:-3] == value.shape[:-3], (
122
+ 'q, k, v batch dims must match.')
123
+ assert query.shape[-2] == key.shape[-2] == value.shape[-2], (
124
+ 'q, k, v num_heads must match.')
125
+ assert key.shape[-3] == value.shape[-3], 'k, v lengths must match.'
126
+ assert query.shape[-1] == key.shape[-1], 'q, k depths must match.'
127
+
128
+ # Cast logits and softmax computation to float32 for model stability.
129
+ if float32_logits:
130
+ query = query.astype(jnp.float32)
131
+ key = key.astype(jnp.float32)
132
+
133
+ # `attn_weights`: [batch, num_heads, q_length, kv_length]
134
+ attn_weights = jnp.einsum('bqhd,bkhd->bhqk', query, key)
135
+
136
+ # Apply attention bias: masking, dropout, proximity bias, etc.
137
+ if bias is not None:
138
+ attn_weights = attn_weights + bias.astype(attn_weights.dtype)
139
+
140
+ # Normalize the attention weights across `kv_length` dimension.
141
+ attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
142
+
143
+ # Apply attention dropout.
144
+ if not deterministic and dropout_rate > 0.:
145
+ keep_prob = 1.0 - dropout_rate
146
+ # T5 broadcasts dropout along the "length" dim, but it is unclear which
147
+ # positional dimension that corresponds to here; we assume the query dim.
148
+ dropout_shape = list(attn_weights.shape)
149
+ dropout_shape[-2] = 1
150
+ keep = random.bernoulli(dropout_rng, keep_prob, dropout_shape)
151
+ keep = jnp.broadcast_to(keep, attn_weights.shape)
152
+ multiplier = (
153
+ keep.astype(attn_weights.dtype) / jnp.asarray(keep_prob, dtype=dtype))
154
+ attn_weights = attn_weights * multiplier
155
+
156
+ # Take the linear combination of `value`.
157
+ return jnp.einsum('bhqk,bkhd->bqhd', attn_weights, value)
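A shape-only sketch of calling dot_product_attention directly; all sizes are illustrative:

import jax.numpy as jnp

q = jnp.ones((2, 3, 4, 8))   # [batch, q_length, num_heads, qk_depth_per_head]
k = jnp.ones((2, 5, 4, 8))   # [batch, kv_length, num_heads, qk_depth_per_head]
v = jnp.ones((2, 5, 4, 16))  # [batch, kv_length, num_heads, v_depth_per_head]
out = dot_product_attention(q, k, v, deterministic=True)
# out.shape == (2, 3, 4, 16): [batch, q_length, num_heads, v_depth_per_head]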
158
+
159
+
160
+ dynamic_vector_slice_in_dim = jax.vmap(
161
+ lax.dynamic_slice_in_dim, in_axes=(None, 0, None, None))
162
+
163
+
164
+ class MultiHeadDotProductAttention(nn.Module):
165
+ """Multi-head dot-product attention.
166
+
167
+ Attributes:
168
+ num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
169
+ should be divisible by the number of heads.
170
+ head_dim: dimension of each head.
171
+ dtype: the dtype of the computation.
172
+ dropout_rate: dropout rate
173
+ kernel_init: initializer for the kernel of the Dense layers.
174
+ float32_logits: bool, if True then compute logits in float32 to avoid
175
+ numerical issues with bfloat16.
176
+ """
177
+
178
+ num_heads: int
179
+ head_dim: int
180
+ dtype: DType = jnp.float32
181
+ dropout_rate: float = 0.
182
+ kernel_init: Initializer = nn.initializers.variance_scaling(
183
+ 1.0, 'fan_in', 'normal')
184
+ float32_logits: bool = False # computes logits in float32 for stability.
185
+
186
+ @nn.compact
187
+ def __call__(self,
188
+ inputs_q: Array,
189
+ inputs_kv: Array,
190
+ mask: Optional[Array] = None,
191
+ bias: Optional[Array] = None,
192
+ *,
193
+ decode: bool = False,
194
+ deterministic: bool = False) -> Array:
195
+ """Applies multi-head dot product attention on the input data.
196
+
197
+ Projects the inputs into multi-headed query, key, and value vectors,
198
+ applies dot-product attention, and projects the results to an output vector.
199
+
200
+ There are two modes: decoding and non-decoding (e.g., training). The mode is
201
+ determined by the `decode` argument. For decoding, this method is called twice,
202
+ first to initialize the cache and then for an actual decoding process. The
203
+ two calls are differentiated by the presence of 'cached_key' in the variable
204
+ dict. In the cache initialization stage, the cache variables are initialized
205
+ as zeros and will be filled in the subsequent decoding process.
206
+
207
+ In the cache initialization call, `inputs_q` has a shape [batch, length,
208
+ q_features] and `inputs_kv`: [batch, length, kv_features]. During the
209
+ incremental decoding stage, query, key and value all have the shape [batch,
210
+ 1, qkv_features] corresponding to a single step.
211
+
212
+ Args:
213
+ inputs_q: input queries of shape `[batch, q_length, q_features]`.
214
+ inputs_kv: key/values of shape `[batch, kv_length, kv_features]`.
215
+ mask: attention mask of shape `[batch, num_heads, q_length, kv_length]`.
216
+ bias: attention bias of shape `[batch, num_heads, q_length, kv_length]`.
217
+ decode: Whether to prepare and use an autoregressive cache.
218
+ deterministic: Disables dropout if set to True.
219
+
220
+ Returns:
221
+ output of shape `[batch, length, q_features]`.
222
+ """
223
+ projection = functools.partial(
224
+ DenseGeneral,
225
+ axis=-1,
226
+ features=(self.num_heads, self.head_dim),
227
+ kernel_axes=('embed', 'joined_kv'),
228
+ dtype=self.dtype)
229
+
230
+ # NOTE: T5 does not explicitly rescale the attention logits by
231
+ # 1/sqrt(depth_kq)! This is folded into the initializers of the
232
+ # linear transformations, which is equivalent under Adafactor.
233
+ depth_scaling = jnp.sqrt(self.head_dim).astype(self.dtype)
234
+ query_init = lambda *args: self.kernel_init(*args) / depth_scaling
235
+
236
+ # Project inputs_q to multi-headed q/k/v
237
+ # dimensions are then [batch, length, num_heads, head_dim]
238
+ query = projection(kernel_init=query_init, name='query')(inputs_q)
239
+ key = projection(kernel_init=self.kernel_init, name='key')(inputs_kv)
240
+ value = projection(kernel_init=self.kernel_init, name='value')(inputs_kv)
241
+
242
+ query = with_sharding_constraint(query, ('batch', 'length', 'heads', 'kv'))
243
+ key = with_sharding_constraint(key, ('batch', 'length', 'heads', 'kv'))
244
+ value = with_sharding_constraint(value, ('batch', 'length', 'heads', 'kv'))
245
+
246
+ if decode:
247
+ # Detect if we're initializing by absence of existing cache data.
248
+ is_initialized = self.has_variable('cache', 'cached_key')
249
+ # The key and value have dimension [batch, length, num_heads, head_dim],
250
+ # but we cache them as [batch, num_heads, head_dim, length] as a TPU
251
+ # fusion optimization. This also enables the "scatter via one-hot
252
+ # broadcast" trick, which means we do a one-hot broadcast instead of a
253
+ # scatter/gather operation, resulting in a 3-4x speedup in practice.
254
+ swap_dims = lambda x: x[:-3] + tuple(x[i] for i in [-2, -1, -3])
255
+ cached_key = self.variable('cache', 'cached_key', jnp.zeros,
256
+ swap_dims(key.shape), key.dtype)
257
+ cached_value = self.variable('cache', 'cached_value', jnp.zeros,
258
+ swap_dims(value.shape), value.dtype)
259
+ cache_index = self.variable('cache', 'cache_index',
260
+ lambda: jnp.array(0, dtype=jnp.int32))
261
+ if is_initialized:
262
+ batch, num_heads, head_dim, length = (cached_key.value.shape)
263
+ # During fast autoregressive decoding, we feed one position at a time,
264
+ # and cache the keys and values step by step.
265
+ # Sanity shape check of cached key against input query.
266
+ expected_shape = (batch, 1, num_heads, head_dim)
267
+ if expected_shape != query.shape:
268
+ raise ValueError('Autoregressive cache shape error, '
269
+ 'expected query shape %s instead got %s.' %
270
+ (expected_shape, query.shape))
271
+
272
+ # Create a OHE of the current index. NOTE: the index is increased below.
273
+ cur_index = cache_index.value
274
+ one_hot_indices = jax.nn.one_hot(cur_index, length, dtype=key.dtype)
275
+ # In order to update the key, value caches with the current key and
276
+ # value, we move the length axis to the back, similar to what we did for
277
+ # the cached ones above.
278
+ # Note these are currently the key and value of a single position, since
279
+ # we feed one position at a time.
280
+ one_token_key = jnp.moveaxis(key, -3, -1)
281
+ one_token_value = jnp.moveaxis(value, -3, -1)
282
+ # Update key, value caches with our new 1d spatial slices.
283
+ # We implement an efficient scatter into the cache via one-hot
284
+ # broadcast and addition.
285
+ key = cached_key.value + one_token_key * one_hot_indices
286
+ value = cached_value.value + one_token_value * one_hot_indices
287
+ cached_key.value = key
288
+ cached_value.value = value
289
+ cache_index.value = cache_index.value + 1
290
+ # Move the keys and values back to their original shapes.
291
+ key = jnp.moveaxis(key, -1, -3)
292
+ value = jnp.moveaxis(value, -1, -3)
293
+
294
+ # Causal mask for cached decoder self-attention: our single query
295
+ # position should only attend to those key positions that have already
296
+ # been generated and cached, not the remaining zero elements.
297
+ mask = combine_masks(
298
+ mask,
299
+ jnp.broadcast_to(
300
+ jnp.arange(length) <= cur_index,
301
+ # (1, 1, length) represent (head dim, query length, key length)
302
+ # query length is 1 because during decoding we deal with one
303
+ # index.
304
+ # The same mask is applied to all batch elements and heads.
305
+ (batch, 1, 1, length)))
306
+
307
+ # Grab the correct relative attention bias during decoding. This is
308
+ # only required during single step decoding.
309
+ if bias is not None:
310
+ # The bias is a full attention matrix, but during decoding we only
311
+ # have to take a slice of it.
312
+ # This is equivalent to bias[..., cur_index:cur_index+1, :].
313
+ bias = dynamic_vector_slice_in_dim(
314
+ jnp.squeeze(bias, axis=0), jnp.reshape(cur_index, (-1)), 1, -2)
315
+
316
+ # Convert the boolean attention mask to an attention bias.
317
+ if mask is not None:
318
+ # attention mask in the form of attention bias
319
+ attention_bias = lax.select(
320
+ mask > 0,
321
+ jnp.full(mask.shape, 0.).astype(self.dtype),
322
+ jnp.full(mask.shape, -1e10).astype(self.dtype))
323
+ else:
324
+ attention_bias = None
325
+
326
+ # Add provided bias term (e.g. relative position embedding).
327
+ if bias is not None:
328
+ attention_bias = combine_biases(attention_bias, bias)
329
+
330
+ dropout_rng = None
331
+ if not deterministic and self.dropout_rate > 0.:
332
+ dropout_rng = self.make_rng('dropout')
333
+
334
+ # Apply attention.
335
+ x = dot_product_attention(
336
+ query,
337
+ key,
338
+ value,
339
+ bias=attention_bias,
340
+ dropout_rng=dropout_rng,
341
+ dropout_rate=self.dropout_rate,
342
+ deterministic=deterministic,
343
+ dtype=self.dtype,
344
+ float32_logits=self.float32_logits)
345
+
346
+ # Back to the original inputs dimensions.
347
+ out = DenseGeneral(
348
+ features=inputs_q.shape[-1], # output dim is set to the input dim.
349
+ axis=(-2, -1),
350
+ kernel_init=self.kernel_init,
351
+ kernel_axes=('joined_kv', 'embed'),
352
+ dtype=self.dtype,
353
+ name='out')(
354
+ x)
355
+ return out
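A hedged sketch of the two-call decode protocol described in the docstring: the first call (here via init) creates the zero-filled cache, and each later call with decode=True feeds a single position and mutates the cache. All sizes are illustrative.

import jax
import jax.numpy as jnp

attn = MultiHeadDotProductAttention(num_heads=2, head_dim=4)
x = jnp.ones((1, 8, 16))  # [batch, length, features]; length sizes the cache
variables = attn.init(jax.random.PRNGKey(0), x, x, decode=True)
params, cache = variables['params'], variables['cache']

step = jnp.ones((1, 1, 16))  # a single decode position
y, mutated = attn.apply({'params': params, 'cache': cache}, step, step,
                        decode=True, mutable=['cache'])
cache = mutated['cache']  # carried forward to the next decode step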
356
+
357
+
358
+ def _normalize_axes(axes: Iterable[int], ndim: int) -> Tuple[int, ...]:
359
+ # A tuple by convention. len(axes_tuple) then also gives the rank efficiently.
360
+ return tuple([ax if ax >= 0 else ndim + ax for ax in axes])
361
+
362
+
363
+ def _canonicalize_tuple(x):
364
+ if isinstance(x, Iterable):
365
+ return tuple(x)
366
+ else:
367
+ return (x,)
368
+
369
+
370
+ #------------------------------------------------------------------------------
371
+ # DenseGeneral for attention layers.
372
+ #------------------------------------------------------------------------------
373
+ class DenseGeneral(nn.Module):
374
+ """A linear transformation (without bias) with flexible axes.
375
+
376
+ Attributes:
377
+ features: tuple with numbers of output features.
378
+ axis: tuple with axes to apply the transformation on.
379
+ dtype: the dtype of the computation (default: float32).
380
+ kernel_init: initializer function for the weight matrix.
381
+ """
382
+ features: Union[Iterable[int], int]
383
+ axis: Union[Iterable[int], int] = -1
384
+ dtype: DType = jnp.float32
385
+ kernel_init: Initializer = nn.initializers.variance_scaling(
386
+ 1.0, 'fan_in', 'truncated_normal')
387
+ kernel_axes: Tuple[str, ...] = ()
388
+
389
+ @nn.compact
390
+ def __call__(self, inputs: Array) -> Array:
391
+ """Applies a linear transformation to the inputs along multiple dimensions.
392
+
393
+ Args:
394
+ inputs: The nd-array to be transformed.
395
+
396
+ Returns:
397
+ The transformed input.
398
+ """
399
+ features = _canonicalize_tuple(self.features)
400
+ axis = _canonicalize_tuple(self.axis)
401
+
402
+ inputs = jnp.asarray(inputs, self.dtype)
403
+ axis = _normalize_axes(axis, inputs.ndim)
404
+
405
+ kernel_shape = tuple([inputs.shape[ax] for ax in axis]) + features
406
+ kernel_param_shape = (np.prod([inputs.shape[ax] for ax in axis]),
407
+ np.prod(features))
408
+ kernel = param_with_axes(
409
+ 'kernel',
410
+ self.kernel_init,
411
+ kernel_param_shape,
412
+ jnp.float32,
413
+ axes=self.kernel_axes)
414
+ kernel = jnp.asarray(kernel, self.dtype)
415
+ kernel = jnp.reshape(kernel, kernel_shape)
416
+
417
+ contract_ind = tuple(range(0, len(axis)))
418
+ return lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ())))
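A small sketch of DenseGeneral splitting the embedding axis into separate (heads, head_dim) output axes, as the attention projections above do; the sizes and axis names are illustrative:

import jax
import jax.numpy as jnp

dense = DenseGeneral(
    features=(2, 4), axis=-1, kernel_axes=('embed', 'joined_kv'))
y, _ = dense.init_with_output(jax.random.PRNGKey(0), jnp.ones((1, 3, 16)))
# y.shape == (1, 3, 2, 4): the last input axis (16) is projected to (2, 4).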
419
+
420
+
421
+ def _convert_to_activation_function(
422
+ fn_or_string: Union[str, Callable]) -> Callable:
423
+ """Convert a string to an activation function."""
424
+ if fn_or_string == 'linear':
425
+ return lambda x: x
426
+ elif isinstance(fn_or_string, str):
427
+ return getattr(nn, fn_or_string)
428
+ elif callable(fn_or_string):
429
+ return fn_or_string
430
+ else:
431
+ raise ValueError("don't know how to convert %s to an activation function" %
432
+ (fn_or_string,))
433
+
434
+
435
+ class MlpBlock(nn.Module):
436
+ """Transformer MLP / feed-forward block.
437
+
438
+ Attributes:
439
+ intermediate_dim: Shared dimension of hidden layers.
440
+ activations: Type of activations for each layer. Each element is either
441
+ 'linear', a string function name in flax.linen, or a function.
442
+ kernel_init: Kernel function, passed to the dense layers.
443
+ deterministic: Whether the dropout layers should be deterministic.
444
+ intermediate_dropout_rate: Dropout rate used after the intermediate layers.
445
+ dtype: Type for the dense layer.
446
+ """
447
+ intermediate_dim: int = 2048
448
+ activations: Sequence[Union[str, Callable]] = ('relu',)
449
+ kernel_init: Initializer = nn.initializers.variance_scaling(
450
+ 1.0, 'fan_in', 'truncated_normal')
451
+ intermediate_dropout_rate: float = 0.1
452
+ dtype: Any = jnp.float32
453
+
454
+ @nn.compact
455
+ def __call__(self, inputs, decode: bool = False, deterministic: bool = False):
456
+ """Applies Transformer MlpBlock module."""
457
+ # Iterate over specified MLP input activation functions.
458
+ # e.g. ('relu',) or ('gelu', 'linear') for gated-gelu.
459
+ activations = []
460
+ for idx, act_fn in enumerate(self.activations):
461
+ dense_name = 'wi' if len(self.activations) == 1 else f'wi_{idx}'
462
+ x = DenseGeneral(
463
+ self.intermediate_dim,
464
+ dtype=self.dtype,
465
+ kernel_init=self.kernel_init,
466
+ kernel_axes=('embed', 'mlp'),
467
+ name=dense_name)(
468
+ inputs)
469
+ x = _convert_to_activation_function(act_fn)(x)
470
+ activations.append(x)
471
+
472
+ # Take elementwise product of above intermediate activations.
473
+ x = functools.reduce(operator.mul, activations)
474
+ # Apply dropout and final dense output projection.
475
+ x = nn.Dropout(
476
+ rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))(
477
+ x, deterministic=deterministic) # Broadcast along length.
478
+ x = with_sharding_constraint(x, ('batch', 'length', 'mlp'))
479
+ output = DenseGeneral(
480
+ inputs.shape[-1],
481
+ dtype=self.dtype,
482
+ kernel_init=self.kernel_init,
483
+ kernel_axes=('mlp', 'embed'),
484
+ name='wo')(
485
+ x)
486
+ return output
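A sketch of the gated path mentioned in the comments: with activations=('gelu', 'linear') the block creates two parallel 'wi_*' projections, multiplies them elementwise, then applies the 'wo' projection. Sizes are illustrative.

import jax
import jax.numpy as jnp

mlp = MlpBlock(intermediate_dim=8, activations=('gelu', 'linear'))
x = jnp.ones((2, 5, 4))  # [batch, length, embed]
variables = mlp.init(jax.random.PRNGKey(0), x, deterministic=True)
y = mlp.apply(variables, x, deterministic=True)  # y.shape == (2, 5, 4)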
487
+
488
+
489
+ class Embed(nn.Module):
490
+ """A parameterized function from integers [0, n) to d-dimensional vectors.
491
+
492
+ Attributes:
493
+ num_embeddings: number of embeddings.
494
+ features: number of feature dimensions for each embedding.
495
+ dtype: the dtype of the embedding vectors (default: float32).
496
+ embedding_init: embedding initializer.
497
+ one_hot: performs the gather with a one-hot contraction rather than a true
498
+ gather. This is currently needed for SPMD partitioning.
499
+ """
500
+ num_embeddings: int
501
+ features: int
502
+ cast_input_dtype: Optional[DType] = None
503
+ dtype: DType = jnp.float32
504
+ attend_dtype: Optional[DType] = None
505
+ embedding_init: Initializer = default_embed_init
506
+ one_hot: bool = False
507
+ embedding: Array = dataclasses.field(init=False)
508
+
509
+ def setup(self):
510
+ self.embedding = param_with_axes(
511
+ 'embedding',
512
+ self.embedding_init, (self.num_embeddings, self.features),
513
+ jnp.float32,
514
+ axes=('vocab', 'embed'))
515
+
516
+ def __call__(self, inputs: Array) -> Array:
517
+ """Embeds the inputs along the last dimension.
518
+
519
+ Args:
520
+ inputs: input data, all dimensions are considered batch dimensions.
521
+
522
+ Returns:
523
+ Output which is embedded input data. The output shape follows the input,
524
+ with an additional `features` dimension appended.
525
+ """
526
+ if self.cast_input_dtype:
527
+ inputs = inputs.astype(self.cast_input_dtype)
528
+ if not jnp.issubdtype(inputs.dtype, jnp.integer):
529
+ raise ValueError('Input type must be an integer or unsigned integer.')
530
+ if self.one_hot:
531
+ iota = lax.iota(jnp.int32, self.num_embeddings)
532
+ one_hot = jnp.array(inputs[..., jnp.newaxis] == iota, dtype=self.dtype)
533
+ output = jnp.dot(one_hot, jnp.asarray(self.embedding, self.dtype))
534
+ else:
535
+ output = jnp.asarray(self.embedding, self.dtype)[inputs]
536
+ output = with_sharding_constraint(output, ('batch', 'length', 'embed'))
537
+ return output
538
+
539
+ def attend(self, query: Array) -> Array:
540
+ """Attend over the embedding using a query array.
541
+
542
+ Args:
543
+ query: array with last dimension equal the feature depth `features` of the
544
+ embedding.
545
+
546
+ Returns:
547
+ An array with final dim `num_embeddings` corresponding to the batched
548
+ inner-product of the array of query vectors against each embedding.
549
+ Commonly used for weight-sharing between embeddings and logit transform
550
+ in NLP models.
551
+ """
552
+ dtype = self.attend_dtype if self.attend_dtype is not None else self.dtype
553
+ return jnp.dot(query, jnp.asarray(self.embedding, dtype).T)
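A short sketch of the one_hot path: the gather is replaced by a one-hot matmul (friendlier to SPMD partitioning) and yields the same embeddings as a plain lookup. Sizes are illustrative.

import jax
import jax.numpy as jnp

embed = Embed(num_embeddings=10, features=4, one_hot=True)
ids = jnp.array([[1, 2, 3]])
variables = embed.init(jax.random.PRNGKey(0), ids)
vectors = embed.apply(variables, ids)  # vectors.shape == (1, 3, 4)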
554
+
555
+
556
+ class FixedEmbed(nn.Module):
557
+ """Fixed (not learnable) embeddings specified by the initializer function.
558
+
559
+ Attributes:
560
+ embedding_init: The initializer function that defines the embeddings.
561
+ max_length: The maximum supported length.
562
+ dtype: The DType to use for the embeddings.
563
+ """
564
+ features: int
565
+ max_length: int = 2048
566
+ embedding_init: Initializer = sinusoidal()
567
+ dtype: jnp.dtype = jnp.float32
568
+
569
+ def setup(self):
570
+ # The key is set to None because sinusoid init is deterministic.
571
+ shape = (self.max_length, self.features)
572
+ self.embedding = self.embedding_init(None, shape, self.dtype) # pylint: disable=too-many-function-args
573
+
574
+ @nn.compact
575
+ def __call__(self,
576
+ inputs,
577
+ *,
578
+ decode: bool = False):
579
+ """Returns the fixed position embeddings specified by the initializer.
580
+
581
+ Args:
582
+ inputs: <int>[batch_size, seq_len] input position indices.
583
+ decode: True if running in single-position autoregressive decode mode.
584
+
585
+ Returns:
586
+ The fixed position embeddings <float32>[batch_size, seq_len, features].
587
+ """
588
+ # We use a cache position index for tracking decoding position.
589
+ if decode:
590
+ position_embedder_index = self.variable(
591
+ 'cache', 'position_embedder_index',
592
+ lambda: jnp.array(-1, dtype=jnp.uint32))
593
+ i = position_embedder_index.value
594
+ position_embedder_index.value = i + 1
595
+ return jax.lax.dynamic_slice(self.embedding, jnp.array((i, 0)),
596
+ np.array((1, self.features)))
597
+
598
+ return jnp.take(self.embedding, inputs, axis=0)
599
+
600
+
601
+ #------------------------------------------------------------------------------
602
+ # T5 Layernorm - no subtraction of mean or bias.
603
+ #------------------------------------------------------------------------------
604
+ class LayerNorm(nn.Module):
605
+ """T5 Layer normalization operating on the last axis of the input data."""
606
+ epsilon: float = 1e-6
607
+ dtype: Any = jnp.float32
608
+ scale_init: Initializer = nn.initializers.ones
609
+
610
+ @nn.compact
611
+ def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
612
+ """Applies layer normalization on the input."""
613
+ x = jnp.asarray(x, jnp.float32)
614
+ features = x.shape[-1]
615
+ mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
616
+ y = jnp.asarray(x * lax.rsqrt(mean2 + self.epsilon), self.dtype)
617
+ scale = param_with_axes(
618
+ 'scale', self.scale_init, (features,), jnp.float32, axes=('embed',))
619
+
620
+ scale = jnp.asarray(scale, self.dtype)
621
+ return y * scale
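A small sketch making the "no subtraction of mean or bias" point concrete: this is RMS-style normalization, so with the default unit scale it matches dividing by the root mean square of the last axis. Values are illustrative.

import jax
import jax.numpy as jnp

ln = LayerNorm()
x = jnp.array([[1., 2., 3., 4.]])
variables = ln.init(jax.random.PRNGKey(0), x)
y = ln.apply(variables, x)
manual = x * jax.lax.rsqrt(jnp.mean(x * x, axis=-1, keepdims=True) + 1e-6)
# y and manual agree because the initial scale is all ones.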
622
+
623
+
624
+ #------------------------------------------------------------------------------
625
+ # Mask-making utility functions.
626
+ #------------------------------------------------------------------------------
627
+ def make_attention_mask(query_input: Array,
628
+ key_input: Array,
629
+ pairwise_fn: Callable = jnp.multiply,
630
+ extra_batch_dims: int = 0,
631
+ dtype: DType = jnp.float32) -> Array:
632
+ """Mask-making helper for attention weights.
633
+
634
+ In case of 1d inputs (i.e., `[batch, len_q]`, `[batch, len_kv]`), the
635
+ attention weights will be `[batch, heads, len_q, len_kv]` and this
636
+ function will produce `[batch, 1, len_q, len_kv]`.
637
+
638
+ Args:
639
+ query_input: a batched, flat input of query_length size
640
+ key_input: a batched, flat input of key_length size
641
+ pairwise_fn: broadcasting elementwise comparison function
642
+ extra_batch_dims: number of extra batch dims to add singleton axes for, none
643
+ by default
644
+ dtype: mask return dtype
645
+
646
+ Returns:
647
+ A `[batch, 1, len_q, len_kv]` shaped mask for 1d attention.
648
+ """
649
+ # [batch, len_q, len_kv]
650
+ mask = pairwise_fn(
651
+ # [batch, len_q] -> [batch, len_q, 1]
652
+ jnp.expand_dims(query_input, axis=-1),
653
+ # [batch, len_q] -> [batch, 1, len_kv]
654
+ jnp.expand_dims(key_input, axis=-2))
655
+
656
+ # [batch, 1, len_q, len_kv]. This creates the head dim.
657
+ mask = jnp.expand_dims(mask, axis=-3)
658
+ mask = jnp.expand_dims(mask, axis=tuple(range(extra_batch_dims)))
659
+ return mask.astype(dtype)
660
+
661
+
662
+ def make_causal_mask(x: Array,
663
+ extra_batch_dims: int = 0,
664
+ dtype: DType = jnp.float32) -> Array:
665
+ """Make a causal mask for self-attention.
666
+
667
+ In case of 1d inputs (i.e., `[batch, len]`), the self-attention weights
668
+ will be `[batch, heads, len, len]` and this function will produce a
669
+ causal mask of shape `[batch, 1, len, len]`.
670
+
671
+ Note that a causal mask does not depend on the values of x; it only depends on
672
+ the shape. If x has padding elements, they will not be treated in a special
673
+ manner.
674
+
675
+ Args:
676
+ x: input array of shape `[batch, len]`
677
+ extra_batch_dims: number of batch dims to add singleton axes for, none by
678
+ default
679
+ dtype: mask return dtype
680
+
681
+ Returns:
682
+ A `[batch, 1, len, len]` shaped causal mask for 1d attention.
683
+ """
684
+ idxs = jnp.broadcast_to(jnp.arange(x.shape[-1], dtype=jnp.int32), x.shape)
685
+ return make_attention_mask(
686
+ idxs,
687
+ idxs,
688
+ jnp.greater_equal,
689
+ extra_batch_dims=extra_batch_dims,
690
+ dtype=dtype)
691
+
692
+
693
+ def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
694
+ """Combine attention masks.
695
+
696
+ Args:
697
+ *masks: set of attention mask arguments to combine, some can be None.
698
+ dtype: final mask dtype
699
+
700
+ Returns:
701
+ Combined mask, reduced by logical and, returns None if no masks given.
702
+ """
703
+ masks = [m for m in masks if m is not None]
704
+ if not masks:
705
+ return None
706
+ assert all(map(lambda x: x.ndim == masks[0].ndim, masks)), (
707
+ f'masks must have same rank: {tuple(map(lambda x: x.ndim, masks))}')
708
+ mask, *other_masks = masks
709
+ for other_mask in other_masks:
710
+ mask = jnp.logical_and(mask, other_mask)
711
+ return mask.astype(dtype)
712
+
713
+
714
+ def combine_biases(*masks: Optional[Array]):
715
+ """Combine attention biases.
716
+
717
+ Args:
718
+ *masks: set of attention bias arguments to combine, some can be None.
719
+
720
+ Returns:
721
+ Combined mask, reduced by summation, returns None if no masks given.
722
+ """
723
+ masks = [m for m in masks if m is not None]
724
+ if not masks:
725
+ return None
726
+ assert all(map(lambda x: x.ndim == masks[0].ndim, masks)), (
727
+ f'masks must have same rank: {tuple(map(lambda x: x.ndim, masks))}')
728
+ mask, *other_masks = masks
729
+ for other_mask in other_masks:
730
+ mask = mask + other_mask
731
+ return mask
732
+
733
+
734
+ def make_decoder_mask(decoder_target_tokens: Array,
735
+ dtype: DType,
736
+ decoder_causal_attention: Optional[Array] = None,
737
+ decoder_segment_ids: Optional[Array] = None) -> Array:
738
+ """Compute the self-attention mask for a decoder.
739
+
740
+ Decoder mask is formed by combining a causal mask, a padding mask and an
741
+ optional packing mask. If decoder_causal_attention is passed, it makes the
742
+ masking non-causal for positions that have value of 1.
743
+
744
+ A prefix LM is applied to a dataset which has a notion of "inputs" and
745
+ "targets", e.g., a machine translation task. The inputs and targets are
746
+ concatenated to form a new target. `decoder_target_tokens` is the concatenated
747
+ decoder output tokens.
748
+
749
+ The "inputs" portion of the concatenated sequence can attend to other "inputs"
750
+ tokens, even those at later time steps. In order to control this
751
+ behavior, `decoder_causal_attention` is necessary. This is a binary mask with
752
+ a value of 1 indicating that the position belongs to the "inputs" portion of
753
+ original dataset.
754
+
755
+ Example:
756
+
757
+ Suppose we have a dataset with two examples.
758
+
759
+ ds = [{"inputs": [6, 7], "targets": [8]},
760
+ {"inputs": [3, 4], "targets": [5]}]
761
+
762
+ After the data preprocessing with packing, the two examples are packed into
763
+ one example with the following three fields (some fields are skipped for
764
+ simplicity).
765
+
766
+ decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
767
+ decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
768
+ decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
769
+
770
+ where each array has [batch, length] shape with batch size being 1. Then,
771
+ this function computes the following mask.
772
+
773
+ mask = [[[[1, 1, 0, 0, 0, 0, 0],
774
+ [1, 1, 0, 0, 0, 0, 0],
775
+ [1, 1, 1, 0, 0, 0, 0],
776
+ [0, 0, 0, 1, 1, 0, 0],
777
+ [0, 0, 0, 1, 1, 0, 0],
778
+ [0, 0, 0, 1, 1, 1, 0],
779
+ [0, 0, 0, 0, 0, 0, 0]]]]
780
+
781
+ mask[b, 0, :, :] represents the mask for the example `b` in the batch.
782
+ Because mask is for a self-attention layer, the mask's shape is a square of
783
+ shape [query length, key length].
784
+
785
+ mask[b, 0, i, j] = 1 means that the query token at position i can attend to
786
+ the key token at position j.
787
+
788
+ Args:
789
+ decoder_target_tokens: decoder output tokens. [batch, length]
790
+ dtype: dtype of the output mask.
791
+ decoder_causal_attention: a binary mask indicating which position should
792
+ only attend to earlier positions in the sequence. Others will attend
793
+ bidirectionally. [batch, length]
794
+ decoder_segment_ids: decoder segmentation info for packed examples. [batch,
795
+ length]
796
+
797
+ Returns:
798
+ the combined decoder mask.
799
+ """
800
+ masks = []
801
+ # The same mask is applied to all attention heads. So the head dimension is 1,
802
+ # i.e., the mask will be broadcast along the heads dim.
803
+ # [batch, 1, length, length]
804
+ causal_mask = make_causal_mask(decoder_target_tokens, dtype=dtype)
805
+
806
+ # Positions with value 1 in `decoder_causal_attention` can attend
807
+ # bidirectionally.
808
+ if decoder_causal_attention is not None:
809
+ # [batch, 1, length, length]
810
+ inputs_mask = make_attention_mask(
811
+ decoder_causal_attention,
812
+ decoder_causal_attention,
813
+ jnp.logical_and,
814
+ dtype=dtype)
815
+ masks.append(jnp.logical_or(causal_mask, inputs_mask).astype(dtype))
816
+ else:
817
+ masks.append(causal_mask)
818
+
819
+ # Padding mask.
820
+ masks.append(
821
+ make_attention_mask(
822
+ decoder_target_tokens > 0, decoder_target_tokens > 0, dtype=dtype))
823
+
824
+ # Packing mask
825
+ if decoder_segment_ids is not None:
826
+ masks.append(
827
+ make_attention_mask(
828
+ decoder_segment_ids, decoder_segment_ids, jnp.equal, dtype=dtype))
829
+
830
+ return combine_masks(*masks, dtype=dtype)
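The packed prefix-LM example from the make_decoder_mask docstring, as a runnable sketch (the expected matrix is the one shown above):

import jax.numpy as jnp

decoder_target_tokens = jnp.array([[6, 7, 8, 3, 4, 5, 0]])
decoder_segment_ids = jnp.array([[1, 1, 1, 2, 2, 2, 0]])
decoder_causal_attention = jnp.array([[1, 1, 0, 1, 1, 0, 0]])
mask = make_decoder_mask(
    decoder_target_tokens=decoder_target_tokens,
    dtype=jnp.float32,
    decoder_causal_attention=decoder_causal_attention,
    decoder_segment_ids=decoder_segment_ids)
# mask.shape == (1, 1, 7, 7) and mask[0, 0] matches the matrix in the docstring.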
mt3/layers_test.py ADDED
@@ -0,0 +1,545 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for attention classes."""
16
+
17
+ import dataclasses
18
+ from typing import Optional
19
+ from unittest import mock
20
+
21
+ from absl.testing import absltest
22
+ from absl.testing import parameterized
23
+ from flax import linen as nn
24
+ from flax.core import freeze
25
+ from flax.linen import partitioning as nn_partitioning
26
+ import jax
27
+ from jax import random
28
+ from jax.nn import initializers
29
+ import jax.numpy as jnp
30
+ from mt3 import layers
31
+ import numpy as np
32
+
33
+ # Parse absl flags test_srcdir and test_tmpdir.
34
+ jax.config.parse_flags_with_absl()
35
+
36
+ Array = jnp.ndarray
37
+ AxisMetadata = nn_partitioning.AxisMetadata # pylint: disable=invalid-name
38
+
39
+
40
+ class SelfAttention(layers.MultiHeadDotProductAttention):
41
+ """Self-attention special case of multi-head dot-product attention."""
42
+
43
+ @nn.compact
44
+ def __call__(self,
45
+ inputs_q: Array,
46
+ mask: Optional[Array] = None,
47
+ bias: Optional[Array] = None,
48
+ deterministic: bool = False):
49
+ return super().__call__(
50
+ inputs_q, inputs_q, mask, bias, deterministic=deterministic)
51
+
52
+
53
+ @dataclasses.dataclass(frozen=True)
54
+ class SelfAttentionArgs:
55
+ num_heads: int = 1
56
+ batch_size: int = 2
57
+ # qkv_features: int = 3
58
+ head_dim: int = 3
59
+ # out_features: int = 4
60
+ q_len: int = 5
61
+ features: int = 6
62
+ dropout_rate: float = 0.1
63
+ deterministic: bool = False
64
+ decode: bool = False
65
+ float32_logits: bool = False
66
+
67
+ def __post_init__(self):
68
+ # If we are doing decoding, the query length should be 1, because we are doing
69
+ # autoregressive decoding where we feed one position at a time.
70
+ assert not self.decode or self.q_len == 1
71
+
72
+ def init_args(self):
73
+ return dict(
74
+ num_heads=self.num_heads,
75
+ head_dim=self.head_dim,
76
+ dropout_rate=self.dropout_rate,
77
+ float32_logits=self.float32_logits)
78
+
79
+ def apply_args(self):
80
+ inputs_q = jnp.ones((self.batch_size, self.q_len, self.features))
81
+ mask = jnp.ones((self.batch_size, self.num_heads, self.q_len, self.q_len))
82
+ bias = jnp.ones((self.batch_size, self.num_heads, self.q_len, self.q_len))
83
+ return {
84
+ 'inputs_q': inputs_q,
85
+ 'mask': mask,
86
+ 'bias': bias,
87
+ 'deterministic': self.deterministic
88
+ }
89
+
90
+
91
+ class AttentionTest(parameterized.TestCase):
92
+
93
+ def test_dot_product_attention_shape(self):
94
+ # This test only checks for shape but tries to make sure all code paths are
95
+ # reached.
96
+ dropout_rng = random.PRNGKey(0)
97
+ batch_size, num_heads, q_len, kv_len, qk_depth, v_depth = 1, 2, 3, 4, 5, 6
98
+
99
+ query = jnp.ones((batch_size, q_len, num_heads, qk_depth))
100
+ key = jnp.ones((batch_size, kv_len, num_heads, qk_depth))
101
+ value = jnp.ones((batch_size, kv_len, num_heads, v_depth))
102
+ bias = jnp.ones((batch_size, num_heads, q_len, kv_len))
103
+
104
+ args = dict(
105
+ query=query,
106
+ key=key,
107
+ value=value,
108
+ bias=bias,
109
+ dropout_rng=dropout_rng,
110
+ dropout_rate=0.5,
111
+ deterministic=False,
112
+ )
113
+
114
+ output = layers.dot_product_attention(**args)
115
+ self.assertEqual(output.shape, (batch_size, q_len, num_heads, v_depth))
116
+
117
+ def test_make_attention_mask_multiply_pairwise_fn(self):
118
+ decoder_target_tokens = jnp.array([[7, 0, 0], [8, 5, 0]])
119
+ attention_mask = layers.make_attention_mask(
120
+ decoder_target_tokens > 0, decoder_target_tokens > 0, dtype=jnp.int32)
121
+ expected0 = jnp.array([[1, 0, 0], [0, 0, 0], [0, 0, 0]])
122
+ expected1 = jnp.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]])
123
+ self.assertEqual(attention_mask.shape, (2, 1, 3, 3))
124
+ np.testing.assert_array_equal(attention_mask[0, 0], expected0)
125
+ np.testing.assert_array_equal(attention_mask[1, 0], expected1)
126
+
127
+ def test_make_attention_mask_equal_pairwise_fn(self):
128
+ segment_ids = jnp.array([[1, 1, 2, 2, 2, 0], [1, 1, 1, 2, 0, 0]])
129
+ attention_mask = layers.make_attention_mask(
130
+ segment_ids, segment_ids, pairwise_fn=jnp.equal, dtype=jnp.int32)
131
+ # Padding is not treated in a special way. So they need to be zeroed out
132
+ # separately.
133
+ expected0 = jnp.array([[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0],
134
+ [0, 0, 1, 1, 1, 0], [0, 0, 1, 1, 1, 0],
135
+ [0, 0, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]])
136
+ expected1 = jnp.array([[1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0],
137
+ [1, 1, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0],
138
+ [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1]])
139
+ self.assertEqual(attention_mask.shape, (2, 1, 6, 6))
140
+ np.testing.assert_array_equal(attention_mask[0, 0], expected0)
141
+ np.testing.assert_array_equal(attention_mask[1, 0], expected1)
142
+
143
+ def test_make_causal_mask_with_padding(self):
144
+ x = jnp.array([[7, 0, 0], [8, 5, 0]])
145
+ y = layers.make_causal_mask(x)
146
+ self.assertEqual(y.shape, (2, 1, 3, 3))
147
+ # Padding is not treated in a special way. So they need to be zeroed out
148
+ # separately.
149
+ expected_y = jnp.array([[[1., 0., 0.], [1., 1., 0.], [1., 1., 1.]]],
150
+ jnp.float32)
151
+ np.testing.assert_allclose(y[0], expected_y)
152
+ np.testing.assert_allclose(y[1], expected_y)
153
+
154
+ def test_make_causal_mask_extra_batch_dims(self):
155
+ x = jnp.ones((3, 3, 5))
156
+ y = layers.make_causal_mask(x, extra_batch_dims=2)
157
+ self.assertEqual(y.shape, (1, 1, 3, 3, 1, 5, 5))
158
+
159
+ def test_make_causal_mask(self):
160
+ x = jnp.ones((1, 3))
161
+ y = layers.make_causal_mask(x)
162
+ self.assertEqual(y.shape, (1, 1, 3, 3))
163
+ expected_y = jnp.array([[[[1., 0., 0.], [1., 1., 0.], [1., 1., 1.]]]],
164
+ jnp.float32)
165
+ np.testing.assert_allclose(y, expected_y)
166
+
167
+ def test_combine_masks(self):
168
+ masks = [
169
+ jnp.array([0, 1, 0, 1], jnp.float32), None,
170
+ jnp.array([1, 1, 1, 1], jnp.float32),
171
+ jnp.array([1, 1, 1, 0], jnp.float32)
172
+ ]
173
+ y = layers.combine_masks(*masks)
174
+ np.testing.assert_allclose(y, jnp.array([0, 1, 0, 0], jnp.float32))
175
+
176
+ def test_combine_biases(self):
177
+ masks = [
178
+ jnp.array([0, 1, 0, 1], jnp.float32), None,
179
+ jnp.array([0, 1, 1, 1], jnp.float32),
180
+ jnp.array([0, 1, 1, 0], jnp.float32)
181
+ ]
182
+ y = layers.combine_biases(*masks)
183
+ np.testing.assert_allclose(y, jnp.array([0, 3, 2, 2], jnp.float32))
184
+
185
+ def test_make_decoder_mask_lm_unpacked(self):
186
+ decoder_target_tokens = jnp.array([6, 7, 3, 0])
187
+ mask = layers.make_decoder_mask(
188
+ decoder_target_tokens=decoder_target_tokens, dtype=jnp.float32)
189
+ expected_mask = jnp.array([[[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0],
190
+ [0, 0, 0, 0]]])
191
+ np.testing.assert_array_equal(mask, expected_mask)
192
+
193
+ def test_make_decoder_mask_lm_packed(self):
194
+ decoder_target_tokens = jnp.array([[6, 7, 3, 4, 5, 0]])
195
+ decoder_segment_ids = jnp.array([[1, 1, 1, 2, 2, 0]])
196
+ mask = layers.make_decoder_mask(
197
+ decoder_target_tokens=decoder_target_tokens,
198
+ dtype=jnp.float32,
199
+ decoder_segment_ids=decoder_segment_ids)
200
+ expected_mask = jnp.array([[[[1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0],
201
+ [1, 1, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0],
202
+ [0, 0, 0, 1, 1, 0], [0, 0, 0, 0, 0, 0]]]])
203
+ np.testing.assert_array_equal(mask, expected_mask)
204
+
205
+ def test_make_decoder_mask_prefix_lm_unpacked(self):
206
+ decoder_target_tokens = jnp.array([[5, 6, 7, 3, 4, 0]])
207
+ decoder_causal_attention = jnp.array([[1, 1, 1, 0, 0, 0]])
208
+ mask = layers.make_decoder_mask(
209
+ decoder_target_tokens=decoder_target_tokens,
210
+ dtype=jnp.float32,
211
+ decoder_causal_attention=decoder_causal_attention)
212
+ expected_mask = jnp.array(
213
+ [[[[1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0],
214
+ [1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0]]]],
215
+ dtype=jnp.float32)
216
+ np.testing.assert_array_equal(mask, expected_mask)
217
+
218
+ def test_make_decoder_mask_prefix_lm_packed(self):
219
+ decoder_target_tokens = jnp.array([[5, 6, 7, 8, 3, 4, 0]])
220
+ decoder_segment_ids = jnp.array([[1, 1, 1, 2, 2, 2, 0]])
221
+ decoder_causal_attention = jnp.array([[1, 1, 0, 1, 1, 0, 0]])
222
+ mask = layers.make_decoder_mask(
223
+ decoder_target_tokens=decoder_target_tokens,
224
+ dtype=jnp.float32,
225
+ decoder_causal_attention=decoder_causal_attention,
226
+ decoder_segment_ids=decoder_segment_ids)
227
+ expected_mask = jnp.array([[[[1, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0],
228
+ [1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 1, 1, 0, 0],
229
+ [0, 0, 0, 1, 1, 0, 0], [0, 0, 0, 1, 1, 1, 0],
230
+ [0, 0, 0, 0, 0, 0, 0]]]])
231
+ np.testing.assert_array_equal(mask, expected_mask)
232
+
233
+ def test_make_decoder_mask_prefix_lm_unpacked_multiple_elements(self):
234
+ decoder_target_tokens = jnp.array([[6, 7, 3, 0], [4, 5, 0, 0]])
235
+ decoder_causal_attention = jnp.array([[1, 1, 0, 0], [1, 0, 0, 0]])
236
+ mask = layers.make_decoder_mask(
237
+ decoder_target_tokens=decoder_target_tokens,
238
+ dtype=jnp.float32,
239
+ decoder_causal_attention=decoder_causal_attention)
240
+ expected_mask0 = jnp.array([[1, 1, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0],
241
+ [0, 0, 0, 0]])
242
+ expected_mask1 = jnp.array([[1, 0, 0, 0], [1, 1, 0, 0], [0, 0, 0, 0],
243
+ [0, 0, 0, 0]])
244
+ self.assertEqual(mask.shape, (2, 1, 4, 4))
245
+ np.testing.assert_array_equal(mask[0, 0], expected_mask0)
246
+ np.testing.assert_array_equal(mask[1, 0], expected_mask1)
247
+
248
+ def test_make_decoder_mask_composite_causal_attention(self):
249
+ decoder_target_tokens = jnp.array([[6, 7, 3, 4, 8, 9, 0]])
250
+ decoder_causal_attention = jnp.array([[1, 1, 0, 0, 1, 1, 0]])
251
+ mask = layers.make_decoder_mask(
252
+ decoder_target_tokens=decoder_target_tokens,
253
+ dtype=jnp.float32,
254
+ decoder_causal_attention=decoder_causal_attention)
255
+ expected_mask0 = jnp.array([[1, 1, 0, 0, 1, 1, 0], [1, 1, 0, 0, 1, 1, 0],
256
+ [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0],
257
+ [1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0],
258
+ [0, 0, 0, 0, 0, 0, 0]])
259
+
260
+ self.assertEqual(mask.shape, (1, 1, 7, 7))
261
+ np.testing.assert_array_equal(mask[0, 0], expected_mask0)
262
+
263
+ def test_make_decoder_mask_composite_causal_attention_packed(self):
264
+ decoder_target_tokens = jnp.array([[6, 7, 3, 4, 8, 9, 2, 3, 4]])
265
+ decoder_segment_ids = jnp.array([[1, 1, 1, 1, 1, 1, 2, 2, 2]])
266
+ decoder_causal_attention = jnp.array([[1, 1, 0, 0, 1, 1, 1, 1, 0]])
267
+ mask = layers.make_decoder_mask(
268
+ decoder_target_tokens=decoder_target_tokens,
269
+ dtype=jnp.float32,
270
+ decoder_causal_attention=decoder_causal_attention,
271
+ decoder_segment_ids=decoder_segment_ids)
272
+ expected_mask0 = jnp.array([[1, 1, 0, 0, 1, 1, 0, 0, 0],
273
+ [1, 1, 0, 0, 1, 1, 0, 0, 0],
274
+ [1, 1, 1, 0, 0, 0, 0, 0, 0],
275
+ [1, 1, 1, 1, 0, 0, 0, 0, 0],
276
+ [1, 1, 1, 1, 1, 1, 0, 0, 0],
277
+ [1, 1, 1, 1, 1, 1, 0, 0, 0],
278
+ [0, 0, 0, 0, 0, 0, 1, 1, 0],
279
+ [0, 0, 0, 0, 0, 0, 1, 1, 0],
280
+ [0, 0, 0, 0, 0, 0, 1, 1, 1]])
281
+
282
+ self.assertEqual(mask.shape, (1, 1, 9, 9))
283
+ np.testing.assert_array_equal(mask[0, 0], expected_mask0)
284
+
285
+ @parameterized.parameters({'f': 20}, {'f': 22})
286
+ def test_multihead_dot_product_attention(self, f):
287
+ # b: batch, f: emb_dim, q: q_len, k: kv_len, h: num_head, d: head_dim
288
+ b, q, h, d, k = 2, 3, 4, 5, 6
289
+
290
+ base_args = SelfAttentionArgs(num_heads=h, head_dim=d, dropout_rate=0)
291
+ args = base_args.init_args()
292
+
293
+ np.random.seed(0)
294
+ inputs_q = np.random.randn(b, q, f)
295
+ inputs_kv = np.random.randn(b, k, f)
296
+
297
+ # Projection: [b, q, f] -> [b, q, h, d]
298
+ # So the kernels have to be [f, h, d]
299
+ query_kernel = np.random.randn(f, h, d)
300
+ key_kernel = np.random.randn(f, h, d)
301
+ value_kernel = np.random.randn(f, h, d)
302
+ # `out` calculation: [b, q, h, d] -> [b, q, f]
303
+ # So kernel has to be [h, d, f]
304
+ out_kernel = np.random.randn(h, d, f)
305
+
306
+ params = {
307
+ 'query': {
308
+ 'kernel': query_kernel.reshape(f, -1)
309
+ },
310
+ 'key': {
311
+ 'kernel': key_kernel.reshape(f, -1)
312
+ },
313
+ 'value': {
314
+ 'kernel': value_kernel.reshape(f, -1)
315
+ },
316
+ 'out': {
317
+ 'kernel': out_kernel.reshape(-1, f)
318
+ }
319
+ }
320
+ y = layers.MultiHeadDotProductAttention(**args).apply(
321
+ {'params': freeze(params)}, inputs_q, inputs_kv)
322
+
323
+ query = np.einsum('bqf,fhd->bqhd', inputs_q, query_kernel)
324
+ key = np.einsum('bkf,fhd->bkhd', inputs_kv, key_kernel)
325
+ value = np.einsum('bkf,fhd->bkhd', inputs_kv, value_kernel)
326
+ logits = np.einsum('bqhd,bkhd->bhqk', query, key)
327
+ weights = nn.softmax(logits, axis=-1)
328
+ combined_value = np.einsum('bhqk,bkhd->bqhd', weights, value)
329
+ y_expected = np.einsum('bqhd,hdf->bqf', combined_value, out_kernel)
330
+ np.testing.assert_allclose(y, y_expected, rtol=1e-5, atol=1e-5)
331
+
332
+ def test_multihead_dot_product_attention_caching(self):
333
+ # b: batch, f: qkv_features, k: kv_len, h: num_head, d: head_dim
334
+ b, h, d, k = 2, 3, 4, 5
335
+ f = h * d
336
+
337
+ base_args = SelfAttentionArgs(num_heads=h, head_dim=d, dropout_rate=0)
338
+ args = base_args.init_args()
339
+
340
+ cache = {
341
+ 'cached_key': np.zeros((b, h, d, k)),
342
+ 'cached_value': np.zeros((b, h, d, k)),
343
+ 'cache_index': np.array(0)
344
+ }
345
+ inputs_q = np.random.randn(b, 1, f)
346
+ inputs_kv = np.random.randn(b, 1, f)
347
+
348
+ # Mock dense general such that q, k, v projections are replaced by simple
349
+ # reshaping.
350
+ def mock_dense_general(self, x, **kwargs): # pylint: disable=unused-argument
351
+ return x.reshape(b, -1, h, d)
352
+
353
+ with mock.patch.object(
354
+ layers.DenseGeneral, '__call__', new=mock_dense_general):
355
+ _, mutated = layers.MultiHeadDotProductAttention(**args).apply(
356
+ {'cache': freeze(cache)},
357
+ inputs_q,
358
+ inputs_kv,
359
+ decode=True,
360
+ mutable=['cache'])
361
+ updated_cache = mutated['cache']
362
+
363
+ # Perform the same mocked projection to generate the expected cache.
364
+ # (key|value): [b, 1, h, d]
365
+ key = mock_dense_general(None, inputs_kv)
366
+ value = mock_dense_general(None, inputs_kv)
367
+
368
+ # cached_(key|value): [b, h, d, k]
369
+ cache['cached_key'][:, :, :, 0] = key[:, 0, :, :]
370
+ cache['cached_value'][:, :, :, 0] = value[:, 0, :, :]
371
+ cache['cache_index'] = np.array(1)
372
+ for name, array in cache.items():
373
+ np.testing.assert_allclose(array, updated_cache[name])
374
+
375
+ def test_dot_product_attention(self):
376
+ # b: batch, f: emb_dim, q: q_len, k: kv_len, h: num_head, d: head_dim
377
+ b, q, h, d, k = 2, 3, 4, 5, 6
378
+ np.random.seed(0)
379
+ query = np.random.randn(b, q, h, d)
380
+ key = np.random.randn(b, k, h, d)
381
+ value = np.random.randn(b, k, h, d)
382
+ bias = np.random.randn(b, h, q, k)
383
+ attn_out = layers.dot_product_attention(query, key, value, bias=bias)
384
+ logits = np.einsum('bqhd,bkhd->bhqk', query, key)
385
+ weights = jax.nn.softmax(logits + bias, axis=-1)
386
+ expected = np.einsum('bhqk,bkhd->bqhd', weights, value)
387
+ np.testing.assert_allclose(attn_out, expected, atol=1e-6)
388
+
389
+
390
+ class EmbeddingTest(parameterized.TestCase):
391
+
392
+ def test_embedder_raises_exception_for_incorrect_input_type(self):
393
+ """Tests that inputs are integers and that an exception is raised if not."""
394
+ embed = layers.Embed(num_embeddings=10, features=5)
395
+ inputs = np.expand_dims(np.arange(5, dtype=np.int64), 1)
396
+ variables = embed.init(jax.random.PRNGKey(0), inputs)
397
+ bad_inputs = inputs.astype(np.float32)
398
+ with self.assertRaisesRegex(
399
+ ValueError, 'Input type must be an integer or unsigned integer.'):
400
+ _ = embed.apply(variables, bad_inputs)
401
+
402
+ @parameterized.named_parameters(
403
+ {
404
+ 'testcase_name': 'with_ones',
405
+ 'init_fn': jax.nn.initializers.ones,
406
+ 'num_embeddings': 10,
407
+ 'features': 5,
408
+ 'matrix_sum': 5 * 10,
409
+ }, {
410
+ 'testcase_name': 'with_zeros',
411
+ 'init_fn': jax.nn.initializers.zeros,
412
+ 'num_embeddings': 10,
413
+ 'features': 5,
414
+ 'matrix_sum': 0,
415
+ })
416
+ def test_embedding_initializes_correctly(self, init_fn, num_embeddings,
417
+ features, matrix_sum):
418
+ """Tests if the Embed class initializes with the requested initializer."""
419
+ embed = layers.Embed(
420
+ num_embeddings=num_embeddings,
421
+ features=features,
422
+ embedding_init=init_fn)
423
+ inputs = np.expand_dims(np.arange(5, dtype=np.int64), 1)
424
+ variables = embed.init(jax.random.PRNGKey(0), inputs)
425
+ embedding_matrix = variables['params']['embedding']
426
+ self.assertEqual(int(np.sum(embedding_matrix)), matrix_sum)
427
+
428
+ def test_embedding_matrix_shape(self):
429
+ """Tests that the embedding matrix has the right shape."""
430
+ num_embeddings = 10
431
+ features = 5
432
+ embed = layers.Embed(num_embeddings=num_embeddings, features=features)
433
+ inputs = np.expand_dims(np.arange(features, dtype=np.int64), 1)
434
+ variables = embed.init(jax.random.PRNGKey(0), inputs)
435
+ embedding_matrix = variables['params']['embedding']
436
+ self.assertEqual((num_embeddings, features), embedding_matrix.shape)
437
+
438
+ def test_embedding_attend(self):
439
+ """Tests that attending with ones returns sum of embedding vectors."""
440
+ features = 5
441
+ embed = layers.Embed(num_embeddings=10, features=features)
442
+ inputs = np.array([[1]], dtype=np.int64)
443
+ variables = embed.init(jax.random.PRNGKey(0), inputs)
444
+ query = np.ones(features, dtype=np.float32)
445
+ result = embed.apply(variables, query, method=embed.attend)
446
+ expected = np.sum(variables['params']['embedding'], -1)
447
+ np.testing.assert_array_almost_equal(result, expected)
448
+
449
+
450
+ class DenseTest(parameterized.TestCase):
451
+
452
+ def test_dense_general_no_bias(self):
453
+ rng = random.PRNGKey(0)
454
+ x = jnp.ones((1, 3))
455
+ model = layers.DenseGeneral(
456
+ features=4,
457
+ kernel_init=initializers.ones,
458
+ )
459
+ y, _ = model.init_with_output(rng, x)
460
+ self.assertEqual(y.shape, (1, 4))
461
+ np.testing.assert_allclose(y, np.full((1, 4), 3.))
462
+
463
+ def test_dense_general_two_features(self):
464
+ rng = random.PRNGKey(0)
465
+ x = jnp.ones((1, 3))
466
+ model = layers.DenseGeneral(
467
+ features=(2, 2),
468
+ kernel_init=initializers.ones,
469
+ )
470
+ y, _ = model.init_with_output(rng, x)
471
+ # We transform the last input dimension to two output dimensions (2, 2).
472
+ np.testing.assert_allclose(y, np.full((1, 2, 2), 3.))
473
+
474
+ def test_dense_general_two_axes(self):
475
+ rng = random.PRNGKey(0)
476
+ x = jnp.ones((1, 2, 2))
477
+ model = layers.DenseGeneral(
478
+ features=3,
479
+ axis=(-2, 2), # Note: this is the same as (1, 2).
480
+ kernel_init=initializers.ones,
481
+ )
482
+ y, _ = model.init_with_output(rng, x)
483
+ # We transform the last two input dimensions (2, 2) to one output dimension.
484
+ np.testing.assert_allclose(y, np.full((1, 3), 4.))
485
+
486
+ def test_mlp_same_out_dim(self):
487
+ module = layers.MlpBlock(
488
+ intermediate_dim=4,
489
+ activations=('relu',),
490
+ kernel_init=nn.initializers.xavier_uniform(),
491
+ dtype=jnp.float32,
492
+ )
493
+ inputs = np.array(
494
+ [
495
+ # Batch 1.
496
+ [[1, 1], [1, 1], [1, 2]],
497
+ # Batch 2.
498
+ [[2, 2], [3, 1], [2, 2]],
499
+ ],
500
+ dtype=np.float32)
501
+ params = module.init(random.PRNGKey(0), inputs, deterministic=True)
502
+ self.assertEqual(
503
+ jax.tree_map(lambda a: a.tolist(), params), {
504
+ 'params': {
505
+ 'wi': {
506
+ 'kernel': [[
507
+ -0.8675811290740967, 0.08417510986328125,
508
+ 0.022586345672607422, -0.9124102592468262
509
+ ],
510
+ [
511
+ -0.19464373588562012, 0.49809837341308594,
512
+ 0.7808468341827393, 0.9267289638519287
513
+ ]],
514
+ },
515
+ 'wo': {
516
+ 'kernel': [[0.01154780387878418, 0.1397249698638916],
517
+ [0.974980354309082, 0.5903260707855225],
518
+ [-0.05997943878173828, 0.616570234298706],
519
+ [0.2934272289276123, 0.8181164264678955]],
520
+ },
521
+ },
522
+ 'params_axes': {
523
+ 'wi': {
524
+ 'kernel_axes': AxisMetadata(names=('embed', 'mlp')),
525
+ },
526
+ 'wo': {
527
+ 'kernel_axes': AxisMetadata(names=('mlp', 'embed')),
528
+ },
529
+ },
530
+ })
531
+ result = module.apply(params, inputs, deterministic=True)
532
+ np.testing.assert_allclose(
533
+ result.tolist(),
534
+ [[[0.5237172245979309, 0.8508185744285583],
535
+ [0.5237172245979309, 0.8508185744285583],
536
+ [1.2344461679458618, 2.3844780921936035]],
537
+ [[1.0474344491958618, 1.7016371488571167],
538
+ [0.6809444427490234, 0.9663378596305847],
539
+ [1.0474344491958618, 1.7016371488571167]]],
540
+ rtol=1e-6,
541
+ )
542
+
543
+
544
+ if __name__ == '__main__':
545
+ absltest.main()
mt3/metrics.py ADDED
@@ -0,0 +1,392 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Transcription metrics."""
16
+
17
+ import collections
18
+ import copy
19
+ import functools
20
+ from typing import Any, Iterable, Mapping, Optional, Sequence
21
+
22
+ import mir_eval
23
+
24
+ from mt3 import event_codec
25
+ from mt3 import metrics_utils
26
+ from mt3 import note_sequences
27
+ from mt3 import spectrograms
28
+ from mt3 import summaries
29
+ from mt3 import vocabularies
30
+
31
+ import note_seq
32
+ import numpy as np
33
+ import seqio
34
+
35
+
36
+ def _program_aware_note_scores(
37
+ ref_ns: note_seq.NoteSequence,
38
+ est_ns: note_seq.NoteSequence,
39
+ granularity_type: str
40
+ ) -> Mapping[str, float]:
41
+ """Compute precision/recall/F1 for notes taking program into account.
42
+
43
+ For non-drum tracks, uses onsets and offsets. For drum tracks, uses onsets
44
+ only. Applies MIDI program map of specified granularity type.
45
+
46
+ Args:
47
+ ref_ns: Reference NoteSequence with ground truth labels.
48
+ est_ns: Estimated NoteSequence.
49
+ granularity_type: String key in vocabularies.PROGRAM_GRANULARITIES dict.
50
+
51
+ Returns:
52
+ A dictionary containing precision, recall, and F1 score.
53
+ """
54
+ program_map_fn = vocabularies.PROGRAM_GRANULARITIES[
55
+ granularity_type].program_map_fn
56
+
57
+ ref_ns = copy.deepcopy(ref_ns)
58
+ for note in ref_ns.notes:
59
+ if not note.is_drum:
60
+ note.program = program_map_fn(note.program)
61
+
62
+ est_ns = copy.deepcopy(est_ns)
63
+ for note in est_ns.notes:
64
+ if not note.is_drum:
65
+ note.program = program_map_fn(note.program)
66
+
67
+ program_and_is_drum_tuples = (
68
+ set((note.program, note.is_drum) for note in ref_ns.notes) |
69
+ set((note.program, note.is_drum) for note in est_ns.notes)
70
+ )
71
+
72
+ drum_precision_sum = 0.0
73
+ drum_precision_count = 0
74
+ drum_recall_sum = 0.0
75
+ drum_recall_count = 0
76
+
77
+ nondrum_precision_sum = 0.0
78
+ nondrum_precision_count = 0
79
+ nondrum_recall_sum = 0.0
80
+ nondrum_recall_count = 0
81
+
82
+ for program, is_drum in program_and_is_drum_tuples:
83
+ est_track = note_sequences.extract_track(est_ns, program, is_drum)
84
+ ref_track = note_sequences.extract_track(ref_ns, program, is_drum)
85
+
86
+ est_intervals, est_pitches, unused_est_velocities = (
87
+ note_seq.sequences_lib.sequence_to_valued_intervals(est_track))
88
+ ref_intervals, ref_pitches, unused_ref_velocities = (
89
+ note_seq.sequences_lib.sequence_to_valued_intervals(ref_track))
90
+
91
+ args = {
92
+ 'ref_intervals': ref_intervals, 'ref_pitches': ref_pitches,
93
+ 'est_intervals': est_intervals, 'est_pitches': est_pitches
94
+ }
95
+ if is_drum:
96
+ args['offset_ratio'] = None
97
+
98
+ precision, recall, unused_f_measure, unused_avg_overlap_ratio = (
99
+ mir_eval.transcription.precision_recall_f1_overlap(**args))
100
+
101
+ if is_drum:
102
+ drum_precision_sum += precision * len(est_intervals)
103
+ drum_precision_count += len(est_intervals)
104
+ drum_recall_sum += recall * len(ref_intervals)
105
+ drum_recall_count += len(ref_intervals)
106
+ else:
107
+ nondrum_precision_sum += precision * len(est_intervals)
108
+ nondrum_precision_count += len(est_intervals)
109
+ nondrum_recall_sum += recall * len(ref_intervals)
110
+ nondrum_recall_count += len(ref_intervals)
111
+
112
+ precision_sum = drum_precision_sum + nondrum_precision_sum
113
+ precision_count = drum_precision_count + nondrum_precision_count
114
+ recall_sum = drum_recall_sum + nondrum_recall_sum
115
+ recall_count = drum_recall_count + nondrum_recall_count
116
+
117
+ precision = (precision_sum / precision_count) if precision_count else 0
118
+ recall = (recall_sum / recall_count) if recall_count else 0
119
+ f_measure = mir_eval.util.f_measure(precision, recall)
120
+
121
+ drum_precision = ((drum_precision_sum / drum_precision_count)
122
+ if drum_precision_count else 0)
123
+ drum_recall = ((drum_recall_sum / drum_recall_count)
124
+ if drum_recall_count else 0)
125
+ drum_f_measure = mir_eval.util.f_measure(drum_precision, drum_recall)
126
+
127
+ nondrum_precision = ((nondrum_precision_sum / nondrum_precision_count)
128
+ if nondrum_precision_count else 0)
129
+ nondrum_recall = ((nondrum_recall_sum / nondrum_recall_count)
130
+ if nondrum_recall_count else 0)
131
+ nondrum_f_measure = mir_eval.util.f_measure(nondrum_precision, nondrum_recall)
132
+
133
+ return {
134
+ f'Onset + offset + program precision ({granularity_type})': precision,
135
+ f'Onset + offset + program recall ({granularity_type})': recall,
136
+ f'Onset + offset + program F1 ({granularity_type})': f_measure,
137
+ f'Drum onset precision ({granularity_type})': drum_precision,
138
+ f'Drum onset recall ({granularity_type})': drum_recall,
139
+ f'Drum onset F1 ({granularity_type})': drum_f_measure,
140
+ f'Nondrum onset + offset + program precision ({granularity_type})':
141
+ nondrum_precision,
142
+ f'Nondrum onset + offset + program recall ({granularity_type})':
143
+ nondrum_recall,
144
+ f'Nondrum onset + offset + program F1 ({granularity_type})':
145
+ nondrum_f_measure
146
+ }
147
+
148
+
149
+ def _note_onset_tolerance_sweep(
150
+ ref_ns: note_seq.NoteSequence, est_ns: note_seq.NoteSequence,
151
+ tolerances: Iterable[float] = (0.01, 0.02, 0.05, 0.1, 0.2, 0.5)
152
+ ) -> Mapping[str, float]:
153
+ """Compute note precision/recall/F1 across a range of tolerances."""
154
+ est_intervals, est_pitches, unused_est_velocities = (
155
+ note_seq.sequences_lib.sequence_to_valued_intervals(est_ns))
156
+ ref_intervals, ref_pitches, unused_ref_velocities = (
157
+ note_seq.sequences_lib.sequence_to_valued_intervals(ref_ns))
158
+
159
+ scores = {}
160
+
161
+ for tol in tolerances:
162
+ precision, recall, f_measure, _ = (
163
+ mir_eval.transcription.precision_recall_f1_overlap(
164
+ ref_intervals=ref_intervals, ref_pitches=ref_pitches,
165
+ est_intervals=est_intervals, est_pitches=est_pitches,
166
+ onset_tolerance=tol, offset_min_tolerance=tol))
167
+
168
+ scores[f'Onset + offset precision ({tol})'] = precision
169
+ scores[f'Onset + offset recall ({tol})'] = recall
170
+ scores[f'Onset + offset F1 ({tol})'] = f_measure
171
+
172
+ return scores
173
+
174
+
175
+ def transcription_metrics(
176
+ targets: Sequence[Mapping[str, Any]],
177
+ predictions: Sequence[Mapping[str, Any]],
178
+ codec: event_codec.Codec,
179
+ spectrogram_config: spectrograms.SpectrogramConfig,
180
+ onsets_only: bool,
181
+ use_ties: bool,
182
+ track_specs: Optional[Sequence[note_sequences.TrackSpec]] = None,
183
+ num_summary_examples: int = 5,
184
+ frame_fps: float = 62.5,
185
+ frame_velocity_threshold: int = 30,
186
+ ) -> Mapping[str, seqio.metrics.MetricValue]:
187
+ """Compute mir_eval transcription metrics."""
188
+ if onsets_only and use_ties:
189
+ raise ValueError('Ties not compatible with onset-only transcription.')
190
+ if onsets_only:
191
+ encoding_spec = note_sequences.NoteOnsetEncodingSpec
192
+ elif not use_ties:
193
+ encoding_spec = note_sequences.NoteEncodingSpec
194
+ else:
195
+ encoding_spec = note_sequences.NoteEncodingWithTiesSpec
196
+
197
+ # The first target for each full example contains the NoteSequence; just
198
+ # organize by ID.
199
+ full_targets = {}
200
+ for target in targets:
201
+ if target['ref_ns']:
202
+ full_targets[target['unique_id']] = {'ref_ns': target['ref_ns']}
203
+
204
+ # Gather all predictions for the same ID and concatenate them in time order,
205
+ # to construct full-length predictions.
206
+ full_predictions = metrics_utils.combine_predictions_by_id(
207
+ predictions=predictions,
208
+ combine_predictions_fn=functools.partial(
209
+ metrics_utils.event_predictions_to_ns,
210
+ codec=codec,
211
+ encoding_spec=encoding_spec))
212
+
213
+ assert sorted(full_targets.keys()) == sorted(full_predictions.keys())
214
+
215
+ full_target_prediction_pairs = [
216
+ (full_targets[id], full_predictions[id])
217
+ for id in sorted(full_targets.keys())
218
+ ]
219
+
220
+ scores = collections.defaultdict(list)
221
+ all_track_pianorolls = collections.defaultdict(list)
222
+ for target, prediction in full_target_prediction_pairs:
223
+ scores['Invalid events'].append(prediction['est_invalid_events'])
224
+ scores['Dropped events'].append(prediction['est_dropped_events'])
225
+
226
+ def remove_drums(ns):
227
+ ns_drumless = note_seq.NoteSequence()
228
+ ns_drumless.CopyFrom(ns)
229
+ del ns_drumless.notes[:]
230
+ ns_drumless.notes.extend([note for note in ns.notes if not note.is_drum])
231
+ return ns_drumless
232
+
233
+ est_ns_drumless = remove_drums(prediction['est_ns'])
234
+ ref_ns_drumless = remove_drums(target['ref_ns'])
235
+
236
+ # Whether or not there are separate tracks, compute metrics for the full
237
+ # NoteSequence minus drums.
238
+ est_tracks = [est_ns_drumless]
239
+ ref_tracks = [ref_ns_drumless]
240
+ use_track_offsets = [not onsets_only]
241
+ use_track_velocities = [not onsets_only]
242
+ track_instrument_names = ['']
243
+
244
+ if track_specs is not None:
245
+ # Compute transcription metrics separately for each track.
246
+ for spec in track_specs:
247
+ est_tracks.append(note_sequences.extract_track(
248
+ prediction['est_ns'], spec.program, spec.is_drum))
249
+ ref_tracks.append(note_sequences.extract_track(
250
+ target['ref_ns'], spec.program, spec.is_drum))
251
+ use_track_offsets.append(not onsets_only and not spec.is_drum)
252
+ use_track_velocities.append(not onsets_only)
253
+ track_instrument_names.append(spec.name)
254
+
255
+ for est_ns, ref_ns, use_offsets, use_velocities, instrument_name in zip(
256
+ est_tracks, ref_tracks, use_track_offsets, use_track_velocities,
257
+ track_instrument_names):
258
+ track_scores = {}
259
+
260
+ est_intervals, est_pitches, est_velocities = (
261
+ note_seq.sequences_lib.sequence_to_valued_intervals(est_ns))
262
+
263
+ ref_intervals, ref_pitches, ref_velocities = (
264
+ note_seq.sequences_lib.sequence_to_valued_intervals(ref_ns))
265
+
266
+ # Precision / recall / F1 using onsets (and pitches) only.
267
+ precision, recall, f_measure, avg_overlap_ratio = (
268
+ mir_eval.transcription.precision_recall_f1_overlap(
269
+ ref_intervals=ref_intervals,
270
+ ref_pitches=ref_pitches,
271
+ est_intervals=est_intervals,
272
+ est_pitches=est_pitches,
273
+ offset_ratio=None))
274
+ del avg_overlap_ratio
275
+ track_scores['Onset precision'] = precision
276
+ track_scores['Onset recall'] = recall
277
+ track_scores['Onset F1'] = f_measure
278
+
279
+ if use_offsets:
280
+ # Precision / recall / F1 using onsets and offsets.
281
+ precision, recall, f_measure, avg_overlap_ratio = (
282
+ mir_eval.transcription.precision_recall_f1_overlap(
283
+ ref_intervals=ref_intervals,
284
+ ref_pitches=ref_pitches,
285
+ est_intervals=est_intervals,
286
+ est_pitches=est_pitches))
287
+ del avg_overlap_ratio
288
+ track_scores['Onset + offset precision'] = precision
289
+ track_scores['Onset + offset recall'] = recall
290
+ track_scores['Onset + offset F1'] = f_measure
291
+
292
+ if use_velocities:
293
+ # Precision / recall / F1 using onsets and velocities (no offsets).
294
+ precision, recall, f_measure, avg_overlap_ratio = (
295
+ mir_eval.transcription_velocity.precision_recall_f1_overlap(
296
+ ref_intervals=ref_intervals,
297
+ ref_pitches=ref_pitches,
298
+ ref_velocities=ref_velocities,
299
+ est_intervals=est_intervals,
300
+ est_pitches=est_pitches,
301
+ est_velocities=est_velocities,
302
+ offset_ratio=None))
303
+ track_scores['Onset + velocity precision'] = precision
304
+ track_scores['Onset + velocity recall'] = recall
305
+ track_scores['Onset + velocity F1'] = f_measure
306
+
307
+ if use_offsets and use_velocities:
308
+ # Precision / recall / F1 using onsets, offsets, and velocities.
309
+ precision, recall, f_measure, avg_overlap_ratio = (
310
+ mir_eval.transcription_velocity.precision_recall_f1_overlap(
311
+ ref_intervals=ref_intervals,
312
+ ref_pitches=ref_pitches,
313
+ ref_velocities=ref_velocities,
314
+ est_intervals=est_intervals,
315
+ est_pitches=est_pitches,
316
+ est_velocities=est_velocities))
317
+ track_scores['Onset + offset + velocity precision'] = precision
318
+ track_scores['Onset + offset + velocity recall'] = recall
319
+ track_scores['Onset + offset + velocity F1'] = f_measure
320
+
321
+ # Calculate framewise metrics.
322
+ is_drum = all([n.is_drum for n in ref_ns.notes])
323
+ ref_pr = metrics_utils.get_prettymidi_pianoroll(
324
+ ref_ns, frame_fps, is_drum=is_drum)
325
+ est_pr = metrics_utils.get_prettymidi_pianoroll(
326
+ est_ns, frame_fps, is_drum=is_drum)
327
+ all_track_pianorolls[instrument_name].append((est_pr, ref_pr))
328
+ frame_precision, frame_recall, frame_f1 = metrics_utils.frame_metrics(
329
+ ref_pr, est_pr, velocity_threshold=frame_velocity_threshold)
330
+ track_scores['Frame Precision'] = frame_precision
331
+ track_scores['Frame Recall'] = frame_recall
332
+ track_scores['Frame F1'] = frame_f1
333
+
334
+ for metric_name, metric_value in track_scores.items():
335
+ if instrument_name:
336
+ scores[f'{instrument_name}/{metric_name}'].append(metric_value)
337
+ else:
338
+ scores[metric_name].append(metric_value)
339
+
340
+ # Add program-aware note metrics for all program granularities.
341
+ # Note that this interacts with the training program granularity; in
342
+ # particular granularities *higher* than the training granularity are likely
343
+ # to have poor metrics.
344
+ for granularity_type in vocabularies.PROGRAM_GRANULARITIES:
345
+ for name, score in _program_aware_note_scores(
346
+ target['ref_ns'], prediction['est_ns'],
347
+ granularity_type=granularity_type).items():
348
+ scores[name].append(score)
349
+
350
+ # Add (non-program-aware) note metrics across a range of onset/offset
351
+ # tolerances.
352
+ for name, score in _note_onset_tolerance_sweep(
353
+ ref_ns=ref_ns_drumless, est_ns=est_ns_drumless).items():
354
+ scores[name].append(score)
355
+
356
+ mean_scores = {k: np.mean(v) for k, v in scores.items()}
357
+
358
+ score_histograms = {'%s (hist)' % k: seqio.metrics.Histogram(np.array(v))
359
+ for k, v in scores.items()}
360
+
361
+ # Pick several examples to summarize.
362
+ targets_to_summarize, predictions_to_summarize = zip(
363
+ *full_target_prediction_pairs[:num_summary_examples])
364
+
365
+ # Compute audio summaries.
366
+ audio_summaries = summaries.audio_summaries(
367
+ targets=targets_to_summarize,
368
+ predictions=predictions_to_summarize,
369
+ spectrogram_config=spectrogram_config)
370
+
371
+ # Compute transcription summaries.
372
+ transcription_summaries = summaries.transcription_summaries(
373
+ targets=targets_to_summarize,
374
+ predictions=predictions_to_summarize,
375
+ spectrogram_config=spectrogram_config,
376
+ ns_feature_suffix='ns',
377
+ track_specs=track_specs)
378
+
379
+ pianorolls_to_summarize = {
380
+ k: v[:num_summary_examples] for k, v in all_track_pianorolls.items()
381
+ }
382
+
383
+ prettymidi_pianoroll_summaries = summaries.prettymidi_pianoroll(
384
+ pianorolls_to_summarize, fps=frame_fps)
385
+
386
+ return {
387
+ **mean_scores,
388
+ **score_histograms,
389
+ **audio_summaries,
390
+ **transcription_summaries,
391
+ **prettymidi_pianoroll_summaries,
392
+ }
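Note on the aggregation in `_program_aware_note_scores` above: each (program, is_drum) track contributes its precision weighted by the number of estimated intervals and its recall weighted by the number of reference intervals before the weighted averages are combined into F1. Below is a minimal standalone sketch of that weighting; the per-track scores and interval counts are invented for illustration and do not come from the source.

# Illustrative sketch of the per-track weighting used in
# _program_aware_note_scores; scores and interval counts are invented.
import mir_eval

tracks = [
    # (precision, num_est_intervals, recall, num_ref_intervals)
    (0.9, 10, 0.8, 12),  # e.g. a non-drum track
    (0.5, 4, 0.6, 5),    # e.g. a drum track
]

precision_sum = sum(p * n_est for p, n_est, _, _ in tracks)
precision_count = sum(n_est for _, n_est, _, _ in tracks)
recall_sum = sum(r * n_ref for _, _, r, n_ref in tracks)
recall_count = sum(n_ref for _, _, _, n_ref in tracks)

precision = precision_sum / precision_count if precision_count else 0
recall = recall_sum / recall_count if recall_count else 0
print(precision, recall, mir_eval.util.f_measure(precision, recall))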
mt3/metrics_utils.py ADDED
@@ -0,0 +1,196 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utilities for transcription metrics."""
16
+
17
+ import collections
18
+ import functools
19
+
20
+ from typing import Any, Callable, Mapping, Optional, Sequence, Tuple, TypeVar
21
+
22
+ from mt3 import event_codec
23
+ from mt3 import note_sequences
24
+ from mt3 import run_length_encoding
25
+
26
+ import note_seq
27
+ import numpy as np
28
+ import pretty_midi
29
+ import sklearn
30
+
31
+ S = TypeVar('S')
32
+ T = TypeVar('T')
33
+
34
+ CombineExamplesFunctionType = Callable[[Sequence[Mapping[str, Any]]],
35
+ Mapping[str, Any]]
36
+
37
+
38
+ def _group_predictions_by_id(
39
+ predictions: Sequence[Mapping[str, T]]
40
+ ) -> Mapping[str, Sequence[T]]:
41
+ predictions_by_id = collections.defaultdict(list)
42
+ for pred in predictions:
43
+ predictions_by_id[pred['unique_id']].append(pred)
44
+ return predictions_by_id
45
+
46
+
47
+ def combine_predictions_by_id(
48
+ predictions: Sequence[Mapping[str, Any]],
49
+ combine_predictions_fn: CombineExamplesFunctionType
50
+ ) -> Mapping[str, Mapping[str, Any]]:
51
+ """Concatenate predicted examples, grouping by ID and sorting by time."""
52
+ predictions_by_id = _group_predictions_by_id(predictions)
53
+ return {
54
+ id: combine_predictions_fn(preds)
55
+ for id, preds in predictions_by_id.items()
56
+ }
57
+
58
+
59
+ def decode_and_combine_predictions(
60
+ predictions: Sequence[Mapping[str, Any]],
61
+ init_state_fn: Callable[[], S],
62
+ begin_segment_fn: Callable[[S], None],
63
+ decode_tokens_fn: Callable[[S, Sequence[int], int, Optional[int]],
64
+ Tuple[int, int]],
65
+ flush_state_fn: Callable[[S], T]
66
+ ) -> Tuple[T, int, int]:
67
+ """Decode and combine a sequence of predictions to a full result.
68
+
69
+ For time-based events, this usually means concatenation.
70
+
71
+ Args:
72
+ predictions: List of predictions, each of which is a dictionary containing
73
+ estimated tokens ('est_tokens') and start time ('start_time') fields.
74
+ init_state_fn: Function that takes no arguments and returns an initial
75
+ decoding state.
76
+ begin_segment_fn: Function that updates the decoding state at the beginning
77
+ of a segment.
78
+ decode_tokens_fn: Function that takes a decoding state, estimated tokens
79
+ (for a single segment), start time, and max time, and processes the
80
+ tokens, updating the decoding state in place. Also returns the number of
81
+ invalid and dropped events for the segment.
82
+ flush_state_fn: Function that flushes the final decoding state into the
83
+ result.
84
+
85
+ Returns:
86
+ result: The full combined decoding.
87
+ total_invalid_events: Total number of invalid event tokens across all
88
+ predictions.
89
+ total_dropped_events: Total number of dropped event tokens across all
90
+ predictions.
91
+ """
92
+ sorted_predictions = sorted(predictions, key=lambda pred: pred['start_time'])
93
+
94
+ state = init_state_fn()
95
+ total_invalid_events = 0
96
+ total_dropped_events = 0
97
+
98
+ for pred_idx, pred in enumerate(sorted_predictions):
99
+ begin_segment_fn(state)
100
+
101
+ # Depending on the audio token hop length, each symbolic token could be
102
+ # associated with multiple audio frames. Since we split up the audio frames
103
+ # into segments for prediction, this could lead to overlap. To prevent
104
+ # overlap issues, ensure that the current segment does not make any
105
+ # predictions for the time period covered by the subsequent segment.
106
+ max_decode_time = None
107
+ if pred_idx < len(sorted_predictions) - 1:
108
+ max_decode_time = sorted_predictions[pred_idx + 1]['start_time']
109
+
110
+ invalid_events, dropped_events = decode_tokens_fn(
111
+ state, pred['est_tokens'], pred['start_time'], max_decode_time)
112
+
113
+ total_invalid_events += invalid_events
114
+ total_dropped_events += dropped_events
115
+
116
+ return flush_state_fn(state), total_invalid_events, total_dropped_events
117
+
118
+
119
+ def event_predictions_to_ns(
120
+ predictions: Sequence[Mapping[str, Any]], codec: event_codec.Codec,
121
+ encoding_spec: note_sequences.NoteEncodingSpecType
122
+ ) -> Mapping[str, Any]:
123
+ """Convert a sequence of predictions to a combined NoteSequence."""
124
+ ns, total_invalid_events, total_dropped_events = decode_and_combine_predictions(
125
+ predictions=predictions,
126
+ init_state_fn=encoding_spec.init_decoding_state_fn,
127
+ begin_segment_fn=encoding_spec.begin_decoding_segment_fn,
128
+ decode_tokens_fn=functools.partial(
129
+ run_length_encoding.decode_events,
130
+ codec=codec,
131
+ decode_event_fn=encoding_spec.decode_event_fn),
132
+ flush_state_fn=encoding_spec.flush_decoding_state_fn)
133
+
134
+ # Also concatenate raw inputs from all predictions.
135
+ sorted_predictions = sorted(predictions, key=lambda pred: pred['start_time'])
136
+ raw_inputs = np.concatenate(
137
+ [pred['raw_inputs'] for pred in sorted_predictions], axis=0)
138
+ start_times = [pred['start_time'] for pred in sorted_predictions]
139
+
140
+ return {
141
+ 'raw_inputs': raw_inputs,
142
+ 'start_times': start_times,
143
+ 'est_ns': ns,
144
+ 'est_invalid_events': total_invalid_events,
145
+ 'est_dropped_events': total_dropped_events,
146
+ }
147
+
148
+
149
+ def get_prettymidi_pianoroll(ns: note_seq.NoteSequence, fps: float,
150
+ is_drum: bool):
151
+ """Convert NoteSequence to pianoroll through pretty_midi."""
152
+ for note in ns.notes:
153
+ if is_drum or note.end_time - note.start_time < 0.05:
154
+ # Give all drum notes a fixed length, and all others a min length
155
+ note.end_time = note.start_time + 0.05
156
+
157
+ pm = note_seq.note_sequence_to_pretty_midi(ns)
158
+ end_time = pm.get_end_time()
159
+ cc = [
160
+ # all sound off
161
+ pretty_midi.ControlChange(number=120, value=0, time=end_time),
162
+ # all notes off
163
+ pretty_midi.ControlChange(number=123, value=0, time=end_time)
164
+ ]
165
+ pm.instruments[0].control_changes = cc
166
+ if is_drum:
167
+ # If inst.is_drum is set, pretty_midi will return an all zero pianoroll.
168
+ for inst in pm.instruments:
169
+ inst.is_drum = False
170
+ pianoroll = pm.get_piano_roll(fs=fps)
171
+ return pianoroll
172
+
173
+
174
+ def frame_metrics(ref_pianoroll: np.ndarray,
175
+ est_pianoroll: np.ndarray,
176
+ velocity_threshold: int) -> Tuple[float, float, float]:
177
+ """Frame Precision, Recall, and F1."""
178
+ # Pad to same length
179
+ if ref_pianoroll.shape[1] > est_pianoroll.shape[1]:
180
+ diff = ref_pianoroll.shape[1] - est_pianoroll.shape[1]
181
+ est_pianoroll = np.pad(est_pianoroll, [(0, 0), (0, diff)], mode='constant')
182
+ elif est_pianoroll.shape[1] > ref_pianoroll.shape[1]:
183
+ diff = est_pianoroll.shape[1] - ref_pianoroll.shape[1]
184
+ ref_pianoroll = np.pad(ref_pianoroll, [(0, 0), (0, diff)], mode='constant')
185
+
186
+ # For ref, remove any notes that are too quiet (consistent with Cerberus).
187
+ ref_frames_bool = ref_pianoroll > velocity_threshold
188
+ # For est, keep all predicted notes.
189
+ est_frames_bool = est_pianoroll > 0
190
+
191
+ precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
192
+ ref_frames_bool.flatten(),
193
+ est_frames_bool.flatten(),
194
+ labels=[True, False])
195
+
196
+ return precision[0], recall[0], f1[0]
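To make the grouping contract of `combine_predictions_by_id` above concrete, here is a minimal sketch with a trivial combine function. The prediction dicts and the `concat_tokens` helper are invented for illustration, and the sketch assumes the `mt3` package is importable.

# Minimal sketch of combine_predictions_by_id; the predictions and the
# concat_tokens helper below are invented for illustration.
from mt3 import metrics_utils

predictions = [
    {'unique_id': 'a', 'start_time': 0.4, 'est_tokens': [3, 4]},
    {'unique_id': 'a', 'start_time': 0.0, 'est_tokens': [1, 2]},
    {'unique_id': 'b', 'start_time': 0.0, 'est_tokens': [5]},
]

def concat_tokens(preds):
  # Sort segments by start time and concatenate their token lists.
  preds = sorted(preds, key=lambda p: p['start_time'])
  return {'est_tokens': [t for p in preds for t in p['est_tokens']]}

combined = metrics_utils.combine_predictions_by_id(
    predictions=predictions, combine_predictions_fn=concat_tokens)
# combined == {'a': {'est_tokens': [1, 2, 3, 4]}, 'b': {'est_tokens': [5]}}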
mt3/metrics_utils_test.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for metrics_utils."""
16
+
17
+ from mt3 import event_codec
18
+ from mt3 import metrics_utils
19
+ from mt3 import note_sequences
20
+
21
+ import note_seq
22
+ import numpy as np
23
+ import tensorflow as tf
24
+
25
+
26
+ class MetricsUtilsTest(tf.test.TestCase):
27
+
28
+ def test_event_predictions_to_ns(self):
29
+ predictions = [
30
+ {
31
+ 'raw_inputs': [0, 0],
32
+ 'start_time': 0.0,
33
+ 'est_tokens': [20, 160],
34
+ },
35
+ {
36
+ 'raw_inputs': [1, 1],
37
+ 'start_time': 0.4,
38
+ # These last 2 events should be dropped.
39
+ 'est_tokens': [20, 161, 50, 162],
40
+ },
41
+ {
42
+ 'raw_inputs': [2, 2],
43
+ 'start_time': 0.8,
44
+ 'est_tokens': [163, 20, 164]
45
+ },
46
+ ]
47
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
48
+ expected_ns.notes.add(
49
+ pitch=59,
50
+ velocity=100,
51
+ start_time=0.20,
52
+ end_time=0.21)
53
+ expected_ns.notes.add(
54
+ pitch=60,
55
+ velocity=100,
56
+ start_time=0.60,
57
+ end_time=0.61)
58
+ expected_ns.notes.add(
59
+ pitch=62,
60
+ velocity=100,
61
+ start_time=0.80,
62
+ end_time=0.81)
63
+ expected_ns.notes.add(
64
+ pitch=63,
65
+ velocity=100,
66
+ start_time=1.00,
67
+ end_time=1.01)
68
+ expected_ns.total_time = 1.01
69
+
70
+ codec = event_codec.Codec(
71
+ max_shift_steps=100,
72
+ steps_per_second=100,
73
+ event_ranges=[
74
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
75
+ note_seq.MAX_MIDI_PITCH)])
76
+ res = metrics_utils.event_predictions_to_ns(
77
+ predictions, codec=codec,
78
+ encoding_spec=note_sequences.NoteOnsetEncodingSpec)
79
+ self.assertProtoEquals(expected_ns, res['est_ns'])
80
+ self.assertEqual(0, res['est_invalid_events'])
81
+ self.assertEqual(2, res['est_dropped_events'])
82
+ np.testing.assert_array_equal([0, 0, 1, 1, 2, 2], res['raw_inputs'])
83
+
84
+ def test_event_predictions_to_ns_with_offsets(self):
85
+ predictions = [
86
+ {
87
+ 'raw_inputs': [0, 0],
88
+ 'start_time': 0.0,
89
+ 'est_tokens': [20, 356, 160],
90
+ },
91
+ {
92
+ 'raw_inputs': [1, 1],
93
+ 'start_time': 0.4,
94
+ 'est_tokens': [20, 292, 161],
95
+ },
96
+ {
97
+ 'raw_inputs': [2, 2],
98
+ 'start_time': 0.8,
99
+ 'est_tokens': [20, 229, 160, 161]
100
+ },
101
+ ]
102
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
103
+ expected_ns.notes.add(
104
+ pitch=59,
105
+ velocity=127,
106
+ start_time=0.20,
107
+ end_time=1.00)
108
+ expected_ns.notes.add(
109
+ pitch=60,
110
+ velocity=63,
111
+ start_time=0.60,
112
+ end_time=1.00)
113
+ expected_ns.total_time = 1.00
114
+
115
+ codec = event_codec.Codec(
116
+ max_shift_steps=100,
117
+ steps_per_second=100,
118
+ event_ranges=[
119
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
120
+ note_seq.MAX_MIDI_PITCH),
121
+ event_codec.EventRange('velocity', 0, 127)
122
+ ])
123
+ res = metrics_utils.event_predictions_to_ns(
124
+ predictions, codec=codec, encoding_spec=note_sequences.NoteEncodingSpec)
125
+ self.assertProtoEquals(expected_ns, res['est_ns'])
126
+ self.assertEqual(0, res['est_invalid_events'])
127
+ self.assertEqual(0, res['est_dropped_events'])
128
+ np.testing.assert_array_equal([0, 0, 1, 1, 2, 2], res['raw_inputs'])
129
+
130
+ def test_event_predictions_to_ns_multitrack(self):
131
+ predictions = [
132
+ {
133
+ 'raw_inputs': [0, 0],
134
+ 'start_time': 0.0,
135
+ 'est_tokens': [20, 517, 356, 160],
136
+ },
137
+ {
138
+ 'raw_inputs': [1, 1],
139
+ 'start_time': 0.4,
140
+ 'est_tokens': [20, 356, 399],
141
+ },
142
+ {
143
+ 'raw_inputs': [2, 2],
144
+ 'start_time': 0.8,
145
+ 'est_tokens': [20, 517, 229, 160]
146
+ },
147
+ ]
148
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
149
+ expected_ns.notes.add(
150
+ pitch=42,
151
+ velocity=127,
152
+ start_time=0.60,
153
+ end_time=0.61,
154
+ is_drum=True,
155
+ instrument=9)
156
+ expected_ns.notes.add(
157
+ pitch=59,
158
+ velocity=127,
159
+ start_time=0.20,
160
+ end_time=1.00,
161
+ program=32)
162
+ expected_ns.total_time = 1.00
163
+
164
+ codec = event_codec.Codec(
165
+ max_shift_steps=100,
166
+ steps_per_second=100,
167
+ event_ranges=[
168
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
169
+ note_seq.MAX_MIDI_PITCH),
170
+ event_codec.EventRange('velocity', 0, 127),
171
+ event_codec.EventRange('drum', note_seq.MIN_MIDI_PITCH,
172
+ note_seq.MAX_MIDI_PITCH),
173
+ event_codec.EventRange('program', note_seq.MIN_MIDI_PROGRAM,
174
+ note_seq.MAX_MIDI_PROGRAM)
175
+ ])
176
+ res = metrics_utils.event_predictions_to_ns(
177
+ predictions, codec=codec, encoding_spec=note_sequences.NoteEncodingSpec)
178
+ self.assertProtoEquals(expected_ns, res['est_ns'])
179
+ self.assertEqual(0, res['est_invalid_events'])
180
+ self.assertEqual(0, res['est_dropped_events'])
181
+ np.testing.assert_array_equal([0, 0, 1, 1, 2, 2], res['raw_inputs'])
182
+
183
+ def test_event_predictions_to_ns_multitrack_ties(self):
184
+ predictions = [
185
+ {
186
+ 'raw_inputs': [0, 0],
187
+ 'start_time': 0.0,
188
+ 'est_tokens': [613, # no tied notes
189
+ 20, 517, 356, 160],
190
+ },
191
+ {
192
+ 'raw_inputs': [1, 1],
193
+ 'start_time': 0.4,
194
+ 'est_tokens': [517, 160, 613, # tied note
195
+ 20, 356, 399],
196
+ },
197
+ {
198
+ 'raw_inputs': [2, 2],
199
+ 'start_time': 0.8,
200
+ 'est_tokens': [613] # no tied notes, causing active note to end
201
+ },
202
+ ]
203
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
204
+ expected_ns.notes.add(
205
+ pitch=42,
206
+ velocity=127,
207
+ start_time=0.60,
208
+ end_time=0.61,
209
+ is_drum=True,
210
+ instrument=9)
211
+ expected_ns.notes.add(
212
+ pitch=59,
213
+ velocity=127,
214
+ start_time=0.20,
215
+ end_time=0.80,
216
+ program=32)
217
+ expected_ns.total_time = 0.80
218
+
219
+ codec = event_codec.Codec(
220
+ max_shift_steps=100,
221
+ steps_per_second=100,
222
+ event_ranges=[
223
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
224
+ note_seq.MAX_MIDI_PITCH),
225
+ event_codec.EventRange('velocity', 0, 127),
226
+ event_codec.EventRange('drum', note_seq.MIN_MIDI_PITCH,
227
+ note_seq.MAX_MIDI_PITCH),
228
+ event_codec.EventRange('program', note_seq.MIN_MIDI_PROGRAM,
229
+ note_seq.MAX_MIDI_PROGRAM),
230
+ event_codec.EventRange('tie', 0, 0)
231
+ ])
232
+ res = metrics_utils.event_predictions_to_ns(
233
+ predictions, codec=codec,
234
+ encoding_spec=note_sequences.NoteEncodingWithTiesSpec)
235
+ self.assertProtoEquals(expected_ns, res['est_ns'])
236
+ self.assertEqual(0, res['est_invalid_events'])
237
+ self.assertEqual(0, res['est_dropped_events'])
238
+ np.testing.assert_array_equal([0, 0, 1, 1, 2, 2], res['raw_inputs'])
239
+
240
+ def test_frame_metrics(self):
241
+ ref = np.zeros(shape=(128, 5))
242
+ est = np.zeros(shape=(128, 5))
243
+
244
+ # one overlapping note, two false positives, two false negatives
245
+ ref[10, 0] = 127
246
+ ref[10, 1] = 127
247
+ ref[10, 2] = 127
248
+
249
+ est[10, 2] = 127
250
+ est[10, 3] = 127
251
+ est[10, 4] = 127
252
+
253
+ prec, rec, _ = metrics_utils.frame_metrics(ref, est, velocity_threshold=1)
254
+ np.testing.assert_approx_equal(prec, 1/3)
255
+ np.testing.assert_approx_equal(rec, 1/3)
256
+
257
+
258
+ if __name__ == '__main__':
259
+ tf.test.main()
mt3/mixing.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for mixing (in the audio sense) multiple transcription examples."""
16
+
17
+ from typing import Callable, Optional, Sequence
18
+
19
+ import gin
20
+
21
+ from mt3 import event_codec
22
+ from mt3 import run_length_encoding
23
+
24
+ import numpy as np
25
+ import seqio
26
+ import tensorflow as tf
27
+
28
+
29
+ @gin.configurable
30
+ def mix_transcription_examples(
31
+ ds: tf.data.Dataset,
32
+ sequence_length: seqio.preprocessors.SequenceLengthType,
33
+ output_features: seqio.preprocessors.OutputFeaturesType,
34
+ codec: event_codec.Codec,
35
+ inputs_feature_key: str = 'inputs',
36
+ targets_feature_keys: Sequence[str] = ('targets',),
37
+ max_examples_per_mix: Optional[int] = None,
38
+ shuffle_buffer_size: int = seqio.SHUFFLE_BUFFER_SIZE
39
+ ) -> Callable[..., tf.data.Dataset]:
40
+ """Preprocessor that mixes together "batches" of transcription examples.
41
+
42
+ Args:
43
+ ds: Dataset of individual transcription examples, each of which should
44
+ have an 'inputs' field containing 1D audio samples (currently only
45
+ audio encoders that use raw samples as an intermediate representation
46
+ are supported), and a 'targets' field containing run-length encoded
47
+ note events.
48
+ sequence_length: Dictionary mapping feature key to length.
49
+ output_features: Dictionary mapping feature key to spec.
50
+ codec: An event_codec.Codec used to interpret the target events.
51
+ inputs_feature_key: Feature key for inputs which will be mixed as audio.
52
+ targets_feature_keys: List of feature keys for targets, each of which will
53
+ be merged (separately) as run-length encoded note events.
54
+ max_examples_per_mix: Maximum number of individual examples to mix together.
55
+ shuffle_buffer_size: Size of shuffle buffer to use for shuffle prior to
56
+ mixing.
57
+
58
+ Returns:
59
+ Dataset containing mixed examples.
60
+ """
61
+ if max_examples_per_mix is None:
62
+ return ds
63
+
64
+ # TODO(iansimon): is there a way to use seqio's seed?
65
+ ds = tf.data.Dataset.sample_from_datasets([
66
+ ds.shuffle(
67
+ buffer_size=shuffle_buffer_size // max_examples_per_mix
68
+ ).padded_batch(batch_size=i) for i in range(1, max_examples_per_mix + 1)
69
+ ])
70
+
71
+ def mix_inputs(ex):
72
+ samples = tf.reduce_sum(ex[inputs_feature_key], axis=0)
73
+ norm = tf.linalg.norm(samples, ord=np.inf)
74
+ ex[inputs_feature_key] = tf.math.divide_no_nan(samples, norm)
75
+ return ex
76
+ ds = ds.map(mix_inputs, num_parallel_calls=tf.data.experimental.AUTOTUNE)
77
+
78
+ max_tokens = sequence_length['targets']
79
+ if output_features['targets'].add_eos:
80
+ # Leave room to insert an EOS token.
81
+ max_tokens -= 1
82
+
83
+ def mix_targets(ex):
84
+ for k in targets_feature_keys:
85
+ ex[k] = run_length_encoding.merge_run_length_encoded_targets(
86
+ targets=ex[k],
87
+ codec=codec)
88
+ return ex
89
+ ds = ds.map(mix_targets, num_parallel_calls=tf.data.experimental.AUTOTUNE)
90
+
91
+ return ds
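The `mix_inputs` step above simply sums the padded sample arrays and renormalizes by the peak absolute value (infinity norm). A standalone sketch of that operation on dummy waveforms (the sample values are invented):

# Standalone sketch of the mix_inputs normalization; the sample values are
# dummy data chosen only to show the sum-then-rescale behavior.
import numpy as np
import tensorflow as tf

batched_samples = tf.constant([[0.5, -0.25, 0.0, 0.0],
                               [0.25, 0.25, -0.5, 0.0]], dtype=tf.float32)

mixed = tf.reduce_sum(batched_samples, axis=0)   # sum the waveforms
norm = tf.linalg.norm(mixed, ord=np.inf)         # peak absolute value
mixed = tf.math.divide_no_nan(mixed, norm)       # rescale into [-1, 1]
print(mixed.numpy())  # -> [ 1.  0. -0.6667  0.]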
mt3/models.py ADDED
@@ -0,0 +1,152 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Feature converter and model for continuous inputs."""
16
+
17
+ from typing import Mapping
18
+ import seqio
19
+ from t5x import decoding
20
+ from t5x import models
21
+ import tensorflow as tf
22
+
23
+
24
+ class ContinuousInputsEncDecFeatureConverter(seqio.FeatureConverter):
25
+ """Feature converter for an encoder-decoder with continuous inputs."""
26
+
27
+ TASK_FEATURES = {
28
+ "inputs": seqio.FeatureConverter.FeatureSpec(dtype=tf.float32, rank=2),
29
+ "targets": seqio.FeatureConverter.FeatureSpec(dtype=tf.int32),
30
+ }
31
+ MODEL_FEATURES = {
32
+ "encoder_input_tokens":
33
+ seqio.FeatureConverter.FeatureSpec(dtype=tf.float32, rank=2),
34
+ "decoder_target_tokens":
35
+ seqio.FeatureConverter.FeatureSpec(dtype=tf.int32),
36
+ "decoder_input_tokens":
37
+ seqio.FeatureConverter.FeatureSpec(dtype=tf.int32),
38
+ "decoder_loss_weights":
39
+ seqio.FeatureConverter.FeatureSpec(dtype=tf.int32),
40
+ }
41
+ PACKING_FEATURE_DTYPES = {
42
+ "encoder_segment_ids": tf.int32,
43
+ "decoder_segment_ids": tf.int32,
44
+ "encoder_positions": tf.int32,
45
+ "decoder_positions": tf.int32
46
+ }
47
+
48
+ def _convert_features(
49
+ self, ds: tf.data.Dataset,
50
+ task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
51
+ """Convert the dataset to be fed to the encoder-decoder model.
52
+
53
+ The conversion process involves two steps:
54
+
55
+ 1. Each feature in the `task_feature_lengths` is trimmed/padded and
56
+ optionally packed depending on the value of self.pack.
57
+ 2. "inputs" fields are mapped to the encoder input and "targets" are mapped
58
+ to decoder input (after being shifted) and target.
59
+
60
+ All the keys in the `task_feature_lengths` should be present in the input
61
+ dataset, which may contain some extra features that are not in the
62
+ `task_feature_lengths`. They will not be included in the output dataset.
63
+ One common scenario is the "inputs_pretokenized" and "targets_pretokenized"
64
+ fields.
65
+
66
+ Args:
67
+ ds: an input tf.data.Dataset to be converted.
68
+ task_feature_lengths: a mapping from feature to its length.
69
+
70
+ Returns:
71
+ ds: the converted dataset.
72
+ """
73
+
74
+ def convert_example(
75
+ features: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
76
+ # targets_segment_id is present only for a packed dataset.
77
+ decoder_input_tokens = seqio.autoregressive_inputs(
78
+ features["targets"],
79
+ sequence_id=features.get("targets_segment_ids", None))
80
+
81
+ d = {"encoder_input_tokens": features["inputs"],
82
+ "decoder_target_tokens": features["targets"],
83
+ "decoder_input_tokens": decoder_input_tokens,
84
+ # Loss is computed for all but the padding positions.
85
+ "decoder_loss_weights":
86
+ seqio.non_padding_position(features["targets"])}
87
+
88
+ if self.pack:
89
+ d["encoder_segment_ids"] = features["inputs_segment_ids"]
90
+ d["decoder_segment_ids"] = features["targets_segment_ids"]
91
+ d["encoder_positions"] = features["inputs_positions"]
92
+ d["decoder_positions"] = features["targets_positions"]
93
+
94
+ return d
95
+
96
+ ds = self._pack_or_pad(ds, task_feature_lengths)
97
+ return ds.map(
98
+ convert_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
99
+
100
+ def get_model_feature_lengths(
101
+ self, task_feature_lengths: Mapping[str, int]) -> Mapping[str, int]:
102
+ """Define the length relationship between input and output features."""
103
+ encoder_length = task_feature_lengths["inputs"]
104
+ decoder_length = task_feature_lengths["targets"]
105
+
106
+ model_feature_lengths = {
107
+ "encoder_input_tokens": encoder_length,
108
+ "decoder_target_tokens": decoder_length,
109
+ "decoder_input_tokens": decoder_length,
110
+ "decoder_loss_weights": decoder_length
111
+ }
112
+ if self.pack:
113
+ model_feature_lengths["encoder_segment_ids"] = encoder_length
114
+ model_feature_lengths["decoder_segment_ids"] = decoder_length
115
+ model_feature_lengths["encoder_positions"] = encoder_length
116
+ model_feature_lengths["decoder_positions"] = decoder_length
117
+
118
+ return model_feature_lengths
119
+
120
+
121
+ class ContinuousInputsEncoderDecoderModel(models.EncoderDecoderModel):
122
+ """Encoder-decoder model with continuous inputs."""
123
+
124
+ FEATURE_CONVERTER_CLS = ContinuousInputsEncDecFeatureConverter
125
+
126
+ def __init__(self, module, input_vocabulary, output_vocabulary, optimizer_def,
127
+ input_depth, decode_fn=decoding.beam_search, label_smoothing=0.0,
128
+ z_loss=0.0, loss_normalizing_factor=None):
129
+ super().__init__(
130
+ module=module,
131
+ input_vocabulary=input_vocabulary,
132
+ output_vocabulary=output_vocabulary,
133
+ optimizer_def=optimizer_def,
134
+ decode_fn=decode_fn,
135
+ label_smoothing=label_smoothing,
136
+ z_loss=z_loss,
137
+ loss_normalizing_factor=loss_normalizing_factor)
138
+ self._input_depth = input_depth
139
+
140
+ def get_initial_variables(self, rng, input_shapes, input_types=None):
141
+ """Hacky override to bypass eval/infer inability to handle rank-3 inputs."""
142
+ encoder_shape = input_shapes["encoder_input_tokens"]
143
+ if len(encoder_shape) == 2:
144
+ input_shapes = {
145
+ "encoder_input_tokens": (*encoder_shape, self._input_depth),
146
+ **{k: v for k, v in input_shapes.items()
147
+ if k != "encoder_input_tokens"}
148
+ }
149
+ else:
150
+ assert encoder_shape[-1] == self._input_depth
151
+ return super().get_initial_variables(
152
+ rng=rng, input_shapes=input_shapes, input_types=input_types)
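As a small sanity check on the feature converter above, its length mapping can be exercised on its own. This sketch assumes the standard `pack` argument of the seqio.FeatureConverter constructor and uses arbitrary lengths.

# Sketch of the task-feature -> model-feature length mapping with packing
# disabled; the lengths below are arbitrary.
converter = ContinuousInputsEncDecFeatureConverter(pack=False)
lengths = converter.get_model_feature_lengths(
    {'inputs': 256, 'targets': 1024})
# lengths == {'encoder_input_tokens': 256,
#             'decoder_target_tokens': 1024,
#             'decoder_input_tokens': 1024,
#             'decoder_loss_weights': 1024}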
mt3/network.py ADDED
@@ -0,0 +1,409 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """T5.1.1 Transformer model."""
16
+
17
+ from typing import Any, Sequence
18
+
19
+ from flax import linen as nn
20
+ from flax import struct
21
+ import jax.numpy as jnp
22
+ from mt3 import layers
23
+
24
+
25
+ @struct.dataclass
26
+ class T5Config:
27
+ """Global hyperparameters used to minimize obnoxious kwarg plumbing."""
28
+ vocab_size: int
29
+ # Activation dtypes.
30
+ dtype: Any = jnp.float32
31
+ emb_dim: int = 512
32
+ num_heads: int = 8
33
+ num_encoder_layers: int = 6
34
+ num_decoder_layers: int = 6
35
+ head_dim: int = 64
36
+ mlp_dim: int = 2048
37
+ # Activation functions are retrieved from Flax.
38
+ mlp_activations: Sequence[str] = ('relu',)
39
+ dropout_rate: float = 0.1
40
+ # If `True`, the embedding weights are used in the decoder output layer.
41
+ logits_via_embedding: bool = False
42
+
43
+
44
+ class EncoderLayer(nn.Module):
45
+ """Transformer encoder layer."""
46
+ config: T5Config
47
+
48
+ @nn.compact
49
+ def __call__(self, inputs, encoder_mask=None, deterministic=False):
50
+ cfg = self.config
51
+
52
+ # Attention block.
53
+ assert inputs.ndim == 3
54
+ x = layers.LayerNorm(
55
+ dtype=cfg.dtype, name='pre_attention_layer_norm')(
56
+ inputs)
57
+ # [batch, length, emb_dim] -> [batch, length, emb_dim]
58
+ x = layers.MultiHeadDotProductAttention(
59
+ num_heads=cfg.num_heads,
60
+ dtype=cfg.dtype,
61
+ head_dim=cfg.head_dim,
62
+ dropout_rate=cfg.dropout_rate,
63
+ name='attention')(
64
+ x, x, encoder_mask, deterministic=deterministic)
65
+ x = nn.Dropout(
66
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
67
+ x, deterministic=deterministic)
68
+ x = x + inputs
69
+
70
+ # MLP block.
71
+ y = layers.LayerNorm(dtype=cfg.dtype, name='pre_mlp_layer_norm')(x)
72
+ # [batch, length, emb_dim] -> [batch, length, emb_dim]
73
+ y = layers.MlpBlock(
74
+ intermediate_dim=cfg.mlp_dim,
75
+ activations=cfg.mlp_activations,
76
+ intermediate_dropout_rate=cfg.dropout_rate,
77
+ dtype=cfg.dtype,
78
+ name='mlp',
79
+ )(y, deterministic=deterministic)
80
+ y = nn.Dropout(
81
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
82
+ y, deterministic=deterministic)
83
+ y = y + x
84
+
85
+ return y
86
+
87
+
88
+ class DecoderLayer(nn.Module):
89
+ """Transformer decoder layer that attends to the encoder."""
90
+ config: T5Config
91
+
92
+ @nn.compact
93
+ def __call__(self,
94
+ inputs,
95
+ encoded,
96
+ decoder_mask=None,
97
+ encoder_decoder_mask=None,
98
+ deterministic=False,
99
+ decode=False,
100
+ max_decode_length=None):
101
+ cfg = self.config
102
+
103
+ # inputs: embedded inputs to the decoder with shape [batch, length, emb_dim]
104
+ x = layers.LayerNorm(
105
+ dtype=cfg.dtype, name='pre_self_attention_layer_norm')(
106
+ inputs)
107
+
108
+ # Self-attention block
109
+ x = layers.MultiHeadDotProductAttention(
110
+ num_heads=cfg.num_heads,
111
+ dtype=cfg.dtype,
112
+ head_dim=cfg.head_dim,
113
+ dropout_rate=cfg.dropout_rate,
114
+ name='self_attention')(
115
+ x,
116
+ x,
117
+ decoder_mask,
118
+ deterministic=deterministic,
119
+ decode=decode)
120
+ x = nn.Dropout(
121
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
122
+ x, deterministic=deterministic)
123
+ x = x + inputs
124
+
125
+ # Encoder-Decoder block.
126
+ y = layers.LayerNorm(
127
+ dtype=cfg.dtype, name='pre_cross_attention_layer_norm')(
128
+ x)
129
+ y = layers.MultiHeadDotProductAttention(
130
+ num_heads=cfg.num_heads,
131
+ dtype=cfg.dtype,
132
+ head_dim=cfg.head_dim,
133
+ dropout_rate=cfg.dropout_rate,
134
+ name='encoder_decoder_attention')(
135
+ y, encoded, encoder_decoder_mask, deterministic=deterministic)
136
+ y = nn.Dropout(
137
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
138
+ y, deterministic=deterministic)
139
+ y = y + x
140
+
141
+ # MLP block.
142
+ z = layers.LayerNorm(dtype=cfg.dtype, name='pre_mlp_layer_norm')(y)
143
+ z = layers.MlpBlock(
144
+ intermediate_dim=cfg.mlp_dim,
145
+ activations=cfg.mlp_activations,
146
+ intermediate_dropout_rate=cfg.dropout_rate,
147
+ dtype=cfg.dtype,
148
+ name='mlp',
149
+ )(z, deterministic=deterministic)
150
+ z = nn.Dropout(
151
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
152
+ z, deterministic=deterministic)
153
+ z = z + y
154
+
155
+ return z
156
+
157
+
158
+ class Encoder(nn.Module):
159
+ """A stack of encoder layers."""
160
+ config: T5Config
161
+
162
+ @nn.compact
163
+ def __call__(self,
164
+ encoder_input_tokens,
165
+ encoder_mask=None,
166
+ deterministic=False):
167
+ cfg = self.config
168
+ assert encoder_input_tokens.ndim == 3 # [batch, length, depth]
169
+
170
+ seq_length = encoder_input_tokens.shape[-2]
171
+ inputs_positions = jnp.arange(seq_length)[None, :]
172
+
173
+ # [batch, length, depth] -> [batch, length, emb_dim]
174
+ x = layers.DenseGeneral(
175
+ cfg.emb_dim,
176
+ dtype=cfg.dtype,
177
+ kernel_init=nn.linear.default_kernel_init,
178
+ kernel_axes=('vocab', 'embed'),
179
+ name='continuous_inputs_projection')(encoder_input_tokens)
180
+ x = x + layers.FixedEmbed(features=cfg.emb_dim)(inputs_positions)
181
+ x = nn.Dropout(
182
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
183
+ x, deterministic=deterministic)
184
+ x = x.astype(cfg.dtype)
185
+
186
+ for lyr in range(cfg.num_encoder_layers):
187
+ # [batch, length, emb_dim] -> [batch, length, emb_dim]
188
+ x = EncoderLayer(
189
+ config=cfg,
190
+ name=f'layers_{lyr}')(x, encoder_mask, deterministic)
191
+
192
+ x = layers.LayerNorm(dtype=cfg.dtype, name='encoder_norm')(x)
193
+ return nn.Dropout(rate=cfg.dropout_rate)(x, deterministic=deterministic)
194
+
195
+
196
+ class Decoder(nn.Module):
197
+ """A stack of decoder layers as a part of an encoder-decoder architecture."""
198
+ config: T5Config
199
+
200
+ @nn.compact
201
+ def __call__(self,
202
+ encoded,
203
+ decoder_input_tokens,
204
+ decoder_positions=None,
205
+ decoder_mask=None,
206
+ encoder_decoder_mask=None,
207
+ deterministic=False,
208
+ decode=False,
209
+ max_decode_length=None):
210
+ cfg = self.config
211
+ assert decoder_input_tokens.ndim == 2 # [batch, len]
212
+
213
+ seq_length = decoder_input_tokens.shape[-1]
214
+ decoder_positions = jnp.arange(seq_length)[None, :]
215
+
216
+ # [batch, length] -> [batch, length, emb_dim]
217
+ y = layers.Embed(
218
+ num_embeddings=cfg.vocab_size,
219
+ features=cfg.emb_dim,
220
+ dtype=cfg.dtype,
221
+ attend_dtype=jnp.float32, # for logit training stability
222
+ embedding_init=nn.initializers.normal(stddev=1.0),
223
+ one_hot=True,
224
+ name='token_embedder')(decoder_input_tokens.astype('int32'))
225
+ y = y + layers.FixedEmbed(features=cfg.emb_dim)(
226
+ decoder_positions, decode=decode)
227
+ y = nn.Dropout(
228
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
229
+ y, deterministic=deterministic)
230
+ y = y.astype(cfg.dtype)
231
+
232
+ for lyr in range(cfg.num_decoder_layers):
233
+ # [batch, length, emb_dim] -> [batch, length, emb_dim]
234
+ y = DecoderLayer(
235
+ config=cfg, name=f'layers_{lyr}')(
236
+ y,
237
+ encoded,
238
+ decoder_mask=decoder_mask,
239
+ encoder_decoder_mask=encoder_decoder_mask,
240
+ deterministic=deterministic,
241
+ decode=decode,
242
+ max_decode_length=max_decode_length)
243
+
244
+ y = layers.LayerNorm(dtype=cfg.dtype, name='decoder_norm')(y)
245
+ y = nn.Dropout(
246
+ rate=cfg.dropout_rate, broadcast_dims=(-2,))(
247
+ y, deterministic=deterministic)
248
+
249
+ # [batch, length, emb_dim] -> [batch, length, vocab_size]
250
+ if cfg.logits_via_embedding:
251
+ # Use the transpose of embedding matrix for logit transform.
252
+ logits = self.shared_embedding.attend(y)
253
+ # Correctly normalize pre-softmax logits for this shared case.
254
+ logits = logits / jnp.sqrt(y.shape[-1])
255
+ else:
256
+ logits = layers.DenseGeneral(
257
+ cfg.vocab_size,
258
+ dtype=jnp.float32, # Use float32 for stabiliity.
259
+ kernel_axes=('embed', 'vocab'),
260
+ name='logits_dense')(
261
+ y)
262
+ return logits
263
+
264
+
265
+ class Transformer(nn.Module):
266
+ """An encoder-decoder Transformer model."""
267
+ config: T5Config
268
+
269
+ def setup(self):
270
+ cfg = self.config
271
+
272
+ self.encoder = Encoder(config=cfg)
273
+ self.decoder = Decoder(config=cfg)
274
+
275
+ def encode(self,
276
+ encoder_input_tokens,
277
+ encoder_segment_ids=None,
278
+ enable_dropout=True):
279
+ """Applies Transformer encoder-branch on the inputs."""
280
+ cfg = self.config
281
+ assert encoder_input_tokens.ndim == 3 # (batch, length, depth)
282
+
283
+ # Make padding attention mask; we don't actually mask out any input
284
+ # positions, letting the model potentially attend to the zero vector used as
285
+ # padding.
286
+ encoder_mask = layers.make_attention_mask(
287
+ jnp.ones(encoder_input_tokens.shape[:-1]),
288
+ jnp.ones(encoder_input_tokens.shape[:-1]),
289
+ dtype=cfg.dtype)
290
+ # Add segmentation block-diagonal attention mask if using segmented data.
291
+ if encoder_segment_ids is not None:
292
+ encoder_mask = layers.combine_masks(
293
+ encoder_mask,
294
+ layers.make_attention_mask(
295
+ encoder_segment_ids,
296
+ encoder_segment_ids,
297
+ jnp.equal,
298
+ dtype=cfg.dtype))
299
+
300
+ return self.encoder(
301
+ encoder_input_tokens, encoder_mask, deterministic=not enable_dropout)
302
+
303
+ def decode(
304
+ self,
305
+ encoded,
306
+ encoder_input_tokens, # only needed for masks
307
+ decoder_input_tokens,
308
+ decoder_target_tokens,
309
+ encoder_segment_ids=None,
310
+ decoder_segment_ids=None,
311
+ decoder_positions=None,
312
+ enable_dropout=True,
313
+ decode=False,
314
+ max_decode_length=None):
315
+ """Applies Transformer decoder-branch on encoded-input and target."""
316
+ cfg = self.config
317
+
318
+ # Make padding attention masks.
319
+ if decode:
320
+ # Do not mask decoder attention based on targets padding at
321
+ # decoding/inference time.
322
+ decoder_mask = None
323
+ encoder_decoder_mask = layers.make_attention_mask(
324
+ jnp.ones_like(decoder_target_tokens),
325
+ jnp.ones(encoder_input_tokens.shape[:-1]),
326
+ dtype=cfg.dtype)
327
+ else:
328
+ decoder_mask = layers.make_decoder_mask(
329
+ decoder_target_tokens=decoder_target_tokens,
330
+ dtype=cfg.dtype,
331
+ decoder_segment_ids=decoder_segment_ids)
332
+ encoder_decoder_mask = layers.make_attention_mask(
333
+ decoder_target_tokens > 0,
334
+ jnp.ones(encoder_input_tokens.shape[:-1]),
335
+ dtype=cfg.dtype)
336
+
337
+ # Add segmentation block-diagonal attention masks if using segmented data.
338
+ if encoder_segment_ids is not None:
339
+ if decode:
340
+ raise ValueError(
341
+ 'During decoding, packing should not be used but '
342
+ '`encoder_segment_ids` was passed to `Transformer.decode`.')
343
+
344
+ encoder_decoder_mask = layers.combine_masks(
345
+ encoder_decoder_mask,
346
+ layers.make_attention_mask(
347
+ decoder_segment_ids,
348
+ encoder_segment_ids,
349
+ jnp.equal,
350
+ dtype=cfg.dtype))
351
+
352
+ logits = self.decoder(
353
+ encoded,
354
+ decoder_input_tokens=decoder_input_tokens,
355
+ decoder_positions=decoder_positions,
356
+ decoder_mask=decoder_mask,
357
+ encoder_decoder_mask=encoder_decoder_mask,
358
+ deterministic=not enable_dropout,
359
+ decode=decode,
360
+ max_decode_length=max_decode_length)
361
+ return logits.astype(self.config.dtype)
362
+
363
+ def __call__(self,
364
+ encoder_input_tokens,
365
+ decoder_input_tokens,
366
+ decoder_target_tokens,
367
+ encoder_segment_ids=None,
368
+ decoder_segment_ids=None,
369
+ encoder_positions=None,
370
+ decoder_positions=None,
371
+ *,
372
+ enable_dropout: bool = True,
373
+ decode: bool = False):
374
+ """Applies Transformer model on the inputs.
375
+
376
+ This method requires both decoder_target_tokens and decoder_input_tokens,
377
+ which is a shifted version of the former. For a packed dataset, it usually
378
+ has additional processing applied. For example, the first element of each
379
+ sequence has id 0 instead of the shifted EOS id from the previous sequence.
380
+
381
+ Args:
382
+ encoder_input_tokens: input data to the encoder.
383
+ decoder_input_tokens: input token to the decoder.
384
+ decoder_target_tokens: target token to the decoder.
385
+ encoder_segment_ids: encoder segmentation info for packed examples.
386
+ decoder_segment_ids: decoder segmentation info for packed examples.
387
+ encoder_positions: encoder subsequence positions for packed examples.
388
+ decoder_positions: decoder subsequence positions for packed examples.
389
+ enable_dropout: Enables dropout if set to True.
390
+ decode: Whether to prepare and use an autoregressive cache.
391
+
392
+ Returns:
393
+ logits array from full transformer.
394
+ """
395
+ encoded = self.encode(
396
+ encoder_input_tokens,
397
+ encoder_segment_ids=encoder_segment_ids,
398
+ enable_dropout=enable_dropout)
399
+
400
+ return self.decode(
401
+ encoded,
402
+ encoder_input_tokens, # only used for masks
403
+ decoder_input_tokens,
404
+ decoder_target_tokens,
405
+ encoder_segment_ids=encoder_segment_ids,
406
+ decoder_segment_ids=decoder_segment_ids,
407
+ decoder_positions=decoder_positions,
408
+ enable_dropout=enable_dropout,
409
+ decode=decode)
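The encode/decode split above lets inference run the encoder once and then call the decoder autoregressively with decode=True, while training uses the combined __call__. The subtler part is the mask construction: the encoder self-attention mask is all-ones (padding frames are zero vectors rather than a padding token), optionally ANDed with a block-diagonal segment mask when examples are packed. A standalone sketch of that masking idea in plain jax.numpy (the two helpers below are simplified stand-ins written for this sketch, not the actual t5x layers functions):

import jax.numpy as jnp

def make_attention_mask(query_ok, key_ok, pairwise_fn=jnp.multiply, dtype=jnp.float32):
  # [batch, len_q] x [batch, len_k] -> [batch, 1, len_q, len_k]
  mask = pairwise_fn(query_ok[:, :, None], key_ok[:, None, :])
  return mask[:, None, :, :].astype(dtype)

def combine_masks(*masks):
  # Elementwise AND of the given masks, skipping None entries.
  masks = [m for m in masks if m is not None]
  out = masks[0]
  for m in masks[1:]:
    out = out * m
  return out

# One batch element containing two packed segments (lengths 2 and 1).
segment_ids = jnp.array([[1, 1, 2]])
padding_mask = make_attention_mask(jnp.ones((1, 3)), jnp.ones((1, 3)))
segment_mask = make_attention_mask(segment_ids, segment_ids, pairwise_fn=jnp.equal)
encoder_mask = combine_masks(padding_mask, segment_mask)
# encoder_mask[0, 0] is block-diagonal: positions 0 and 1 attend to each other,
# position 2 attends only to itself.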
mt3/note_sequences.py ADDED
@@ -0,0 +1,446 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Helper functions that operate on NoteSequence protos."""
16
+
17
+ import dataclasses
18
+ import itertools
19
+
20
+ from typing import MutableMapping, MutableSet, Optional, Sequence, Tuple
21
+
22
+ from mt3 import event_codec
23
+ from mt3 import run_length_encoding
24
+ from mt3 import vocabularies
25
+
26
+ import note_seq
27
+
28
+ DEFAULT_VELOCITY = 100
29
+ DEFAULT_NOTE_DURATION = 0.01
30
+
31
+ # Quantization can result in zero-length notes; enforce a minimum duration.
32
+ MIN_NOTE_DURATION = 0.01
33
+
34
+
35
+ @dataclasses.dataclass
36
+ class TrackSpec:
37
+ name: str
38
+ program: int = 0
39
+ is_drum: bool = False
40
+
41
+
42
+ def extract_track(ns, program, is_drum):
43
+ track = note_seq.NoteSequence(ticks_per_quarter=220)
44
+ track_notes = [note for note in ns.notes
45
+ if note.program == program and note.is_drum == is_drum]
46
+ track.notes.extend(track_notes)
47
+ track.total_time = (max(note.end_time for note in track.notes)
48
+ if track.notes else 0.0)
49
+ return track
50
+
51
+
52
+ def trim_overlapping_notes(ns: note_seq.NoteSequence) -> note_seq.NoteSequence:
53
+ """Trim overlapping notes from a NoteSequence, dropping zero-length notes."""
54
+ ns_trimmed = note_seq.NoteSequence()
55
+ ns_trimmed.CopyFrom(ns)
56
+ channels = set((note.pitch, note.program, note.is_drum)
57
+ for note in ns_trimmed.notes)
58
+ for pitch, program, is_drum in channels:
59
+ notes = [note for note in ns_trimmed.notes if note.pitch == pitch
60
+ and note.program == program and note.is_drum == is_drum]
61
+ sorted_notes = sorted(notes, key=lambda note: note.start_time)
62
+ for i in range(1, len(sorted_notes)):
63
+ if sorted_notes[i - 1].end_time > sorted_notes[i].start_time:
64
+ sorted_notes[i - 1].end_time = sorted_notes[i].start_time
65
+ valid_notes = [note for note in ns_trimmed.notes
66
+ if note.start_time < note.end_time]
67
+ del ns_trimmed.notes[:]
68
+ ns_trimmed.notes.extend(valid_notes)
69
+ return ns_trimmed
70
+
71
+
72
+ def assign_instruments(ns: note_seq.NoteSequence) -> None:
73
+ """Assign instrument numbers to notes; modifies NoteSequence in place."""
74
+ program_instruments = {}
75
+ for note in ns.notes:
76
+ if note.program not in program_instruments and not note.is_drum:
77
+ num_instruments = len(program_instruments)
78
+ note.instrument = (num_instruments if num_instruments < 9
79
+ else num_instruments + 1)
80
+ program_instruments[note.program] = note.instrument
81
+ elif note.is_drum:
82
+ note.instrument = 9
83
+ else:
84
+ note.instrument = program_instruments[note.program]
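A small usage sketch (hypothetical values, assuming mt3 and note_seq are importable): assign_instruments hands out instrument channels in the order programs are first seen, skipping channel 9, which is reserved for drums.

import note_seq
from mt3 import note_sequences

ns = note_seq.NoteSequence(ticks_per_quarter=220)
ns.notes.add(start_time=0.0, end_time=0.5, pitch=60, velocity=100, program=0)
ns.notes.add(start_time=0.0, end_time=0.5, pitch=64, velocity=100, program=40)
ns.notes.add(start_time=0.0, end_time=0.5, pitch=38, velocity=100, is_drum=True)
note_sequences.assign_instruments(ns)
print([note.instrument for note in ns.notes])  # [0, 1, 9]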
85
+
86
+
87
+ def validate_note_sequence(ns: note_seq.NoteSequence) -> None:
88
+ """Raise ValueError if NoteSequence contains invalid notes."""
89
+ for note in ns.notes:
90
+ if note.start_time >= note.end_time:
91
+ raise ValueError('note has start time >= end time: %f >= %f' %
92
+ (note.start_time, note.end_time))
93
+ if note.velocity == 0:
94
+ raise ValueError('note has zero velocity')
95
+
96
+
97
+ def note_arrays_to_note_sequence(
98
+ onset_times: Sequence[float],
99
+ pitches: Sequence[int],
100
+ offset_times: Optional[Sequence[float]] = None,
101
+ velocities: Optional[Sequence[int]] = None,
102
+ programs: Optional[Sequence[int]] = None,
103
+ is_drums: Optional[Sequence[bool]] = None
104
+ ) -> note_seq.NoteSequence:
105
+ """Convert note onset / offset / pitch / velocity arrays to NoteSequence."""
106
+ ns = note_seq.NoteSequence(ticks_per_quarter=220)
107
+ for onset_time, offset_time, pitch, velocity, program, is_drum in itertools.zip_longest(
108
+ onset_times, [] if offset_times is None else offset_times,
109
+ pitches, [] if velocities is None else velocities,
110
+ [] if programs is None else programs,
111
+ [] if is_drums is None else is_drums):
112
+ if offset_time is None:
113
+ offset_time = onset_time + DEFAULT_NOTE_DURATION
114
+ if velocity is None:
115
+ velocity = DEFAULT_VELOCITY
116
+ if program is None:
117
+ program = 0
118
+ if is_drum is None:
119
+ is_drum = False
120
+ ns.notes.add(
121
+ start_time=onset_time,
122
+ end_time=offset_time,
123
+ pitch=pitch,
124
+ velocity=velocity,
125
+ program=program,
126
+ is_drum=is_drum)
127
+ ns.total_time = max(ns.total_time, offset_time)
128
+ assign_instruments(ns)
129
+ return ns
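An illustrative call with only onsets and pitches; the remaining fields fall back to the defaults above (DEFAULT_NOTE_DURATION, DEFAULT_VELOCITY, program 0, not a drum):

from mt3 import note_sequences

ns = note_sequences.note_arrays_to_note_sequence(
    onset_times=[0.0, 0.5], pitches=[60, 64])
print([(n.start_time, n.end_time, n.velocity) for n in ns.notes])
# [(0.0, 0.01, 100), (0.5, ~0.51, 100)]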
130
+
131
+
132
+ @dataclasses.dataclass
133
+ class NoteEventData:
134
+ pitch: int
135
+ velocity: Optional[int] = None
136
+ program: Optional[int] = None
137
+ is_drum: Optional[bool] = None
138
+ instrument: Optional[int] = None
139
+
140
+
141
+ def note_sequence_to_onsets(
142
+ ns: note_seq.NoteSequence
143
+ ) -> Tuple[Sequence[float], Sequence[NoteEventData]]:
144
+ """Extract note onsets and pitches from NoteSequence proto."""
145
+ # Sort by pitch to use as a tiebreaker for subsequent stable sort.
146
+ notes = sorted(ns.notes, key=lambda note: note.pitch)
147
+ return ([note.start_time for note in notes],
148
+ [NoteEventData(pitch=note.pitch) for note in notes])
149
+
150
+
151
+ def note_sequence_to_onsets_and_offsets(
152
+ ns: note_seq.NoteSequence,
153
+ ) -> Tuple[Sequence[float], Sequence[NoteEventData]]:
154
+ """Extract onset & offset times and pitches from a NoteSequence proto.
155
+
156
+ The onset & offset times will not necessarily be in sorted order.
157
+
158
+ Args:
159
+ ns: NoteSequence from which to extract onsets and offsets.
160
+
161
+ Returns:
162
+ times: A list of note onset and offset times.
163
+ values: A list of NoteEventData objects where velocity is zero for note
164
+ offsets.
165
+ """
166
+ # Sort by pitch and put offsets before onsets as a tiebreaker for subsequent
167
+ # stable sort.
168
+ notes = sorted(ns.notes, key=lambda note: note.pitch)
169
+ times = ([note.end_time for note in notes] +
170
+ [note.start_time for note in notes])
171
+ values = ([NoteEventData(pitch=note.pitch, velocity=0) for note in notes] +
172
+ [NoteEventData(pitch=note.pitch, velocity=note.velocity)
173
+ for note in notes])
174
+ return times, values
175
+
176
+
177
+ def note_sequence_to_onsets_and_offsets_and_programs(
178
+ ns: note_seq.NoteSequence,
179
+ ) -> Tuple[Sequence[float], Sequence[NoteEventData]]:
180
+ """Extract onset & offset times and pitches & programs from a NoteSequence.
181
+
182
+ The onset & offset times will not necessarily be in sorted order.
183
+
184
+ Args:
185
+ ns: NoteSequence from which to extract onsets and offsets.
186
+
187
+ Returns:
188
+ times: A list of note onset and offset times.
189
+ values: A list of NoteEventData objects where velocity is zero for note
190
+ offsets.
191
+ """
192
+ # Sort by program and pitch and put offsets before onsets as a tiebreaker for
193
+ # subsequent stable sort.
194
+ notes = sorted(ns.notes,
195
+ key=lambda note: (note.is_drum, note.program, note.pitch))
196
+ times = ([note.end_time for note in notes if not note.is_drum] +
197
+ [note.start_time for note in notes])
198
+ values = ([NoteEventData(pitch=note.pitch, velocity=0,
199
+ program=note.program, is_drum=False)
200
+ for note in notes if not note.is_drum] +
201
+ [NoteEventData(pitch=note.pitch, velocity=note.velocity,
202
+ program=note.program, is_drum=note.is_drum)
203
+ for note in notes])
204
+ return times, values
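An illustration of the ordering convention with a hypothetical one-note sequence: the offset is listed before the onset, with velocity zero marking the offset, so a later stable sort by time places each note's offset ahead of any simultaneous onset.

import note_seq
from mt3 import note_sequences

ns = note_seq.NoteSequence()
ns.notes.add(start_time=1.0, end_time=2.0, pitch=60, velocity=80, program=0)
times, values = note_sequences.note_sequence_to_onsets_and_offsets_and_programs(ns)
print(times)                                   # [2.0, 1.0]: offset time, then onset time
print(values[0].velocity, values[1].velocity)  # 0 80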
205
+
206
+
207
+ @dataclasses.dataclass
208
+ class NoteEncodingState:
209
+ """Encoding state for note transcription, keeping track of active pitches."""
210
+ # velocity bin for active pitches and programs
211
+ active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(
212
+ default_factory=dict)
213
+
214
+
215
+ def note_event_data_to_events(
216
+ state: Optional[NoteEncodingState],
217
+ value: NoteEventData,
218
+ codec: event_codec.Codec,
219
+ ) -> Sequence[event_codec.Event]:
220
+ """Convert note event data to a sequence of events."""
221
+ if value.velocity is None:
222
+ # onsets only, no program or velocity
223
+ return [event_codec.Event('pitch', value.pitch)]
224
+ else:
225
+ num_velocity_bins = vocabularies.num_velocity_bins_from_codec(codec)
226
+ velocity_bin = vocabularies.velocity_to_bin(
227
+ value.velocity, num_velocity_bins)
228
+ if value.program is None:
229
+ # onsets + offsets + velocities only, no programs
230
+ if state is not None:
231
+ state.active_pitches[(value.pitch, 0)] = velocity_bin
232
+ return [event_codec.Event('velocity', velocity_bin),
233
+ event_codec.Event('pitch', value.pitch)]
234
+ else:
235
+ if value.is_drum:
236
+ # drum events use a separate vocabulary
237
+ return [event_codec.Event('velocity', velocity_bin),
238
+ event_codec.Event('drum', value.pitch)]
239
+ else:
240
+ # program + velocity + pitch
241
+ if state is not None:
242
+ state.active_pitches[(value.pitch, value.program)] = velocity_bin
243
+ return [event_codec.Event('program', value.program),
244
+ event_codec.Event('velocity', velocity_bin),
245
+ event_codec.Event('pitch', value.pitch)]
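For instance (illustrative only, using a codec constructed the same way as in note_sequences_test.py further down), a note-on that carries a program expands to a program/velocity/pitch triple and records the pitch as active:

import note_seq
from mt3 import event_codec, note_sequences

codec = event_codec.Codec(
    max_shift_steps=100, steps_per_second=100,
    event_ranges=[
        event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
                               note_seq.MAX_MIDI_PITCH),
        event_codec.EventRange('velocity', 0, 127),
        event_codec.EventRange('drum', note_seq.MIN_MIDI_PITCH,
                               note_seq.MAX_MIDI_PITCH),
        event_codec.EventRange('program', note_seq.MIN_MIDI_PROGRAM,
                               note_seq.MAX_MIDI_PROGRAM),
        event_codec.EventRange('tie', 0, 0)])

state = note_sequences.NoteEncodingState()
value = note_sequences.NoteEventData(pitch=60, velocity=100, program=40, is_drum=False)
events = note_sequences.note_event_data_to_events(state, value, codec)
print([(e.type, e.value) for e in events])
# [('program', 40), ('velocity', <bin for velocity 100>), ('pitch', 60)]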
246
+
247
+
248
+ def note_encoding_state_to_events(
249
+ state: NoteEncodingState
250
+ ) -> Sequence[event_codec.Event]:
251
+ """Output program and pitch events for active notes plus a final tie event."""
252
+ events = []
253
+ for pitch, program in sorted(
254
+ state.active_pitches.keys(), key=lambda k: k[::-1]):
255
+ if state.active_pitches[(pitch, program)]:
256
+ events += [event_codec.Event('program', program),
257
+ event_codec.Event('pitch', pitch)]
258
+ events.append(event_codec.Event('tie', 0))
259
+ return events
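Continuing the illustrative snippet above, the tie section emitted for that state is just the still-active program/pitch pairs followed by a single tie event:

tie_events = note_sequences.note_encoding_state_to_events(state)
print([(e.type, e.value) for e in tie_events])
# [('program', 40), ('pitch', 60), ('tie', 0)]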
260
+
261
+
262
+ @dataclasses.dataclass
263
+ class NoteDecodingState:
264
+ """Decoding state for note transcription."""
265
+ current_time: float = 0.0
266
+ # velocity to apply to subsequent pitch events (zero for note-off)
267
+ current_velocity: int = DEFAULT_VELOCITY
268
+ # program to apply to subsequent pitch events
269
+ current_program: int = 0
270
+ # onset time and velocity for active pitches and programs
271
+ active_pitches: MutableMapping[Tuple[int, int],
272
+ Tuple[float, int]] = dataclasses.field(
273
+ default_factory=dict)
274
+ # pitches (with programs) to continue from previous segment
275
+ tied_pitches: MutableSet[Tuple[int, int]] = dataclasses.field(
276
+ default_factory=set)
277
+ # whether or not we are in the tie section at the beginning of a segment
278
+ is_tie_section: bool = False
279
+ # partially-decoded NoteSequence
280
+ note_sequence: note_seq.NoteSequence = dataclasses.field(
281
+ default_factory=lambda: note_seq.NoteSequence(ticks_per_quarter=220))
282
+
283
+
284
+ def decode_note_onset_event(
285
+ state: NoteDecodingState,
286
+ time: float,
287
+ event: event_codec.Event,
288
+ codec: event_codec.Codec,
289
+ ) -> None:
290
+ """Process note onset event and update decoding state."""
291
+ if event.type == 'pitch':
292
+ state.note_sequence.notes.add(
293
+ start_time=time, end_time=time + DEFAULT_NOTE_DURATION,
294
+ pitch=event.value, velocity=DEFAULT_VELOCITY)
295
+ state.note_sequence.total_time = max(state.note_sequence.total_time,
296
+ time + DEFAULT_NOTE_DURATION)
297
+ else:
298
+ raise ValueError('unexpected event type: %s' % event.type)
299
+
300
+
301
+ def _add_note_to_sequence(
302
+ ns: note_seq.NoteSequence,
303
+ start_time: float, end_time: float, pitch: int, velocity: int,
304
+ program: int = 0, is_drum: bool = False
305
+ ) -> None:
306
+ end_time = max(end_time, start_time + MIN_NOTE_DURATION)
307
+ ns.notes.add(
308
+ start_time=start_time, end_time=end_time,
309
+ pitch=pitch, velocity=velocity, program=program, is_drum=is_drum)
310
+ ns.total_time = max(ns.total_time, end_time)
311
+
312
+
313
+ def decode_note_event(
314
+ state: NoteDecodingState,
315
+ time: float,
316
+ event: event_codec.Event,
317
+ codec: event_codec.Codec
318
+ ) -> None:
319
+ """Process note event and update decoding state."""
320
+ if time < state.current_time:
321
+ raise ValueError('event time < current time, %f < %f' % (
322
+ time, state.current_time))
323
+ state.current_time = time
324
+ if event.type == 'pitch':
325
+ pitch = event.value
326
+ if state.is_tie_section:
327
+ # "tied" pitch
328
+ if (pitch, state.current_program) not in state.active_pitches:
329
+ raise ValueError('inactive pitch/program in tie section: %d/%d' %
330
+ (pitch, state.current_program))
331
+ if (pitch, state.current_program) in state.tied_pitches:
332
+ raise ValueError('pitch/program is already tied: %d/%d' %
333
+ (pitch, state.current_program))
334
+ state.tied_pitches.add((pitch, state.current_program))
335
+ elif state.current_velocity == 0:
336
+ # note offset
337
+ if (pitch, state.current_program) not in state.active_pitches:
338
+ raise ValueError('note-off for inactive pitch/program: %d/%d' %
339
+ (pitch, state.current_program))
340
+ onset_time, onset_velocity = state.active_pitches.pop(
341
+ (pitch, state.current_program))
342
+ _add_note_to_sequence(
343
+ state.note_sequence, start_time=onset_time, end_time=time,
344
+ pitch=pitch, velocity=onset_velocity, program=state.current_program)
345
+ else:
346
+ # note onset
347
+ if (pitch, state.current_program) in state.active_pitches:
348
+ # The pitch is already active; this shouldn't really happen but we'll
349
+ # try to handle it gracefully by ending the previous note and starting a
350
+ # new one.
351
+ onset_time, onset_velocity = state.active_pitches.pop(
352
+ (pitch, state.current_program))
353
+ _add_note_to_sequence(
354
+ state.note_sequence, start_time=onset_time, end_time=time,
355
+ pitch=pitch, velocity=onset_velocity, program=state.current_program)
356
+ state.active_pitches[(pitch, state.current_program)] = (
357
+ time, state.current_velocity)
358
+ elif event.type == 'drum':
359
+ # drum onset (drums have no offset)
360
+ if state.current_velocity == 0:
361
+ raise ValueError('velocity cannot be zero for drum event')
362
+ offset_time = time + DEFAULT_NOTE_DURATION
363
+ _add_note_to_sequence(
364
+ state.note_sequence, start_time=time, end_time=offset_time,
365
+ pitch=event.value, velocity=state.current_velocity, is_drum=True)
366
+ elif event.type == 'velocity':
367
+ # velocity change
368
+ num_velocity_bins = vocabularies.num_velocity_bins_from_codec(codec)
369
+ velocity = vocabularies.bin_to_velocity(event.value, num_velocity_bins)
370
+ state.current_velocity = velocity
371
+ elif event.type == 'program':
372
+ # program change
373
+ state.current_program = event.value
374
+ elif event.type == 'tie':
375
+ # end of tie section; end active notes that weren't declared tied
376
+ if not state.is_tie_section:
377
+ raise ValueError('tie section end event when not in tie section')
378
+ for (pitch, program) in list(state.active_pitches.keys()):
379
+ if (pitch, program) not in state.tied_pitches:
380
+ onset_time, onset_velocity = state.active_pitches.pop((pitch, program))
381
+ _add_note_to_sequence(
382
+ state.note_sequence,
383
+ start_time=onset_time, end_time=state.current_time,
384
+ pitch=pitch, velocity=onset_velocity, program=program)
385
+ state.is_tie_section = False
386
+ else:
387
+ raise ValueError('unexpected event type: %s' % event.type)
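A minimal decoding sketch (reusing the illustrative codec and imports from above): a velocity event sets the running velocity, the following pitch event opens a note, and flushing closes it with at least the minimum duration.

from mt3 import vocabularies

state = note_sequences.NoteDecodingState()
num_bins = vocabularies.num_velocity_bins_from_codec(codec)
vel_bin = vocabularies.velocity_to_bin(100, num_bins)
note_sequences.decode_note_event(state, 0.0, event_codec.Event('velocity', vel_bin), codec)
note_sequences.decode_note_event(state, 0.0, event_codec.Event('pitch', 60), codec)
ns = note_sequences.flush_note_decoding_state(state)
# ns holds one pitch-60 note starting at 0.0, lasting MIN_NOTE_DURATION,
# at the de-quantized velocity corresponding to vel_bin.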
388
+
389
+
390
+ def begin_tied_pitches_section(state: NoteDecodingState) -> None:
391
+ """Begin the tied pitches section at the start of a segment."""
392
+ state.tied_pitches = set()
393
+ state.is_tie_section = True
394
+
395
+
396
+ def flush_note_decoding_state(
397
+ state: NoteDecodingState
398
+ ) -> note_seq.NoteSequence:
399
+ """End all active notes and return resulting NoteSequence."""
400
+ for onset_time, _ in state.active_pitches.values():
401
+ state.current_time = max(state.current_time, onset_time + MIN_NOTE_DURATION)
402
+ for (pitch, program) in list(state.active_pitches.keys()):
403
+ onset_time, onset_velocity = state.active_pitches.pop((pitch, program))
404
+ _add_note_to_sequence(
405
+ state.note_sequence, start_time=onset_time, end_time=state.current_time,
406
+ pitch=pitch, velocity=onset_velocity, program=program)
407
+ assign_instruments(state.note_sequence)
408
+ return state.note_sequence
409
+
410
+
411
+ class NoteEncodingSpecType(run_length_encoding.EventEncodingSpec):
412
+ pass
413
+
414
+
415
+ # encoding spec for modeling note onsets only
416
+ NoteOnsetEncodingSpec = NoteEncodingSpecType(
417
+ init_encoding_state_fn=lambda: None,
418
+ encode_event_fn=note_event_data_to_events,
419
+ encoding_state_to_events_fn=None,
420
+ init_decoding_state_fn=NoteDecodingState,
421
+ begin_decoding_segment_fn=lambda state: None,
422
+ decode_event_fn=decode_note_onset_event,
423
+ flush_decoding_state_fn=lambda state: state.note_sequence)
424
+
425
+
426
+ # encoding spec for modeling onsets and offsets
427
+ NoteEncodingSpec = NoteEncodingSpecType(
428
+ init_encoding_state_fn=lambda: None,
429
+ encode_event_fn=note_event_data_to_events,
430
+ encoding_state_to_events_fn=None,
431
+ init_decoding_state_fn=NoteDecodingState,
432
+ begin_decoding_segment_fn=lambda state: None,
433
+ decode_event_fn=decode_note_event,
434
+ flush_decoding_state_fn=flush_note_decoding_state)
435
+
436
+
437
+ # encoding spec for modeling onsets and offsets, with a "tie" section at the
438
+ # beginning of each segment listing already-active notes
439
+ NoteEncodingWithTiesSpec = NoteEncodingSpecType(
440
+ init_encoding_state_fn=NoteEncodingState,
441
+ encode_event_fn=note_event_data_to_events,
442
+ encoding_state_to_events_fn=note_encoding_state_to_events,
443
+ init_decoding_state_fn=NoteDecodingState,
444
+ begin_decoding_segment_fn=begin_tied_pitches_section,
445
+ decode_event_fn=decode_note_event,
446
+ flush_decoding_state_fn=flush_note_decoding_state)
mt3/note_sequences_test.py ADDED
@@ -0,0 +1,505 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for note_sequences."""
16
+
17
+ from mt3 import event_codec
18
+ from mt3 import note_sequences
19
+ from mt3 import run_length_encoding
20
+
21
+ import note_seq
22
+ import numpy as np
23
+ import tensorflow as tf
24
+
25
+ codec = event_codec.Codec(
26
+ max_shift_steps=100,
27
+ steps_per_second=100,
28
+ event_ranges=[
29
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
30
+ note_seq.MAX_MIDI_PITCH),
31
+ event_codec.EventRange('velocity', 0, 127),
32
+ event_codec.EventRange('drum', note_seq.MIN_MIDI_PITCH,
33
+ note_seq.MAX_MIDI_PITCH),
34
+ event_codec.EventRange('program', note_seq.MIN_MIDI_PROGRAM,
35
+ note_seq.MAX_MIDI_PROGRAM),
36
+ event_codec.EventRange('tie', 0, 0)
37
+ ])
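As a reading aid for the expected token ids in the tests below, the codec lays out its vocabulary contiguously; the layout implied by the expected values is:

# 1..100    'shift' events (0.01 s each at 100 steps/second)
# 101..228  'pitch' 0..127    (pitch 60 -> 161, pitch 61 -> 162)
# 229..356  'velocity' 0..127 (229 = note-off, 356 = velocity 127)
# 357..484  'drum' 0..127     (drum pitch 37 -> 394)
# 485..612  'program' 0..127  (program 40 -> 525)
# 613       'tie'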
38
+
39
+
40
+ class RunLengthEncodingTest(tf.test.TestCase):
41
+
42
+ def test_encode_and_index_note_sequence(self):
43
+ ns = note_seq.NoteSequence()
44
+ ns.notes.add(start_time=1.0,
45
+ end_time=1.1,
46
+ pitch=61,
47
+ velocity=100)
48
+ ns.notes.add(start_time=2.0,
49
+ end_time=2.1,
50
+ pitch=62,
51
+ velocity=100)
52
+ ns.notes.add(start_time=3.0,
53
+ end_time=3.1,
54
+ pitch=63,
55
+ velocity=100)
56
+ ns.total_time = ns.notes[-1].end_time
57
+
58
+ frame_times = np.arange(0, 4, step=.001)
59
+
60
+ event_times, event_values = note_sequences.note_sequence_to_onsets(ns)
61
+ events, event_start_indices, event_end_indices, _, _ = run_length_encoding.encode_and_index_events(
62
+ state=None, event_times=event_times, event_values=event_values,
63
+ encode_event_fn=note_sequences.note_event_data_to_events,
64
+ codec=codec, frame_times=frame_times)
65
+
66
+ self.assertEqual(len(frame_times), len(event_start_indices))
67
+ self.assertEqual(len(frame_times), len(event_end_indices))
68
+ self.assertLen(events, 403)
69
+ expected_events = ([1] * 100 +
70
+ [162] +
71
+ [1] * 100 +
72
+ [163] +
73
+ [1] * 100 +
74
+ [164] +
75
+ [1] * 100)
76
+ np.testing.assert_array_equal(expected_events, events)
77
+
78
+ self.assertEqual(event_start_indices[0], 0)
79
+ self.assertEqual(event_end_indices[0], 0)
80
+
81
+ self.assertEqual(162, events[100])
82
+ self.assertEqual(1.0, frame_times[1000])
83
+ self.assertEqual(event_start_indices[1000], 100)
84
+ self.assertEqual(event_end_indices[1000], 100)
85
+
86
+ self.assertEqual(163, events[201])
87
+ self.assertEqual(2.0, frame_times[2000])
88
+ self.assertEqual(event_start_indices[2000], 201)
89
+ self.assertEqual(event_end_indices[2000], 201)
90
+
91
+ self.assertEqual(164, events[302])
92
+ self.assertEqual(3.0, frame_times[3000])
93
+ self.assertEqual(event_start_indices[3000], 302)
94
+ self.assertEqual(event_end_indices[3000], 302)
95
+
96
+ self.assertEqual(1, events[-1])
97
+ self.assertEqual(3.999, frame_times[-1])
98
+ self.assertEqual(event_start_indices[-1], 402)
99
+ self.assertEqual(event_end_indices[-1], len(expected_events))
100
+
101
+ def test_encode_and_index_note_sequence_velocity(self):
102
+ ns = note_seq.NoteSequence()
103
+ ns.notes.add(start_time=1.0,
104
+ end_time=3.0,
105
+ pitch=61,
106
+ velocity=1)
107
+ ns.notes.add(start_time=2.0,
108
+ end_time=4.0,
109
+ pitch=62,
110
+ velocity=127)
111
+ ns.total_time = ns.notes[-1].end_time
112
+
113
+ frame_times = np.arange(0, 4, step=.001)
114
+
115
+ event_times, event_values = (
116
+ note_sequences.note_sequence_to_onsets_and_offsets(ns))
117
+ events, event_start_indices, event_end_indices, _, _ = run_length_encoding.encode_and_index_events(
118
+ state=None, event_times=event_times, event_values=event_values,
119
+ encode_event_fn=note_sequences.note_event_data_to_events,
120
+ codec=codec, frame_times=frame_times)
121
+
122
+ self.assertEqual(len(frame_times), len(event_start_indices))
123
+ self.assertEqual(len(frame_times), len(event_end_indices))
124
+ self.assertLen(events, 408)
125
+ expected_events = ([1] * 100 +
126
+ [230, 162] +
127
+ [1] * 100 +
128
+ [356, 163] +
129
+ [1] * 100 +
130
+ [229, 162] +
131
+ [1] * 100 +
132
+ [229, 163])
133
+ np.testing.assert_array_equal(expected_events, events)
134
+
135
+ self.assertEqual(event_start_indices[0], 0)
136
+ self.assertEqual(event_end_indices[0], 0)
137
+
138
+ self.assertEqual(230, events[100])
139
+ self.assertEqual(162, events[101])
140
+ self.assertEqual(1.0, frame_times[1000])
141
+ self.assertEqual(event_start_indices[1000], 100)
142
+ self.assertEqual(event_end_indices[1000], 100)
143
+
144
+ self.assertEqual(356, events[202])
145
+ self.assertEqual(163, events[203])
146
+ self.assertEqual(2.0, frame_times[2000])
147
+ self.assertEqual(event_start_indices[2000], 202)
148
+ self.assertEqual(event_end_indices[2000], 202)
149
+
150
+ self.assertEqual(229, events[304])
151
+ self.assertEqual(162, events[305])
152
+ self.assertEqual(3.0, frame_times[3000])
153
+ self.assertEqual(event_start_indices[3000], 304)
154
+ self.assertEqual(event_end_indices[3000], 304)
155
+
156
+ self.assertEqual(229, events[406])
157
+ self.assertEqual(163, events[407])
158
+ self.assertEqual(3.999, frame_times[-1])
159
+ self.assertEqual(event_start_indices[-1], 405)
160
+ self.assertEqual(event_end_indices[-1], len(expected_events))
161
+
162
+ def test_encode_and_index_note_sequence_multitrack(self):
163
+ ns = note_seq.NoteSequence()
164
+ ns.notes.add(start_time=0.0,
165
+ end_time=1.0,
166
+ pitch=37,
167
+ velocity=127,
168
+ is_drum=True)
169
+ ns.notes.add(start_time=1.0,
170
+ end_time=3.0,
171
+ pitch=61,
172
+ velocity=127,
173
+ program=0)
174
+ ns.notes.add(start_time=2.0,
175
+ end_time=4.0,
176
+ pitch=62,
177
+ velocity=127,
178
+ program=40)
179
+ ns.total_time = ns.notes[-1].end_time
180
+
181
+ frame_times = np.arange(0, 4, step=.001)
182
+
183
+ event_times, event_values = (
184
+ note_sequences.note_sequence_to_onsets_and_offsets_and_programs(ns))
185
+ (tokens, event_start_indices, event_end_indices, state_tokens,
186
+ state_event_indices) = run_length_encoding.encode_and_index_events(
187
+ state=note_sequences.NoteEncodingState(),
188
+ event_times=event_times, event_values=event_values,
189
+ encode_event_fn=note_sequences.note_event_data_to_events,
190
+ codec=codec, frame_times=frame_times,
191
+ encoding_state_to_events_fn=(
192
+ note_sequences.note_encoding_state_to_events))
193
+
194
+ self.assertEqual(len(frame_times), len(event_start_indices))
195
+ self.assertEqual(len(frame_times), len(event_end_indices))
196
+ self.assertEqual(len(frame_times), len(state_event_indices))
197
+ self.assertLen(tokens, 414)
198
+
199
+ expected_events = (
200
+ [event_codec.Event('velocity', 127), event_codec.Event('drum', 37)] +
201
+ [event_codec.Event('shift', 1)] * 100 +
202
+ [event_codec.Event('program', 0),
203
+ event_codec.Event('velocity', 127), event_codec.Event('pitch', 61)] +
204
+ [event_codec.Event('shift', 1)] * 100 +
205
+ [event_codec.Event('program', 40),
206
+ event_codec.Event('velocity', 127), event_codec.Event('pitch', 62)] +
207
+ [event_codec.Event('shift', 1)] * 100 +
208
+ [event_codec.Event('program', 0),
209
+ event_codec.Event('velocity', 0), event_codec.Event('pitch', 61)] +
210
+ [event_codec.Event('shift', 1)] * 100 +
211
+ [event_codec.Event('program', 40),
212
+ event_codec.Event('velocity', 0), event_codec.Event('pitch', 62)])
213
+ expected_tokens = [codec.encode_event(e) for e in expected_events]
214
+ np.testing.assert_array_equal(expected_tokens, tokens)
215
+
216
+ expected_state_events = [
217
+ event_codec.Event('tie', 0), # state prior to first drum
218
+ event_codec.Event('tie', 0), # state prior to first onset
219
+ event_codec.Event('program', 0), # state prior to second onset
220
+ event_codec.Event('pitch', 61), # |
221
+ event_codec.Event('tie', 0), # |
222
+ event_codec.Event('program', 0), # state prior to first offset
223
+ event_codec.Event('pitch', 61), # |
224
+ event_codec.Event('program', 40), # |
225
+ event_codec.Event('pitch', 62), # |
226
+ event_codec.Event('tie', 0), # |
227
+ event_codec.Event('program', 40), # state prior to second offset
228
+ event_codec.Event('pitch', 62), # |
229
+ event_codec.Event('tie', 0) # |
230
+ ]
231
+ expected_state_tokens = [codec.encode_event(e)
232
+ for e in expected_state_events]
233
+ np.testing.assert_array_equal(expected_state_tokens, state_tokens)
234
+
235
+ self.assertEqual(event_start_indices[0], 0)
236
+ self.assertEqual(event_end_indices[0], 0)
237
+ self.assertEqual(state_event_indices[0], 0)
238
+
239
+ self.assertEqual(1.0, frame_times[1000])
240
+ self.assertEqual(event_start_indices[1000], 102)
241
+ self.assertEqual(event_end_indices[1000], 102)
242
+ self.assertEqual(state_event_indices[1000], 1)
243
+
244
+ self.assertEqual(2.0, frame_times[2000])
245
+ self.assertEqual(event_start_indices[2000], 205)
246
+ self.assertEqual(event_end_indices[2000], 205)
247
+ self.assertEqual(state_event_indices[2000], 2)
248
+
249
+ self.assertEqual(3.0, frame_times[3000])
250
+ self.assertEqual(event_start_indices[3000], 308)
251
+ self.assertEqual(event_end_indices[3000], 308)
252
+ self.assertEqual(state_event_indices[3000], 5)
253
+
254
+ self.assertEqual(3.999, frame_times[-1])
255
+ self.assertEqual(event_start_indices[-1], 410)
256
+ self.assertEqual(event_end_indices[-1], len(expected_events))
257
+ self.assertEqual(state_event_indices[-1], 10)
258
+
259
+ def test_encode_and_index_note_sequence_last_token_alignment(self):
260
+ ns = note_seq.NoteSequence()
261
+ ns.notes.add(start_time=0.0,
262
+ end_time=0.1,
263
+ pitch=60,
264
+ velocity=100)
265
+ ns.total_time = ns.notes[-1].end_time
266
+
267
+ frame_times = np.arange(0, 1.008, step=.008)
268
+
269
+ event_times, event_values = note_sequences.note_sequence_to_onsets(ns)
270
+ events, event_start_indices, event_end_indices, _, _ = run_length_encoding.encode_and_index_events(
271
+ state=None,
272
+ event_times=event_times,
273
+ event_values=event_values,
274
+ encode_event_fn=note_sequences.note_event_data_to_events,
275
+ codec=codec,
276
+ frame_times=frame_times)
277
+
278
+ self.assertEqual(len(frame_times), len(event_start_indices))
279
+ self.assertEqual(len(frame_times), len(event_end_indices))
280
+ self.assertLen(events, 102)
281
+ expected_events = [161] + [1] * 101
282
+
283
+ np.testing.assert_array_equal(expected_events, events)
284
+
285
+ self.assertEqual(event_start_indices[0], 0)
286
+ self.assertEqual(event_end_indices[0], 0)
287
+ self.assertEqual(event_start_indices[125], 101)
288
+ self.assertEqual(event_end_indices[125], 102)
289
+
290
+ def test_decode_note_sequence_events(self):
291
+ events = [25, 161, 50, 162]
292
+
293
+ decoding_state = note_sequences.NoteDecodingState()
294
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
295
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
296
+ codec=codec, decode_event_fn=note_sequences.decode_note_onset_event)
297
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
298
+
299
+ self.assertEqual(0, invalid_ids)
300
+ self.assertEqual(0, dropped_events)
301
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
302
+ expected_ns.notes.add(
303
+ pitch=60,
304
+ velocity=100,
305
+ start_time=0.25,
306
+ end_time=0.26)
307
+ expected_ns.notes.add(
308
+ pitch=61,
309
+ velocity=100,
310
+ start_time=0.50,
311
+ end_time=0.51)
312
+ expected_ns.total_time = 0.51
313
+ self.assertProtoEquals(expected_ns, ns)
314
+
315
+ def test_decode_note_sequence_events_onsets_only(self):
316
+ events = [5, 161, 25, 162]
317
+
318
+ decoding_state = note_sequences.NoteDecodingState()
319
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
320
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
321
+ codec=codec, decode_event_fn=note_sequences.decode_note_onset_event)
322
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
323
+
324
+ self.assertEqual(0, invalid_ids)
325
+ self.assertEqual(0, dropped_events)
326
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
327
+ expected_ns.notes.add(
328
+ pitch=60,
329
+ velocity=100,
330
+ start_time=0.05,
331
+ end_time=0.06)
332
+ expected_ns.notes.add(
333
+ pitch=61,
334
+ velocity=100,
335
+ start_time=0.25,
336
+ end_time=0.26)
337
+ expected_ns.total_time = 0.26
338
+ self.assertProtoEquals(expected_ns, ns)
339
+
340
+ def test_decode_note_sequence_events_velocity(self):
341
+ events = [5, 356, 161, 25, 229, 161]
342
+
343
+ decoding_state = note_sequences.NoteDecodingState()
344
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
345
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
346
+ codec=codec, decode_event_fn=note_sequences.decode_note_event)
347
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
348
+
349
+ self.assertEqual(0, invalid_ids)
350
+ self.assertEqual(0, dropped_events)
351
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
352
+ expected_ns.notes.add(
353
+ pitch=60,
354
+ velocity=127,
355
+ start_time=0.05,
356
+ end_time=0.25)
357
+ expected_ns.total_time = 0.25
358
+ self.assertProtoEquals(expected_ns, ns)
359
+
360
+ def test_decode_note_sequence_events_missing_offset(self):
361
+ events = [5, 356, 161, 10, 161, 25, 229, 161]
362
+
363
+ decoding_state = note_sequences.NoteDecodingState()
364
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
365
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
366
+ codec=codec, decode_event_fn=note_sequences.decode_note_event)
367
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
368
+
369
+ self.assertEqual(0, invalid_ids)
370
+ self.assertEqual(0, dropped_events)
371
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
372
+ expected_ns.notes.add(
373
+ pitch=60,
374
+ velocity=127,
375
+ start_time=0.05,
376
+ end_time=0.10)
377
+ expected_ns.notes.add(
378
+ pitch=60,
379
+ velocity=127,
380
+ start_time=0.10,
381
+ end_time=0.25)
382
+ expected_ns.total_time = 0.25
383
+ self.assertProtoEquals(expected_ns, ns)
384
+
385
+ def test_decode_note_sequence_events_multitrack(self):
386
+ events = [5, 525, 356, 161, 15, 356, 394, 25, 525, 229, 161]
387
+
388
+ decoding_state = note_sequences.NoteDecodingState()
389
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
390
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
391
+ codec=codec, decode_event_fn=note_sequences.decode_note_event)
392
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
393
+
394
+ self.assertEqual(0, invalid_ids)
395
+ self.assertEqual(0, dropped_events)
396
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
397
+ expected_ns.notes.add(
398
+ pitch=37,
399
+ velocity=127,
400
+ start_time=0.15,
401
+ end_time=0.16,
402
+ instrument=9,
403
+ is_drum=True)
404
+ expected_ns.notes.add(
405
+ pitch=60,
406
+ velocity=127,
407
+ start_time=0.05,
408
+ end_time=0.25,
409
+ program=40)
410
+ expected_ns.total_time = 0.25
411
+ self.assertProtoEquals(expected_ns, ns)
412
+
413
+ def test_decode_note_sequence_events_invalid_tokens(self):
414
+ events = [5, -1, 161, -2, 25, 162, 9999]
415
+
416
+ decoding_state = note_sequences.NoteDecodingState()
417
+ invalid_events, dropped_events = run_length_encoding.decode_events(
418
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
419
+ codec=codec, decode_event_fn=note_sequences.decode_note_onset_event)
420
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
421
+
422
+ self.assertEqual(3, invalid_events)
423
+ self.assertEqual(0, dropped_events)
424
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
425
+ expected_ns.notes.add(
426
+ pitch=60,
427
+ velocity=100,
428
+ start_time=0.05,
429
+ end_time=0.06)
430
+ expected_ns.notes.add(
431
+ pitch=61,
432
+ velocity=100,
433
+ start_time=0.25,
434
+ end_time=0.26)
435
+ expected_ns.total_time = 0.26
436
+ self.assertProtoEquals(expected_ns, ns)
437
+
438
+ def test_decode_note_sequence_events_allow_event_at_exactly_max_time(self):
439
+ events = [161, 25, 162]
440
+
441
+ decoding_state = note_sequences.NoteDecodingState()
442
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
443
+ state=decoding_state, tokens=events, start_time=1.0, max_time=1.25,
444
+ codec=codec, decode_event_fn=note_sequences.decode_note_onset_event)
445
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
446
+
447
+ self.assertEqual(0, invalid_ids)
448
+ self.assertEqual(0, dropped_events)
449
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
450
+ expected_ns.notes.add(
451
+ pitch=60,
452
+ velocity=100,
453
+ start_time=1.00,
454
+ end_time=1.01)
455
+ expected_ns.notes.add(
456
+ pitch=61,
457
+ velocity=100,
458
+ start_time=1.25,
459
+ end_time=1.26)
460
+ expected_ns.total_time = 1.26
461
+ self.assertProtoEquals(expected_ns, ns)
462
+
463
+ def test_decode_note_sequence_events_dropped_events(self):
464
+ events = [5, 161, 30, 162]
465
+
466
+ decoding_state = note_sequences.NoteDecodingState()
467
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
468
+ state=decoding_state, tokens=events, start_time=1.0, max_time=1.25,
469
+ codec=codec, decode_event_fn=note_sequences.decode_note_onset_event)
470
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
471
+
472
+ self.assertEqual(0, invalid_ids)
473
+ self.assertEqual(2, dropped_events)
474
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
475
+ expected_ns.notes.add(
476
+ pitch=60,
477
+ velocity=100,
478
+ start_time=1.05,
479
+ end_time=1.06)
480
+ expected_ns.total_time = 1.06
481
+ self.assertProtoEquals(expected_ns, ns)
482
+
483
+ def test_decode_note_sequence_events_invalid_events(self):
484
+ events = [25, 230, 50, 161]
485
+
486
+ decoding_state = note_sequences.NoteDecodingState()
487
+ invalid_ids, dropped_events = run_length_encoding.decode_events(
488
+ state=decoding_state, tokens=events, start_time=0, max_time=None,
489
+ codec=codec, decode_event_fn=note_sequences.decode_note_onset_event)
490
+ ns = note_sequences.flush_note_decoding_state(decoding_state)
491
+
492
+ self.assertEqual(1, invalid_ids)
493
+ self.assertEqual(0, dropped_events)
494
+ expected_ns = note_seq.NoteSequence(ticks_per_quarter=220)
495
+ expected_ns.notes.add(
496
+ pitch=60,
497
+ velocity=100,
498
+ start_time=0.50,
499
+ end_time=0.51)
500
+ expected_ns.total_time = 0.51
501
+ self.assertProtoEquals(expected_ns, ns)
502
+
503
+
504
+ if __name__ == '__main__':
505
+ tf.test.main()
mt3/preprocessors.py ADDED
@@ -0,0 +1,669 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Transcription preprocessors."""
16
+
17
+ from typing import Any, Callable, Mapping, Optional, Sequence, Tuple
18
+
19
+ from absl import logging
20
+ import gin
21
+ from immutabledict import immutabledict
22
+ import librosa
23
+
24
+ from mt3 import event_codec
25
+ from mt3 import note_sequences
26
+ from mt3 import run_length_encoding
27
+ from mt3 import spectrograms
28
+ from mt3 import vocabularies
29
+
30
+ import note_seq
31
+ import numpy as np
32
+ import seqio
33
+ import tensorflow as tf
34
+
35
+
36
+ def add_unique_id(ds: tf.data.Dataset) -> tf.data.Dataset:
37
+ """Add unique integer ID to each example in a dataset."""
38
+ def add_id_field(i, ex):
39
+ ex['unique_id'] = [i]
40
+ return ex
41
+ return ds.enumerate().map(
42
+ add_id_field, num_parallel_calls=tf.data.experimental.AUTOTUNE)
43
+
44
+
45
+ @seqio.map_over_dataset
46
+ def pad_notesequence_array(ex):
47
+ """Pad the NoteSequence array so that it can later be "split"."""
48
+ ex['sequence'] = tf.pad(tf.expand_dims(ex['sequence'], 0),
49
+ [[0, len(ex['input_times']) - 1]])
50
+ return ex
51
+
52
+
53
+ @seqio.map_over_dataset
54
+ def add_dummy_targets(ex):
55
+ """Add dummy targets; used in eval when targets are not actually used."""
56
+ ex['targets'] = np.array([], dtype=np.int32)
57
+ return ex
58
+
59
+
60
+ def _audio_to_frames(
61
+ samples: Sequence[float],
62
+ spectrogram_config: spectrograms.SpectrogramConfig,
63
+ ) -> Tuple[Sequence[Sequence[int]], np.ndarray]:
64
+ """Convert audio samples to non-overlapping frames and frame times."""
65
+ frame_size = spectrogram_config.hop_width
66
+ logging.info('Padding %d samples to multiple of %d', len(samples), frame_size)
67
+ samples = np.pad(samples,
68
+ [0, frame_size - len(samples) % frame_size],
69
+ mode='constant')
70
+
71
+ frames = spectrograms.split_audio(samples, spectrogram_config)
72
+
73
+ num_frames = len(samples) // frame_size
74
+ logging.info('Encoded %d samples to %d frames (%d samples each)',
75
+ len(samples), num_frames, frame_size)
76
+
77
+ times = np.arange(num_frames) / spectrogram_config.frames_per_second
78
+ return frames, times
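The framing arithmetic, with hypothetical numbers (the hop width and sample rate below are placeholders for this sketch, not necessarily the configured MT3 defaults):

import numpy as np

hop_width = 128           # samples per frame (hypothetical)
sample_rate = 16000       # Hz (hypothetical)
frames_per_second = sample_rate / hop_width   # 125.0

samples = np.zeros(16000)                     # 1.0 s of audio
pad = hop_width - len(samples) % hop_width    # 128: a full extra frame gets padded on
samples = np.pad(samples, [0, pad], mode='constant')
num_frames = len(samples) // hop_width        # 126
times = np.arange(num_frames) / frames_per_second  # 0.0, 0.008, ..., 1.0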
79
+
80
+
81
+ def _include_inputs(ds, input_record, fields_to_omit=('audio',)):
82
+ """Include fields from input record (other than audio) in dataset records."""
83
+ def include_inputs_fn(output_record):
84
+ for key in set(input_record.keys()) - set(output_record.keys()):
85
+ output_record[key] = input_record[key]
86
+ for key in fields_to_omit:
87
+ del output_record[key]
88
+ return output_record
89
+ return ds.map(include_inputs_fn,
90
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
91
+
92
+
93
+ def tokenize_transcription_example(
94
+ ds: tf.data.Dataset, spectrogram_config: spectrograms.SpectrogramConfig,
95
+ codec: event_codec.Codec, is_training_data: bool,
96
+ onsets_only: bool, include_ties: bool, audio_is_samples: bool,
97
+ id_feature_key: Optional[str] = None
98
+ ) -> tf.data.Dataset:
99
+ """Tokenize a note transcription example for run-length encoding.
100
+
101
+ Outputs include:
102
+ inputs: audio sample frames, num_frames-by-frame_size
103
+ input_times: timestamp for each frame
104
+ targets: symbolic sequence of note-related events
105
+ input_event_start_indices: start target index for every input index
106
+ input_event_end_indices: end target index for every input index
107
+
108
+ Args:
109
+ ds: Input dataset.
110
+ spectrogram_config: Spectrogram configuration.
111
+ codec: Event vocabulary codec.
112
+ is_training_data: Unused.
113
+ onsets_only: If True, include only onset events (not offset, velocity, or
114
+ program).
115
+ include_ties: If True, also write state events containing active notes to
116
+ support a "tie" section after run-length encoding.
117
+ audio_is_samples: If True, audio is floating-point samples instead of
118
+ serialized WAV.
119
+ id_feature_key: If not None, replace sequence ID with specified key field
120
+ from the dataset.
121
+
122
+ Returns:
123
+ Dataset with the outputs described above.
124
+ """
125
+ del is_training_data
126
+
127
+ if onsets_only and include_ties:
128
+ raise ValueError('Ties not supported when only modeling onsets.')
129
+
130
+ def tokenize(sequence, audio, sample_rate, example_id=None):
131
+ ns = note_seq.NoteSequence.FromString(sequence)
132
+ note_sequences.validate_note_sequence(ns)
133
+
134
+ if example_id is not None:
135
+ ns.id = example_id
136
+
137
+ if audio_is_samples:
138
+ samples = audio
139
+ if sample_rate != spectrogram_config.sample_rate:
140
+ samples = librosa.resample(
141
+ samples, sample_rate, spectrogram_config.sample_rate)
142
+ else:
143
+ samples = note_seq.audio_io.wav_data_to_samples_librosa(
144
+ audio, sample_rate=spectrogram_config.sample_rate)
145
+
146
+ logging.info('Got samples for %s::%s with length %d',
147
+ ns.id, ns.filename, len(samples))
148
+
149
+ frames, frame_times = _audio_to_frames(samples, spectrogram_config)
150
+
151
+ if onsets_only:
152
+ times, values = note_sequences.note_sequence_to_onsets(ns)
153
+ else:
154
+ ns = note_seq.apply_sustain_control_changes(ns)
155
+ times, values = (
156
+ note_sequences.note_sequence_to_onsets_and_offsets_and_programs(ns))
157
+
158
+ # The original NoteSequence can have a lot of control changes we don't need;
159
+ # delete them.
160
+ del ns.control_changes[:]
161
+
162
+ (events, event_start_indices, event_end_indices,
163
+ state_events, state_event_indices) = (
164
+ run_length_encoding.encode_and_index_events(
165
+ state=note_sequences.NoteEncodingState() if include_ties else None,
166
+ event_times=times,
167
+ event_values=values,
168
+ encode_event_fn=note_sequences.note_event_data_to_events,
169
+ codec=codec,
170
+ frame_times=frame_times,
171
+ encoding_state_to_events_fn=(
172
+ note_sequences.note_encoding_state_to_events
173
+ if include_ties else None)))
174
+
175
+ yield {
176
+ 'inputs': frames,
177
+ 'input_times': frame_times,
178
+ 'targets': events,
179
+ 'input_event_start_indices': event_start_indices,
180
+ 'input_event_end_indices': event_end_indices,
181
+ 'state_events': state_events,
182
+ 'input_state_event_indices': state_event_indices,
183
+ 'sequence': ns.SerializeToString()
184
+ }
185
+
186
+ def process_record(input_record):
187
+ if audio_is_samples and 'sample_rate' not in input_record:
188
+ raise ValueError('Must provide sample rate when audio is samples.')
189
+
190
+ args = [
191
+ input_record['sequence'],
192
+ input_record['audio'],
193
+ input_record['sample_rate'] if 'sample_rate' in input_record else 0
194
+ ]
195
+ if id_feature_key is not None:
196
+ args.append(input_record[id_feature_key])
197
+
198
+ ds = tf.data.Dataset.from_generator(
199
+ tokenize,
200
+ output_signature={
201
+ 'inputs':
202
+ tf.TensorSpec(
203
+ shape=(None, spectrogram_config.hop_width),
204
+ dtype=tf.float32),
205
+ 'input_times':
206
+ tf.TensorSpec(shape=(None,), dtype=tf.float32),
207
+ 'targets':
208
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
209
+ 'input_event_start_indices':
210
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
211
+ 'input_event_end_indices':
212
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
213
+ 'state_events':
214
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
215
+ 'input_state_event_indices':
216
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
217
+ 'sequence':
218
+ tf.TensorSpec(shape=(), dtype=tf.string)
219
+ },
220
+ args=args)
221
+
222
+ ds = _include_inputs(ds, input_record)
223
+ return ds
224
+
225
+ tokenized_records = ds.flat_map(process_record)
226
+ return tokenized_records
227
+
228
+
229
+ def tokenize_guitarset_example(
230
+ ds: tf.data.Dataset, spectrogram_config: spectrograms.SpectrogramConfig,
231
+ codec: event_codec.Codec, is_training_data: bool,
232
+ onsets_only: bool, include_ties: bool
233
+ ) -> tf.data.Dataset:
234
+ """Tokenize a GuitarSet transcription example."""
235
+ def _preprocess_example(ex, name):
236
+ assert 'inst_names' not in ex, 'Key `inst_names` is already populated.'
237
+ ex['inst_names'] = [name]
238
+ ex['instrument_sequences'] = [ex.pop('sequence')]
239
+ return ex
240
+
241
+ ds = ds.map(
242
+ lambda x: _preprocess_example(x, 'Clean Guitar'),
243
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
244
+ ds = tokenize_example_with_program_lookup(
245
+ ds,
246
+ spectrogram_config=spectrogram_config,
247
+ codec=codec,
248
+ is_training_data=is_training_data,
249
+ inst_name_to_program_fn=guitarset_instrument_to_program,
250
+ onsets_only=onsets_only,
251
+ include_ties=include_ties,
252
+ id_feature_key='id')
253
+ return ds
254
+
255
+
256
+ def guitarset_instrument_to_program(instrument: str) -> int:
257
+ """GuitarSet is all guitar, return the first MIDI guitar program."""
258
+ if instrument == 'Clean Guitar':
259
+ return 24
260
+ else:
261
+ raise ValueError('Unknown GuitarSet instrument: %s' % instrument)
262
+
263
+
264
+ def tokenize_example_with_program_lookup(
265
+ ds: tf.data.Dataset,
266
+ spectrogram_config: spectrograms.SpectrogramConfig,
267
+ codec: event_codec.Codec,
268
+ is_training_data: bool,
269
+ onsets_only: bool,
270
+ include_ties: bool,
271
+ inst_name_to_program_fn: Callable[[str], int],
272
+ id_feature_key: Optional[str] = None
273
+ ) -> tf.data.Dataset:
274
+ """Tokenize an example, optionally looking up and assigning program numbers.
275
+
276
+ This can be used by any dataset where a mapping function can be used to
277
+ map from the inst_names feature to a set of program numbers.
278
+
279
+ Args:
280
+ ds: Input dataset.
281
+ spectrogram_config: Spectrogram configuration.
282
+ codec: Event vocabulary codec.
283
+ is_training_data: Unused.
284
+ onsets_only: If True, include only onset events (not offset & velocity).
285
+ include_ties: If True, include tie events.
286
+ inst_name_to_program_fn: A function used to map the instrument names
287
+ in the `inst_names` feature of each example to a MIDI program number.
288
+ id_feature_key: If not None, replace sequence ID with specified key field
289
+ from the dataset.
290
+
291
+ Returns:
292
+ Dataset with the outputs described above.
293
+ """
294
+ del is_training_data
295
+
296
+ def tokenize(sequences, inst_names, audio, example_id=None):
297
+ # Add all the notes from the tracks to a single NoteSequence.
298
+ ns = note_seq.NoteSequence(ticks_per_quarter=220)
299
+ tracks = [note_seq.NoteSequence.FromString(seq) for seq in sequences]
300
+ assert len(tracks) == len(inst_names)
301
+ for track, inst_name in zip(tracks, inst_names):
302
+ program = inst_name_to_program_fn(
303
+ inst_name.decode())
304
+
305
+ # Note that there are no pitch bends in URMP data; the below block will
306
+ # raise PitchBendError if one is encountered.
307
+ add_track_to_notesequence(ns, track, program=program, is_drum=False,
308
+ ignore_pitch_bends=False)
309
+
310
+ note_sequences.assign_instruments(ns)
311
+ note_sequences.validate_note_sequence(ns)
312
+
313
+ if example_id is not None:
314
+ ns.id = example_id
315
+
316
+ samples = note_seq.audio_io.wav_data_to_samples_librosa(
317
+ audio, sample_rate=spectrogram_config.sample_rate)
318
+
319
+ logging.info('Got samples for %s::%s with length %d',
320
+ ns.id, ns.filename, len(samples))
321
+
322
+ frames, frame_times = _audio_to_frames(samples, spectrogram_config)
323
+
324
+ if onsets_only:
325
+ times, values = note_sequences.note_sequence_to_onsets(ns)
326
+ else:
327
+ times, values = (
328
+ note_sequences.note_sequence_to_onsets_and_offsets_and_programs(ns))
329
+
330
+ # The original NoteSequence can have a lot of control changes we don't need;
331
+ # delete them.
332
+ del ns.control_changes[:]
333
+
334
+ (events, event_start_indices, event_end_indices,
335
+ state_events, state_event_indices) = (
336
+ run_length_encoding.encode_and_index_events(
337
+ state=note_sequences.NoteEncodingState() if include_ties else None,
338
+ event_times=times,
339
+ event_values=values,
340
+ encode_event_fn=note_sequences.note_event_data_to_events,
341
+ codec=codec,
342
+ frame_times=frame_times,
343
+ encoding_state_to_events_fn=(
344
+ note_sequences.note_encoding_state_to_events
345
+ if include_ties else None)))
346
+
347
+ yield {
348
+ 'inputs': frames,
349
+ 'input_times': frame_times,
350
+ 'targets': events,
351
+ 'input_event_start_indices': event_start_indices,
352
+ 'input_event_end_indices': event_end_indices,
353
+ 'state_events': state_events,
354
+ 'input_state_event_indices': state_event_indices,
355
+ 'sequence': ns.SerializeToString()
356
+ }
357
+
358
+ def process_record(input_record):
359
+ args = [
360
+ input_record['instrument_sequences'],
361
+ input_record['inst_names'],
362
+ input_record['audio'],
363
+ ]
364
+ if id_feature_key is not None:
365
+ args.append(input_record[id_feature_key])
366
+
367
+ ds = tf.data.Dataset.from_generator(
368
+ tokenize,
369
+ output_signature={
370
+ 'inputs':
371
+ tf.TensorSpec(
372
+ shape=(None, spectrogram_config.hop_width),
373
+ dtype=tf.float32),
374
+ 'input_times':
375
+ tf.TensorSpec(shape=(None,), dtype=tf.float32),
376
+ 'targets':
377
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
378
+ 'input_event_start_indices':
379
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
380
+ 'input_event_end_indices':
381
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
382
+ 'state_events':
383
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
384
+ 'input_state_event_indices':
385
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
386
+ 'sequence':
387
+ tf.TensorSpec(shape=(), dtype=tf.string)
388
+ },
389
+ args=args)
390
+
391
+ ds = _include_inputs(ds, input_record)
392
+ return ds
393
+
394
+ tokenized_records = ds.flat_map(process_record)
395
+ return tokenized_records
396
+
397
+
398
+ _URMP_INSTRUMENT_PROGRAMS = immutabledict({
399
+ 'vn': 40, # violin
400
+ 'va': 41, # viola
401
+ 'vc': 42, # cello
402
+ 'db': 43, # double bass
403
+ 'tpt': 56, # trumpet
404
+ 'tbn': 57, # trombone
405
+ 'tba': 58, # tuba
406
+ 'hn': 60, # French horn
407
+ 'sax': 64, # saxophone
408
+ 'ob': 68, # oboe
409
+ 'bn': 70, # bassoon
410
+ 'cl': 71, # clarinet
411
+ 'fl': 73 # flute
412
+ })
413
+
414
+
415
+ def urmp_instrument_to_program(urmp_instrument: str) -> int:
416
+ """Fetch the program number associated with a given URMP instrument code."""
417
+ if urmp_instrument not in _URMP_INSTRUMENT_PROGRAMS:
418
+ raise ValueError('unknown URMP instrument: %s' % urmp_instrument)
419
+ return _URMP_INSTRUMENT_PROGRAMS[urmp_instrument]
420
+
421
+
422
+ _SLAKH_CLASS_PROGRAMS = immutabledict({
423
+ 'Acoustic Piano': 0,
424
+ 'Electric Piano': 4,
425
+ 'Chromatic Percussion': 8,
426
+ 'Organ': 16,
427
+ 'Acoustic Guitar': 24,
428
+ 'Clean Electric Guitar': 26,
429
+ 'Distorted Electric Guitar': 29,
430
+ 'Acoustic Bass': 32,
431
+ 'Electric Bass': 33,
432
+ 'Violin': 40,
433
+ 'Viola': 41,
434
+ 'Cello': 42,
435
+ 'Contrabass': 43,
436
+ 'Orchestral Harp': 46,
437
+ 'Timpani': 47,
438
+ 'String Ensemble': 48,
439
+ 'Synth Strings': 50,
440
+ 'Choir and Voice': 52,
441
+ 'Orchestral Hit': 55,
442
+ 'Trumpet': 56,
443
+ 'Trombone': 57,
444
+ 'Tuba': 58,
445
+ 'French Horn': 60,
446
+ 'Brass Section': 61,
447
+ 'Soprano/Alto Sax': 64,
448
+ 'Tenor Sax': 66,
449
+ 'Baritone Sax': 67,
450
+ 'Oboe': 68,
451
+ 'English Horn': 69,
452
+ 'Bassoon': 70,
453
+ 'Clarinet': 71,
454
+ 'Pipe': 73,
455
+ 'Synth Lead': 80,
456
+ 'Synth Pad': 88
457
+ })
458
+
459
+
460
+ def slakh_class_to_program_and_is_drum(slakh_class: str) -> Tuple[int, bool]:
461
+ """Map Slakh class string to program number and boolean indicating drums."""
462
+ if slakh_class == 'Drums':
463
+ return 0, True
464
+ elif slakh_class not in _SLAKH_CLASS_PROGRAMS:
465
+ raise ValueError('unknown Slakh class: %s' % slakh_class)
466
+ else:
467
+ return _SLAKH_CLASS_PROGRAMS[slakh_class], False
468
+
469
+
470
+ class PitchBendError(Exception):
471
+ pass
472
+
473
+
474
+ def add_track_to_notesequence(ns: note_seq.NoteSequence,
475
+ track: note_seq.NoteSequence,
476
+ program: int, is_drum: bool,
477
+ ignore_pitch_bends: bool):
478
+ """Add a track to a NoteSequence."""
479
+ if track.pitch_bends and not ignore_pitch_bends:
480
+ raise PitchBendError
481
+ track_sus = note_seq.apply_sustain_control_changes(track)
482
+ for note in track_sus.notes:
483
+ note.program = program
484
+ note.is_drum = is_drum
485
+ ns.notes.extend([note])
486
+ ns.total_time = max(ns.total_time, note.end_time)
487
+
488
+
489
+ def tokenize_slakh_example(
490
+ ds: tf.data.Dataset,
491
+ spectrogram_config: spectrograms.SpectrogramConfig,
492
+ codec: event_codec.Codec,
493
+ is_training_data: bool,
494
+ onsets_only: bool,
495
+ include_ties: bool,
496
+ track_specs: Optional[Sequence[note_sequences.TrackSpec]],
497
+ ignore_pitch_bends: bool
498
+ ) -> tf.data.Dataset:
499
+ """Tokenize a Slakh multitrack note transcription example."""
500
+ def tokenize(sequences, samples, sample_rate, inst_names, example_id):
501
+ if sample_rate != spectrogram_config.sample_rate:
502
+ samples = librosa.resample(
503
+ samples, sample_rate, spectrogram_config.sample_rate)
504
+
505
+ frames, frame_times = _audio_to_frames(samples, spectrogram_config)
506
+
507
+ # Add all the notes from the tracks to a single NoteSequence.
508
+ ns = note_seq.NoteSequence(ticks_per_quarter=220)
509
+ tracks = [note_seq.NoteSequence.FromString(seq) for seq in sequences]
510
+ assert len(tracks) == len(inst_names)
511
+ if track_specs:
512
+ # Specific tracks expected.
513
+ assert len(tracks) == len(track_specs)
514
+ for track, spec, inst_name in zip(tracks, track_specs, inst_names):
515
+ # Make sure the instrument name matches what we expect.
516
+ assert inst_name.decode() == spec.name
517
+ try:
518
+ add_track_to_notesequence(ns, track,
519
+ program=spec.program, is_drum=spec.is_drum,
520
+ ignore_pitch_bends=ignore_pitch_bends)
521
+ except PitchBendError:
522
+ # TODO(iansimon): is there a way to count these?
523
+ return
524
+ else:
525
+ for track, inst_name in zip(tracks, inst_names):
526
+ # Instrument name should be Slakh class.
527
+ program, is_drum = slakh_class_to_program_and_is_drum(
528
+ inst_name.decode())
529
+ try:
530
+ add_track_to_notesequence(ns, track, program=program, is_drum=is_drum,
531
+ ignore_pitch_bends=ignore_pitch_bends)
532
+ except PitchBendError:
533
+ # TODO(iansimon): is there a way to count these?
534
+ return
535
+
536
+ note_sequences.assign_instruments(ns)
537
+ note_sequences.validate_note_sequence(ns)
538
+ if is_training_data:
539
+ # Trim overlapping notes in training (as our event vocabulary cannot
540
+ # represent them), but preserve original NoteSequence for eval.
541
+ ns = note_sequences.trim_overlapping_notes(ns)
542
+
543
+ ns.id = example_id
544
+
545
+ if onsets_only:
546
+ times, values = note_sequences.note_sequence_to_onsets(ns)
547
+ else:
548
+ times, values = (
549
+ note_sequences.note_sequence_to_onsets_and_offsets_and_programs(ns))
550
+
551
+ (events, event_start_indices, event_end_indices,
552
+ state_events, state_event_indices) = (
553
+ run_length_encoding.encode_and_index_events(
554
+ state=note_sequences.NoteEncodingState() if include_ties else None,
555
+ event_times=times,
556
+ event_values=values,
557
+ encode_event_fn=note_sequences.note_event_data_to_events,
558
+ codec=codec,
559
+ frame_times=frame_times,
560
+ encoding_state_to_events_fn=(
561
+ note_sequences.note_encoding_state_to_events
562
+ if include_ties else None)))
563
+
564
+ yield {
565
+ 'inputs': frames,
566
+ 'input_times': frame_times,
567
+ 'targets': events,
568
+ 'input_event_start_indices': event_start_indices,
569
+ 'input_event_end_indices': event_end_indices,
570
+ 'state_events': state_events,
571
+ 'input_state_event_indices': state_event_indices,
572
+ 'sequence': ns.SerializeToString()
573
+ }
574
+
575
+ def process_record(input_record):
576
+ ds = tf.data.Dataset.from_generator(
577
+ tokenize,
578
+ output_signature={
579
+ 'inputs':
580
+ tf.TensorSpec(
581
+ shape=(None, spectrogram_config.hop_width),
582
+ dtype=tf.float32),
583
+ 'input_times':
584
+ tf.TensorSpec(shape=(None,), dtype=tf.float32),
585
+ 'targets':
586
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
587
+ 'input_event_start_indices':
588
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
589
+ 'input_event_end_indices':
590
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
591
+ 'state_events':
592
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
593
+ 'input_state_event_indices':
594
+ tf.TensorSpec(shape=(None,), dtype=tf.int32),
595
+ 'sequence':
596
+ tf.TensorSpec(shape=(), dtype=tf.string)
597
+ },
598
+ args=[
599
+ input_record['note_sequences'], input_record['mix'],
600
+ input_record['audio_sample_rate'], input_record['inst_names'],
601
+ input_record['track_id']
602
+ ])
603
+
604
+ ds = _include_inputs(ds, input_record, fields_to_omit=['mix', 'stems'])
605
+ return ds
606
+
607
+ tokenized_records = ds.flat_map(process_record)
608
+ return tokenized_records
609
+
610
+
611
+
612
+
613
+ @seqio.map_over_dataset
614
+ def compute_spectrograms(ex, spectrogram_config):
615
+ samples = spectrograms.flatten_frames(ex['inputs'])
616
+ ex['inputs'] = spectrograms.compute_spectrogram(samples, spectrogram_config)
617
+ ex['raw_inputs'] = samples
618
+ return ex
619
+
620
+
621
+ def handle_too_long(dataset: tf.data.Dataset,
622
+ output_features: seqio.preprocessors.OutputFeaturesType,
623
+ sequence_length: seqio.preprocessors.SequenceLengthType,
624
+ skip: bool = False) -> tf.data.Dataset:
625
+ """Handle sequences that are too long, by either failing or skipping them."""
626
+ def max_length_for_key(key):
627
+ max_length = sequence_length[key]
628
+ if output_features[key].add_eos:
629
+ max_length -= 1
630
+ return max_length
631
+
632
+ if skip:
633
+ # Drop examples where one of the features is longer than its maximum
634
+ # sequence length.
635
+ def is_not_too_long(ex):
636
+ return not tf.reduce_any(
637
+ [k in output_features and len(v) > max_length_for_key(k)
638
+ for k, v in ex.items()])
639
+ dataset = dataset.filter(is_not_too_long)
640
+
641
+ def assert_not_too_long(key: str, value: tf.Tensor) -> tf.Tensor:
642
+ if key in output_features:
643
+ max_length = max_length_for_key(key)
644
+ tf.debugging.assert_less_equal(
645
+ tf.shape(value)[0], max_length,
646
+ f'Value for "{key}" field exceeds maximum length')
647
+ return value
648
+
649
+ # Assert that no examples have features longer than their maximum sequence
650
+ # length.
651
+ return dataset.map(
652
+ lambda ex: {k: assert_not_too_long(k, v) for k, v in ex.items()},
653
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
654
+
655
+
656
+ @gin.configurable
657
+ def map_midi_programs(
658
+ ds: tf.data.Dataset,
659
+ codec: event_codec.Codec,
660
+ granularity_type: str = 'full',
661
+ feature_key: str = 'targets'
662
+ ) -> tf.data.Dataset:
663
+ """Apply MIDI program map to token sequences."""
664
+ granularity = vocabularies.PROGRAM_GRANULARITIES[granularity_type]
665
+ def _map_program_tokens(ex):
666
+ ex[feature_key] = granularity.tokens_map_fn(ex[feature_key], codec)
667
+ return ex
668
+ return ds.map(_map_program_tokens,
669
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
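[Editor's note] The preprocessors above (the tokenize generators, compute_spectrograms, handle_too_long, map_midi_programs) all operate on the same per-example feature dictionary. As a reading aid only, here is a minimal sketch, not part of the diff, of that dictionary with illustrative shapes matching the output_signature declared in process_record; the concrete sizes (3 seconds of 16 kHz audio, hop width 128) are assumptions for the example.

import numpy as np

num_frames = 375   # assumed: 3 s of 16 kHz audio framed with hop_width=128
hop_width = 128
num_events = 42    # arbitrary number of encoded event tokens

example = {
    'inputs': np.zeros((num_frames, hop_width), np.float32),      # raw audio frames
    'input_times': np.zeros(num_frames, np.float32),              # time of each frame
    'targets': np.zeros(num_events, np.int32),                    # encoded event tokens
    'input_event_start_indices': np.zeros(num_frames, np.int32),  # per-frame start index
    'input_event_end_indices': np.zeros(num_frames, np.int32),    # per-frame end index
    'state_events': np.zeros(0, np.int32),                        # empty unless include_ties
    'input_state_event_indices': np.zeros(num_frames, np.int32),
    'sequence': b'',                                               # serialized NoteSequence
}
# compute_spectrograms later flattens 'inputs' back to samples, replaces it with a
# log-mel spectrogram, and keeps the raw samples under 'raw_inputs'.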
mt3/pytest.ini ADDED
@@ -0,0 +1,3 @@
1
+ [pytest]
2
+ python_files = *_test.py
3
+ log_level = INFO
mt3/run_length_encoding.py ADDED
@@ -0,0 +1,423 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tools for run length encoding."""
16
+
17
+ import dataclasses
18
+ from typing import Any, Callable, Mapping, MutableMapping, Tuple, Optional, Sequence, TypeVar
19
+
20
+ from absl import logging
21
+ from mt3 import event_codec
22
+
23
+ import numpy as np
24
+ import seqio
25
+ import tensorflow as tf
26
+
27
+ Event = event_codec.Event
28
+
29
+ # These should be type variables, but unfortunately those are incompatible with
30
+ # dataclasses.
31
+ EventData = Any
32
+ EncodingState = Any
33
+ DecodingState = Any
34
+ DecodeResult = Any
35
+
36
+ T = TypeVar('T', bound=EventData)
37
+ ES = TypeVar('ES', bound=EncodingState)
38
+ DS = TypeVar('DS', bound=DecodingState)
39
+
40
+
41
+ @dataclasses.dataclass
42
+ class EventEncodingSpec:
43
+ """Spec for encoding events."""
44
+ # initialize encoding state
45
+ init_encoding_state_fn: Callable[[], EncodingState]
46
+ # convert EventData into zero or more events, updating encoding state
47
+ encode_event_fn: Callable[[EncodingState, EventData, event_codec.Codec],
48
+ Sequence[event_codec.Event]]
49
+ # convert encoding state (at beginning of segment) into events
50
+ encoding_state_to_events_fn: Optional[Callable[[EncodingState],
51
+ Sequence[event_codec.Event]]]
52
+ # create empty decoding state
53
+ init_decoding_state_fn: Callable[[], DecodingState]
54
+ # update decoding state when entering new segment
55
+ begin_decoding_segment_fn: Callable[[DecodingState], None]
56
+ # consume time and Event and update decoding state
57
+ decode_event_fn: Callable[
58
+ [DecodingState, float, event_codec.Event, event_codec.Codec], None]
59
+ # flush decoding state into result
60
+ flush_decoding_state_fn: Callable[[DecodingState], DecodeResult]
61
+
62
+
63
+ def encode_and_index_events(
64
+ state: ES,
65
+ event_times: Sequence[float],
66
+ event_values: Sequence[T],
67
+ encode_event_fn: Callable[[ES, T, event_codec.Codec],
68
+ Sequence[event_codec.Event]],
69
+ codec: event_codec.Codec,
70
+ frame_times: Sequence[float],
71
+ encoding_state_to_events_fn: Optional[
72
+ Callable[[ES], Sequence[event_codec.Event]]] = None,
73
+ ) -> Tuple[Sequence[int], Sequence[int], Sequence[int],
74
+ Sequence[int], Sequence[int]]:
75
+ """Encode a sequence of timed events and index to audio frame times.
76
+
77
+ Encodes time shifts as repeated single step shifts for later run length
78
+ encoding.
79
+
80
+ Optionally, also encodes a sequence of "state events", keeping track of the
81
+ current encoding state at each audio frame. This can be used e.g. to prepend
82
+ events representing the current state to a targets segment.
83
+
84
+ Args:
85
+ state: Initial event encoding state.
86
+ event_times: Sequence of event times.
87
+ event_values: Sequence of event values.
88
+ encode_event_fn: Function that transforms event value into a sequence of one
89
+ or more event_codec.Event objects.
90
+ codec: An event_codec.Codec object that maps Event objects to indices.
91
+ frame_times: Time for every audio frame.
92
+ encoding_state_to_events_fn: Function that transforms encoding state into a
93
+ sequence of one or more event_codec.Event objects.
94
+
95
+ Returns:
96
+ events: Encoded events and shifts.
97
+ event_start_indices: Corresponding start event index for every audio frame.
98
+ Note: one event can correspond to multiple audio indices due to sampling
99
+ rate differences. This makes splitting sequences tricky because the same
100
+ event can appear at the end of one sequence and the beginning of
101
+ another.
102
+ event_end_indices: Corresponding end event index for every audio frame. Used
103
+ to ensure when slicing that one chunk ends where the next begins. Should
104
+ always be true that event_end_indices[i] = event_start_indices[i + 1].
105
+ state_events: Encoded "state" events representing the encoding state before
106
+ each event.
107
+ state_event_indices: Corresponding state event index for every audio frame.
108
+ """
109
+ indices = np.argsort(event_times, kind='stable')
110
+ event_steps = [round(event_times[i] * codec.steps_per_second)
111
+ for i in indices]
112
+ event_values = [event_values[i] for i in indices]
113
+
114
+ events = []
115
+ state_events = []
116
+ event_start_indices = []
117
+ state_event_indices = []
118
+
119
+ cur_step = 0
120
+ cur_event_idx = 0
121
+ cur_state_event_idx = 0
122
+
123
+ def fill_event_start_indices_to_cur_step():
124
+ while(len(event_start_indices) < len(frame_times) and
125
+ frame_times[len(event_start_indices)] <
126
+ cur_step / codec.steps_per_second):
127
+ event_start_indices.append(cur_event_idx)
128
+ state_event_indices.append(cur_state_event_idx)
129
+
130
+ for event_step, event_value in zip(event_steps, event_values):
131
+ while event_step > cur_step:
132
+ events.append(codec.encode_event(Event(type='shift', value=1)))
133
+ cur_step += 1
134
+ fill_event_start_indices_to_cur_step()
135
+ cur_event_idx = len(events)
136
+ cur_state_event_idx = len(state_events)
137
+ if encoding_state_to_events_fn:
138
+ # Dump state to state events *before* processing the next event, because
139
+ # we want to capture the state prior to the occurrence of the event.
140
+ for e in encoding_state_to_events_fn(state):
141
+ state_events.append(codec.encode_event(e))
142
+ for e in encode_event_fn(state, event_value, codec):
143
+ events.append(codec.encode_event(e))
144
+
145
+ # After the last event, continue filling out the event_start_indices array.
146
+ # The inequality is not strict because if our current step lines up exactly
147
+ # with (the start of) an audio frame, we need to add an additional shift event
148
+ # to "cover" that frame.
149
+ while cur_step / codec.steps_per_second <= frame_times[-1]:
150
+ events.append(codec.encode_event(Event(type='shift', value=1)))
151
+ cur_step += 1
152
+ fill_event_start_indices_to_cur_step()
153
+ cur_event_idx = len(events)
154
+
155
+ # Now fill in event_end_indices. We need this extra array to make sure that
156
+ # when we slice events, each slice ends exactly where the subsequent slice
157
+ # begins.
158
+ event_end_indices = event_start_indices[1:] + [len(events)]
159
+
160
+ events = np.array(events)
161
+ state_events = np.array(state_events)
162
+ event_start_indices = np.array(event_start_indices)
163
+ event_end_indices = np.array(event_end_indices)
164
+ state_event_indices = np.array(state_event_indices)
165
+
166
+ return (events, event_start_indices, event_end_indices,
167
+ state_events, state_event_indices)
168
+
169
+
170
+ @seqio.map_over_dataset
171
+ def extract_target_sequence_with_indices(features, state_events_end_token=None):
172
+ """Extract target sequence corresponding to audio token segment."""
173
+ target_start_idx = features['input_event_start_indices'][0]
174
+ target_end_idx = features['input_event_end_indices'][-1]
175
+
176
+ features['targets'] = features['targets'][target_start_idx:target_end_idx]
177
+
178
+ if state_events_end_token is not None:
179
+ # Extract the state events corresponding to the audio start token, and
180
+ # prepend them to the targets array.
181
+ state_event_start_idx = features['input_state_event_indices'][0]
182
+ state_event_end_idx = state_event_start_idx + 1
183
+ while features['state_events'][
184
+ state_event_end_idx - 1] != state_events_end_token:
185
+ state_event_end_idx += 1
186
+ features['targets'] = tf.concat([
187
+ features['state_events'][state_event_start_idx:state_event_end_idx],
188
+ features['targets']
189
+ ], axis=0)
190
+
191
+ return features
192
+
193
+
194
+ def remove_redundant_state_changes_fn(
195
+ codec: event_codec.Codec,
196
+ feature_key: str = 'targets',
197
+ state_change_event_types: Sequence[str] = ()
198
+ ) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]:
199
+ """Return preprocessing function that removes redundant state change events.
200
+
201
+ Args:
202
+ codec: The event_codec.Codec used to interpret the events.
203
+ feature_key: The feature key for which to remove redundant state changes.
204
+ state_change_event_types: A list of event types that represent state
205
+ changes; tokens corresponding to these event types will be interpreted
206
+ as state changes and redundant ones will be removed.
207
+
208
+ Returns:
209
+ A preprocessing function that removes redundant state change events.
210
+ """
211
+ state_change_event_ranges = [codec.event_type_range(event_type)
212
+ for event_type in state_change_event_types]
213
+
214
+ def remove_redundant_state_changes(
215
+ features: MutableMapping[str, Any],
216
+ ) -> Mapping[str, Any]:
217
+ """Remove redundant tokens e.g. duplicate velocity changes from sequence."""
218
+ current_state = tf.zeros(len(state_change_event_ranges), dtype=tf.int32)
219
+ output = tf.constant([], dtype=tf.int32)
220
+
221
+ for event in features[feature_key]:
222
+ # Let autograph know that the shape of 'output' will change during the
223
+ # loop.
224
+ tf.autograph.experimental.set_loop_options(
225
+ shape_invariants=[(output, tf.TensorShape([None]))])
226
+ is_redundant = False
227
+ for i, (min_index, max_index) in enumerate(state_change_event_ranges):
228
+ if (min_index <= event) and (event <= max_index):
229
+ if current_state[i] == event:
230
+ is_redundant = True
231
+ current_state = tf.tensor_scatter_nd_update(
232
+ current_state, indices=[[i]], updates=[event])
233
+ if not is_redundant:
234
+ output = tf.concat([output, [event]], axis=0)
235
+
236
+ features[feature_key] = output
237
+ return features
238
+
239
+ return seqio.map_over_dataset(remove_redundant_state_changes)
240
+
241
+
242
+ def run_length_encode_shifts_fn(
243
+ codec: event_codec.Codec,
244
+ feature_key: str = 'targets'
245
+ ) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]:
246
+ """Return a function that run-length encodes shifts for a given codec.
247
+
248
+ Args:
249
+ codec: The Codec to use for shift events.
250
+ feature_key: The feature key for which to run-length encode shifts.
251
+
252
+ Returns:
253
+ A preprocessing function that run-length encodes single-step shifts.
254
+ """
255
+ def run_length_encode_shifts(
256
+ features: MutableMapping[str, Any]
257
+ ) -> Mapping[str, Any]:
258
+ """Combine leading/interior shifts, trim trailing shifts.
259
+
260
+ Args:
261
+ features: Dict of features to process.
262
+
263
+ Returns:
264
+ A dict of features.
265
+ """
266
+ events = features[feature_key]
267
+
268
+ shift_steps = 0
269
+ total_shift_steps = 0
270
+ output = tf.constant([], dtype=tf.int32)
271
+
272
+ for event in events:
273
+ # Let autograph know that the shape of 'output' will change during the
274
+ # loop.
275
+ tf.autograph.experimental.set_loop_options(
276
+ shape_invariants=[(output, tf.TensorShape([None]))])
277
+ if codec.is_shift_event_index(event):
278
+ shift_steps += 1
279
+ total_shift_steps += 1
280
+
281
+ else:
282
+ # Once we've reached a non-shift event, RLE all previous shift events
283
+ # before outputting the non-shift event.
284
+ if shift_steps > 0:
285
+ shift_steps = total_shift_steps
286
+ while shift_steps > 0:
287
+ output_steps = tf.minimum(codec.max_shift_steps, shift_steps)
288
+ output = tf.concat([output, [output_steps]], axis=0)
289
+ shift_steps -= output_steps
290
+ output = tf.concat([output, [event]], axis=0)
291
+
292
+ features[feature_key] = output
293
+ return features
294
+
295
+ return seqio.map_over_dataset(run_length_encode_shifts)
296
+
297
+
298
+ def merge_run_length_encoded_targets(
299
+ targets: np.ndarray,
300
+ codec: event_codec.Codec
301
+ ) -> Sequence[int]:
302
+ """Merge multiple tracks of target events into a single stream.
303
+
304
+ Args:
305
+ targets: A 2D array (# tracks by # events) of integer event values.
306
+ codec: The event_codec.Codec used to interpret the events.
307
+
308
+ Returns:
309
+ A 1D array of merged events.
310
+ """
311
+ num_tracks = tf.shape(targets)[0]
312
+ targets_length = tf.shape(targets)[1]
313
+
314
+ current_step = 0
315
+ current_offsets = tf.zeros(num_tracks, dtype=tf.int32)
316
+
317
+ output = tf.constant([], dtype=tf.int32)
318
+ done = tf.constant(False)
319
+
320
+ while not done:
321
+ # Let autograph know that the shape of 'output' will change during the loop.
322
+ tf.autograph.experimental.set_loop_options(
323
+ shape_invariants=[(output, tf.TensorShape([None]))])
324
+
325
+ # Determine which targets track has the earliest next step.
326
+ next_step = codec.max_shift_steps + 1
327
+ next_track = -1
328
+ for i in range(num_tracks):
329
+ if (current_offsets[i] == targets_length or
330
+ targets[i][current_offsets[i]] == 0):
331
+ # Already reached the end of this targets track.
332
+ # (Zero is technically a valid shift event but we never actually use it;
333
+ # it is always padding.)
334
+ continue
335
+ if not codec.is_shift_event_index(targets[i][current_offsets[i]]):
336
+ # The only way we would be at a non-shift event is if we have not yet
337
+ # reached the first shift event, which means we're at step zero.
338
+ next_step = 0
339
+ next_track = i
340
+ elif targets[i][current_offsets[i]] < next_step:
341
+ next_step = targets[i][current_offsets[i]]
342
+ next_track = i
343
+
344
+ if next_track == -1:
345
+ # We've already merged all of the target tracks in their entirety.
346
+ done = tf.constant(True)
347
+ break
348
+
349
+ if next_step == current_step and next_step > 0:
350
+ # We don't need to include the shift event itself as it's the same step as
351
+ # the previous shift.
352
+ start_offset = current_offsets[next_track] + 1
353
+ else:
354
+ start_offset = current_offsets[next_track]
355
+
356
+ # Merge in events up to but not including the next shift.
357
+ end_offset = start_offset + 1
358
+ while end_offset < targets_length and not codec.is_shift_event_index(
359
+ targets[next_track][end_offset]):
360
+ end_offset += 1
361
+ output = tf.concat(
362
+ [output, targets[next_track][start_offset:end_offset]], axis=0)
363
+
364
+ current_step = next_step
365
+ current_offsets = tf.tensor_scatter_nd_update(
366
+ current_offsets, indices=[[next_track]], updates=[end_offset])
367
+
368
+ return output
369
+
370
+
371
+ def decode_events(
372
+ state: DS,
373
+ tokens: np.ndarray,
374
+ start_time: int,
375
+ max_time: Optional[int],
376
+ codec: event_codec.Codec,
377
+ decode_event_fn: Callable[[DS, float, event_codec.Event, event_codec.Codec],
378
+ None],
379
+ ) -> Tuple[int, int]:
380
+ """Decode a series of tokens, maintaining a decoding state object.
381
+
382
+ Args:
383
+ state: Decoding state object; will be modified in-place.
384
+ tokens: event tokens to convert.
385
+ start_time: offset start time if decoding in the middle of a sequence.
386
+ max_time: Events at or beyond this time will be dropped.
387
+ codec: An event_codec.Codec object that maps indices to Event objects.
388
+ decode_event_fn: Function that consumes an Event (and the current time) and
389
+ updates the decoding state.
390
+
391
+ Returns:
392
+ invalid_events: number of events that could not be decoded.
393
+ dropped_events: number of events dropped due to max_time restriction.
394
+ """
395
+ invalid_events = 0
396
+ dropped_events = 0
397
+ cur_steps = 0
398
+ cur_time = start_time
399
+ token_idx = 0
400
+ for token_idx, token in enumerate(tokens):
401
+ try:
402
+ event = codec.decode_event_index(token)
403
+ except ValueError:
404
+ invalid_events += 1
405
+ continue
406
+ if event.type == 'shift':
407
+ cur_steps += event.value
408
+ cur_time = start_time + cur_steps / codec.steps_per_second
409
+ if max_time and cur_time > max_time:
410
+ dropped_events = len(tokens) - token_idx
411
+ break
412
+ else:
413
+ cur_steps = 0
414
+ try:
415
+ decode_event_fn(state, cur_time, event, codec)
416
+ except ValueError:
417
+ invalid_events += 1
418
+ logging.info(
419
+ 'Got invalid event when decoding event %s at time %f. '
420
+ 'Invalid event counter now at %d.',
421
+ event, cur_time, invalid_events, exc_info=True)
422
+ continue
423
+ return invalid_events, dropped_events
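[Editor's note] Before the unit tests in the next file, it may help to see the shift run-length encoding on a concrete token sequence. The following is a minimal standalone sketch, not repo code: it mirrors only the codec used in run_length_encoding_test.py (single-step shift token 1, max_shift_steps 100) and reproduces the key behavior of run_length_encode_shifts_fn — runs of single-step shifts collapse into the *total* step count since the start of the sequence (split into chunks of at most max_shift_steps), and trailing shifts are dropped.

def rle_shifts(tokens, max_shift_steps=100, shift_token=1):
    # Simplification (assumption): only token value 1 counts as a single-step
    # shift, matching the inputs exercised by the tests below.
    output = []
    total_steps = 0   # absolute step count since the start of the sequence
    pending = False   # whether any shifts occurred since the last non-shift event
    for t in tokens:
        if t == shift_token:
            total_steps += 1
            pending = True
        else:
            if pending:
                steps = total_steps
                while steps > 0:
                    emit = min(max_shift_steps, steps)
                    output.append(emit)
                    steps -= emit
                pending = False
            output.append(t)
    return output  # trailing shifts are never flushed, i.e. dropped

print(rle_shifts([1, 1, 1, 161, 1, 1, 1, 162, 1, 1, 1]))  # [3, 161, 6, 162]
print(rle_shifts([1] * 202 + [161, 1, 1, 1]))             # [100, 100, 2, 161]
print(rle_shifts([1, 1, 1, 161, 162, 1, 1, 1]))           # [3, 161, 162]

Note the second emitted shift in the first example is 6, not 3: the encoded value is the absolute step, which is why the real implementation tracks total_shift_steps separately.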
mt3/run_length_encoding_test.py ADDED
@@ -0,0 +1,107 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for run_length_encoding."""
16
+
17
+ from mt3 import event_codec
18
+ from mt3 import run_length_encoding
19
+
20
+ import note_seq
21
+ import numpy as np
22
+ import seqio
23
+ import tensorflow as tf
24
+
25
+ assert_dataset = seqio.test_utils.assert_dataset
26
+ codec = event_codec.Codec(
27
+ max_shift_steps=100,
28
+ steps_per_second=100,
29
+ event_ranges=[
30
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
31
+ note_seq.MAX_MIDI_PITCH),
32
+ event_codec.EventRange('velocity', 0, 127),
33
+ event_codec.EventRange('drum', note_seq.MIN_MIDI_PITCH,
34
+ note_seq.MAX_MIDI_PITCH),
35
+ event_codec.EventRange('program', note_seq.MIN_MIDI_PROGRAM,
36
+ note_seq.MAX_MIDI_PROGRAM),
37
+ event_codec.EventRange('tie', 0, 0)
38
+ ])
39
+ run_length_encode_shifts = run_length_encoding.run_length_encode_shifts_fn(
40
+ codec=codec)
41
+
42
+
43
+ class RunLengthEncodingTest(tf.test.TestCase):
44
+
45
+ def test_remove_redundant_state_changes(self):
46
+ og_dataset = tf.data.Dataset.from_tensors({
47
+ 'targets': [3, 525, 356, 161, 2, 525, 356, 161, 355, 394]
48
+ })
49
+
50
+ assert_dataset(
51
+ run_length_encoding.remove_redundant_state_changes_fn(
52
+ codec=codec,
53
+ state_change_event_types=['velocity', 'program'])(og_dataset),
54
+ {
55
+ 'targets': [3, 525, 356, 161, 2, 161, 355, 394],
56
+ })
57
+
58
+ def test_run_length_encode_shifts(self):
59
+ og_dataset = tf.data.Dataset.from_tensors({
60
+ 'targets': [1, 1, 1, 161, 1, 1, 1, 162, 1, 1, 1]
61
+ })
62
+
63
+ assert_dataset(
64
+ run_length_encode_shifts(og_dataset),
65
+ {
66
+ 'targets': [3, 161, 6, 162],
67
+ })
68
+
69
+ def test_run_length_encode_shifts_beyond_max_length(self):
70
+ og_dataset = tf.data.Dataset.from_tensors({
71
+ 'targets': [1] * 202 + [161, 1, 1, 1]
72
+ })
73
+
74
+ assert_dataset(
75
+ run_length_encode_shifts(og_dataset),
76
+ {
77
+ 'targets': [100, 100, 2, 161],
78
+ })
79
+
80
+ def test_run_length_encode_shifts_simultaneous(self):
81
+ og_dataset = tf.data.Dataset.from_tensors({
82
+ 'targets': [1, 1, 1, 161, 162, 1, 1, 1]
83
+ })
84
+
85
+ assert_dataset(
86
+ run_length_encode_shifts(og_dataset),
87
+ {
88
+ 'targets': [3, 161, 162],
89
+ })
90
+
91
+ def test_merge_run_length_encoded_targets(self):
92
+ # pylint: disable=bad-whitespace
93
+ targets = np.array([
94
+ [ 3, 161, 162, 5, 163],
95
+ [160, 164, 3, 165, 0]
96
+ ])
97
+ # pylint: enable=bad-whitespace
98
+ merged_targets = run_length_encoding.merge_run_length_encoded_targets(
99
+ targets=targets, codec=codec)
100
+ expected_merged_targets = [
101
+ 160, 164, 3, 161, 162, 165, 5, 163
102
+ ]
103
+ np.testing.assert_array_equal(expected_merged_targets, merged_targets)
104
+
105
+
106
+ if __name__ == '__main__':
107
+ tf.test.main()
mt3/scripts/dump_task.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Simple debugging utility for printing out task contents."""
16
+
17
+ import re
18
+
19
+ from absl import app
20
+ from absl import flags
21
+
22
+ import mt3.tasks # pylint: disable=unused-import
23
+
24
+ import seqio
25
+ import tensorflow as tf
26
+
27
+
28
+ FLAGS = flags.FLAGS
29
+
30
+ flags.DEFINE_string("task", None, "A registered Task.")
31
+ flags.DEFINE_string("task_cache_dir", None, "Directory to use for task cache.")
32
+ flags.DEFINE_integer("max_examples", 10,
33
+ "Maximum number of examples (-1 for no limit).")
34
+ flags.DEFINE_string("format_string", "targets = {targets}",
35
+ "Format for printing examples.")
36
+ flags.DEFINE_string("split", "train",
37
+ "Which split of the dataset, e.g. train or validation.")
38
+ flags.DEFINE_integer("sequence_length_inputs", 256,
39
+ "Sequence length for inputs.")
40
+ flags.DEFINE_integer("sequence_length_targets", 1024,
41
+ "Sequence length for targets.")
42
+
43
+
44
+ def main(_):
45
+ if FLAGS.task_cache_dir:
46
+ seqio.add_global_cache_dirs([FLAGS.task_cache_dir])
47
+
48
+ task = seqio.get_mixture_or_task(FLAGS.task)
49
+
50
+ ds = task.get_dataset(
51
+ sequence_length={
52
+ "inputs": FLAGS.sequence_length_inputs,
53
+ "targets": FLAGS.sequence_length_targets,
54
+ },
55
+ split=FLAGS.split,
56
+ use_cached=bool(FLAGS.task_cache_dir),
57
+ shuffle=False)
58
+
59
+ keys = re.findall(r"{([\w+]+)}", FLAGS.format_string)
60
+ def _example_to_string(ex):
61
+ key_to_string = {}
62
+ for k in keys:
63
+ if k in ex:
64
+ v = ex[k].numpy().tolist()
65
+ key_to_string[k] = task.output_features[k].vocabulary.decode(v)
66
+ else:
67
+ key_to_string[k] = ""
68
+ return FLAGS.format_string.format(**key_to_string)
69
+
70
+ for ex in ds.take(FLAGS.max_examples):
71
+ for k, v in ex.items():
72
+ print(f"{k}: {tf.shape(v)}")
73
+ print(_example_to_string(ex))
74
+ print()
75
+
76
+
77
+ if __name__ == "__main__":
78
+ flags.mark_flags_as_required(["task"])
79
+
80
+ app.run(main)
mt3/scripts/extract_monophonic_examples.py ADDED
@@ -0,0 +1,251 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Detect monophonic tracks and extract notes."""
16
+
17
+ import collections
18
+ import os
19
+
20
+ from absl import app
21
+ from absl import flags
22
+ from absl import logging
23
+
24
+ import ddsp
25
+ import librosa
26
+ import note_seq
27
+ import numpy as np
28
+ import scipy
29
+ import tensorflow as tf
30
+
31
+
32
+ _INPUT_DIR = flags.DEFINE_string(
33
+ 'input_dir', None,
34
+ 'Input directory containing WAV files.')
35
+ _OUTPUT_TFRECORD_PATH = flags.DEFINE_string(
36
+ 'output_tfrecord_path', None,
37
+ 'Path to the output TFRecord containing tf.train.Example protos with '
38
+ 'monophonic tracks and inferred NoteSequence protos.')
39
+
40
+
41
+ CREPE_SAMPLE_RATE = 16000
42
+ CREPE_FRAME_RATE = 100
43
+
44
+ MONOPHONIC_CONFIDENCE_THRESHOLD = 0.95 # confidence must be greater than this
45
+ MONOPHONIC_CONFIDENCE_FRAC = 0.2 # for this fraction of frames
46
+
47
+ # split input audio into clips
48
+ CLIP_LENGTH_SECONDS = 5
49
+
50
+
51
+ def is_monophonic_heuristic(f0_confidence):
52
+ """Heuristic to check for monophonicity using f0 confidence."""
53
+ return (np.sum(f0_confidence >= MONOPHONIC_CONFIDENCE_THRESHOLD) /
54
+ len(f0_confidence) >= MONOPHONIC_CONFIDENCE_FRAC)
55
+
56
+
57
+ # HMM parameters for modeling notes and F0 tracks.
58
+ F0_MIDI_SIGMA = 0.2
59
+ OCTAVE_ERROR_PROB = 0.05
60
+ NOTES_PER_SECOND = 2
61
+ NOTE_CHANGE_PROB = NOTES_PER_SECOND / CREPE_FRAME_RATE
62
+ F0_CONFIDENCE_EXP = 7.5
63
+
64
+
65
+ def f0_hmm_matrices(f0_hz, f0_confidence):
66
+ """Observation and transition matrices for hidden Markov model of F0."""
67
+ f0_midi = librosa.hz_to_midi(f0_hz)
68
+ f0_midi_diff = f0_midi[:, np.newaxis] - np.arange(128)[np.newaxis, :]
69
+
70
+ # Compute the probability of each pitch at each frame, taking octave errors
71
+ # into account.
72
+ f0_midi_prob_octave_correct = scipy.stats.norm.pdf(
73
+ f0_midi_diff, scale=F0_MIDI_SIGMA)
74
+ f0_midi_prob_octave_low = scipy.stats.norm.pdf(
75
+ f0_midi_diff + 12, scale=F0_MIDI_SIGMA)
76
+ f0_midi_prob_octave_high = scipy.stats.norm.pdf(
77
+ f0_midi_diff - 12, scale=F0_MIDI_SIGMA)
78
+
79
+ # distribution of pitch values given note
80
+ f0_midi_loglik = ((1 - OCTAVE_ERROR_PROB) * f0_midi_prob_octave_correct +
81
+ 0.5 * OCTAVE_ERROR_PROB * f0_midi_prob_octave_low +
82
+ 0.5 * OCTAVE_ERROR_PROB * f0_midi_prob_octave_high)
83
+ # (uniform) distribution of pitch values given rest
84
+ f0_midi_rest_loglik = -np.log(128)
85
+
86
+ # Here we interpret confidence, after adjusting by exponent, as P(not rest).
87
+ f0_confidence_prob = np.power(f0_confidence, F0_CONFIDENCE_EXP)[:, np.newaxis]
88
+
89
+ obs_loglik = np.concatenate([
90
+ # probability of note (normalized by number of possible notes)
91
+ f0_midi_loglik + np.log(f0_confidence_prob) - np.log(128),
92
+ # probability of rest
93
+ f0_midi_rest_loglik + np.log(1.0 - f0_confidence_prob)
94
+ ], axis=1)
95
+
96
+ # Normalize to adjust P(confidence | note) by uniform P(note).
97
+ # TODO(iansimon): Not sure how correct this is but it doesn't affect the path.
98
+ obs_loglik += np.log(129)
99
+
100
+ trans_prob = ((NOTE_CHANGE_PROB / 128) * np.ones(129) +
101
+ (1 - NOTE_CHANGE_PROB - NOTE_CHANGE_PROB / 128) * np.eye(129))
102
+ trans_loglik = np.log(trans_prob)
103
+
104
+ return obs_loglik, trans_loglik
105
+
106
+
107
+ def hmm_forward(obs_loglik, trans_loglik):
108
+ """Forward algorithm for a hidden Markov model."""
109
+ n, k = obs_loglik.shape
110
+ trans = np.exp(trans_loglik)
111
+
112
+ loglik = 0.0
113
+
114
+ l = obs_loglik[0] - np.log(k)
115
+ c = scipy.special.logsumexp(l)
116
+ loglik += c
117
+
118
+ for i in range(1, n):
119
+ p = np.exp(l - c)
120
+ l = np.log(np.dot(p, trans)) + obs_loglik[i]
121
+ c = scipy.special.logsumexp(l)
122
+ loglik += c
123
+
124
+ return loglik
125
+
126
+
127
+ def hmm_viterbi(obs_loglik, trans_loglik):
128
+ """Viterbi algorithm for a hidden Markov model."""
129
+ n, k = obs_loglik.shape
130
+
131
+ loglik_matrix = np.zeros_like(obs_loglik)
132
+ path_matrix = np.zeros_like(obs_loglik, dtype=np.int32)
133
+
134
+ loglik_matrix[0, :] = obs_loglik[0, :] - np.log(k)
135
+
136
+ for i in range(1, n):
137
+ mat = np.tile(loglik_matrix[i - 1][:, np.newaxis], [1, 129]) + trans_loglik
138
+ path_matrix[i, :] = mat.argmax(axis=0)
139
+ loglik_matrix[i, :] = mat[path_matrix[i, :], range(129)] + obs_loglik[i]
140
+
141
+ path = [np.argmax(loglik_matrix[-1])]
142
+ for i in range(n, 1, -1):
143
+ path.append(path_matrix[i - 1, path[-1]])
144
+
145
+ return [(pitch if pitch < 128 else None) for pitch in path[::-1]]
146
+
147
+
148
+ def pitches_to_notesequence(pitches):
149
+ """Convert sequence of pitches output by Viterbi to NoteSequence proto."""
150
+ ns = note_seq.NoteSequence(ticks_per_quarter=220)
151
+ current_pitch = None
152
+ start_time = None
153
+ for frame, pitch in enumerate(pitches):
154
+ time = frame / CREPE_FRAME_RATE
155
+ if pitch != current_pitch:
156
+ if current_pitch is not None:
157
+ ns.notes.add(
158
+ pitch=current_pitch, velocity=100,
159
+ start_time=start_time, end_time=time)
160
+ current_pitch = pitch
161
+ start_time = time
162
+ if current_pitch is not None:
163
+ ns.notes.add(
164
+ pitch=current_pitch, velocity=100,
165
+ start_time=start_time, end_time=len(pitches) / CREPE_FRAME_RATE)
166
+ if ns.notes:
167
+ ns.total_time = ns.notes[-1].end_time
168
+ return ns
169
+
170
+
171
+ # Per-frame log likelihood threshold below which an F0 track will be discarded.
172
+ # Note that this is dependent on the HMM parameters specified above, so if those
173
+ # change then this threshold should also change.
174
+ PER_FRAME_LOGLIK_THRESHOLD = 0.3
175
+
176
+
177
+ def extract_note_sequence(crepe, samples, counters):
178
+ """Use CREPE to attempt to extract a monophonic NoteSequence from audio."""
179
+ f0_hz, f0_confidence = crepe.predict_f0_and_confidence(
180
+ samples[np.newaxis, :], viterbi=False)
181
+
182
+ f0_hz = f0_hz[0].numpy()
183
+ f0_confidence = f0_confidence[0].numpy()
184
+
185
+ if not is_monophonic_heuristic(f0_confidence):
186
+ counters['not_monophonic'] += 1
187
+ return None
188
+
189
+ obs_loglik, trans_loglik = f0_hmm_matrices(f0_hz, f0_confidence)
190
+
191
+ loglik = hmm_forward(obs_loglik, trans_loglik)
192
+ if loglik / len(obs_loglik) < PER_FRAME_LOGLIK_THRESHOLD:
193
+ counters['low_likelihood'] += 1
194
+ return None
195
+
196
+ pitches = hmm_viterbi(obs_loglik, trans_loglik)
197
+ ns = pitches_to_notesequence(pitches)
198
+
199
+ counters['extracted_monophonic_sequence'] += 1
200
+ return ns
201
+
202
+
203
+ def process_wav_file(wav_filename, crepe, counters):
204
+ """Extract monophonic transcription examples from a WAV file."""
205
+ wav_data = tf.io.gfile.GFile(wav_filename, 'rb').read()
206
+ samples = note_seq.audio_io.wav_data_to_samples_librosa(
207
+ wav_data, sample_rate=CREPE_SAMPLE_RATE)
208
+ clip_length_samples = int(CREPE_SAMPLE_RATE * CLIP_LENGTH_SECONDS)
209
+ for start_sample in range(0, len(samples), clip_length_samples):
210
+ clip_samples = samples[start_sample:start_sample + clip_length_samples]
211
+ if len(clip_samples) < clip_length_samples:
212
+ clip_samples = np.pad(
213
+ clip_samples, [(0, clip_length_samples - len(clip_samples))])
214
+ ns = extract_note_sequence(crepe, clip_samples, counters)
215
+ if ns:
216
+ feature = {
217
+ 'audio': tf.train.Feature(
218
+ float_list=tf.train.FloatList(value=clip_samples.tolist())),
219
+ 'filename': tf.train.Feature(
220
+ bytes_list=tf.train.BytesList(value=[wav_filename.encode()])),
221
+ 'offset': tf.train.Feature(
222
+ int64_list=tf.train.Int64List(value=[start_sample])),
223
+ 'sampling_rate': tf.train.Feature(
224
+ float_list=tf.train.FloatList(value=[CREPE_SAMPLE_RATE])),
225
+ 'sequence': tf.train.Feature(
226
+ bytes_list=tf.train.BytesList(value=[ns.SerializeToString()]))
227
+ }
228
+ yield tf.train.Example(features=tf.train.Features(feature=feature))
229
+
230
+
231
+ def main(unused_argv):
232
+ flags.mark_flags_as_required(['input_dir', 'output_tfrecord_path'])
233
+ crepe = ddsp.spectral_ops.PretrainedCREPE('full')
234
+ counters = collections.defaultdict(int)
235
+ with tf.io.TFRecordWriter(_OUTPUT_TFRECORD_PATH.value) as writer:
236
+ for filename in tf.io.gfile.listdir(_INPUT_DIR.value):
237
+ if not filename.endswith('.wav'):
238
+ logging.info('skipping %s...', filename)
239
+ counters['non_wav_files_skipped'] += 1
240
+ continue
241
+ logging.info('processing %s...', filename)
242
+ for ex in process_wav_file(
243
+ os.path.join(_INPUT_DIR.value, filename), crepe, counters):
244
+ writer.write(ex.SerializeToString())
245
+ counters['wav_files_processed'] += 1
246
+ for k, v in counters.items():
247
+ logging.info('COUNTER: %s = %d', k, v)
248
+
249
+
250
+ if __name__ == '__main__':
251
+ app.run(main)
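[Editor's note] One non-obvious piece of the script above is the note-transition model in f0_hmm_matrices. As a quick sanity check, here is a small sketch, not repo code, verifying that each row of trans_prob is a proper distribution over the 129 HMM states (128 MIDI pitches plus a rest state): a state repeats with probability 1 - NOTE_CHANGE_PROB and moves to each of the other 128 states with probability NOTE_CHANGE_PROB / 128.

import numpy as np

NOTES_PER_SECOND = 2
CREPE_FRAME_RATE = 100
NOTE_CHANGE_PROB = NOTES_PER_SECOND / CREPE_FRAME_RATE  # 0.02 per frame

# Same construction as in f0_hmm_matrices above.
trans_prob = ((NOTE_CHANGE_PROB / 128) * np.ones(129) +
              (1 - NOTE_CHANGE_PROB - NOTE_CHANGE_PROB / 128) * np.eye(129))

print(np.allclose(trans_prob.sum(axis=1), 1.0))  # True: every row sums to 1
print(trans_prob[0, 0])                          # 0.98, stay on the same note/rest
print(trans_prob[0, 1])                          # 0.00015625, move to one specific other state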
mt3/spectrograms.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Audio spectrogram functions."""
16
+
17
+ import dataclasses
18
+
19
+ from ddsp import spectral_ops
20
+ import tensorflow as tf
21
+
22
+ # defaults for spectrogram config
23
+ DEFAULT_SAMPLE_RATE = 16000
24
+ DEFAULT_HOP_WIDTH = 128
25
+ DEFAULT_NUM_MEL_BINS = 512
26
+
27
+ # fixed constants; add these to SpectrogramConfig before changing
28
+ FFT_SIZE = 2048
29
+ MEL_LO_HZ = 20.0
30
+
31
+
32
+ @dataclasses.dataclass
33
+ class SpectrogramConfig:
34
+ """Spectrogram configuration parameters."""
35
+ sample_rate: int = DEFAULT_SAMPLE_RATE
36
+ hop_width: int = DEFAULT_HOP_WIDTH
37
+ num_mel_bins: int = DEFAULT_NUM_MEL_BINS
38
+
39
+ @property
40
+ def abbrev_str(self):
41
+ s = ''
42
+ if self.sample_rate != DEFAULT_SAMPLE_RATE:
43
+ s += 'sr%d' % self.sample_rate
44
+ if self.hop_width != DEFAULT_HOP_WIDTH:
45
+ s += 'hw%d' % self.hop_width
46
+ if self.num_mel_bins != DEFAULT_NUM_MEL_BINS:
47
+ s += 'mb%d' % self.num_mel_bins
48
+ return s
49
+
50
+ @property
51
+ def frames_per_second(self):
52
+ return self.sample_rate / self.hop_width
53
+
54
+
55
+ def split_audio(samples, spectrogram_config):
56
+ """Split audio into frames."""
57
+ return tf.signal.frame(
58
+ samples,
59
+ frame_length=spectrogram_config.hop_width,
60
+ frame_step=spectrogram_config.hop_width,
61
+ pad_end=True)
62
+
63
+
64
+ def compute_spectrogram(samples, spectrogram_config):
65
+ """Compute a mel spectrogram."""
66
+ overlap = 1 - (spectrogram_config.hop_width / FFT_SIZE)
67
+ return spectral_ops.compute_logmel(
68
+ samples,
69
+ bins=spectrogram_config.num_mel_bins,
70
+ lo_hz=MEL_LO_HZ,
71
+ overlap=overlap,
72
+ fft_size=FFT_SIZE,
73
+ sample_rate=spectrogram_config.sample_rate)
74
+
75
+
76
+ def flatten_frames(frames):
77
+ """Convert frames back into a flat array of samples."""
78
+ return tf.reshape(frames, [-1])
79
+
80
+
81
+ def input_depth(spectrogram_config):
82
+ return spectrogram_config.num_mel_bins
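[Editor's note] A brief usage sketch for the helpers above, assuming the module is importable as mt3.spectrograms; the printed values follow from the defaults (16 kHz audio, hop_width 128, i.e. 125 frames per second). split_audio produces non-overlapping hop_width-sample frames and flatten_frames undoes the framing.

import tensorflow as tf
from mt3 import spectrograms

config = spectrograms.SpectrogramConfig()      # defaults: 16 kHz, hop_width 128, 512 mel bins
samples = tf.zeros([3 * config.sample_rate])   # 3 seconds of silence

frames = spectrograms.split_audio(samples, config)
print(frames.shape)                            # (375, 128)
print(config.frames_per_second)                # 125.0

roundtrip = spectrograms.flatten_frames(frames)
print(roundtrip.shape)                         # (48000,): lossless here because 48000
                                               # is an exact multiple of hop_width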
mt3/summaries.py ADDED
@@ -0,0 +1,471 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """TensorBoard summaries and utilities."""
16
+
17
+ from typing import Any, Mapping, Optional, Sequence, Tuple
18
+
19
+ import librosa
20
+
21
+ from mt3 import note_sequences
22
+ from mt3 import spectrograms
23
+
24
+ import note_seq
25
+ from note_seq import midi_synth
26
+ from note_seq import sequences_lib
27
+ from note_seq.protobuf import music_pb2
28
+
29
+ import numpy as np
30
+ import seqio
31
+
32
+
33
+ _DEFAULT_AUDIO_SECONDS = 30.0
34
+ _DEFAULT_PIANOROLL_FRAMES_PER_SECOND = 15
35
+
36
+ # TODO(iansimon): pick a SoundFont; for some reason the default is all organ
37
+
38
+
39
+ def _extract_example_audio(
40
+ examples: Sequence[Mapping[str, Any]],
41
+ sample_rate: float,
42
+ num_seconds: float,
43
+ audio_key: str = 'raw_inputs'
44
+ ) -> np.ndarray:
45
+ """Extract audio from examples.
46
+
47
+ Args:
48
+ examples: List of examples containing raw audio.
49
+ sample_rate: Number of samples per second.
50
+ num_seconds: Number of seconds of audio to include.
51
+ audio_key: Dictionary key for the raw audio.
52
+
53
+ Returns:
54
+ An n-by-num_samples numpy array of samples.
55
+ """
56
+ n = len(examples)
57
+ num_samples = round(num_seconds * sample_rate)
58
+ all_samples = np.zeros([n, num_samples])
59
+ for i, ex in enumerate(examples):
60
+ samples = ex[audio_key][:num_samples]
61
+ all_samples[i, :len(samples)] = samples
62
+ return all_samples
63
+
64
+
65
+ def _example_to_note_sequence(
66
+ example: Mapping[str, Sequence[float]],
67
+ ns_feature_name: str,
68
+ note_onset_feature_name: str,
69
+ note_offset_feature_name: str,
70
+ note_frequency_feature_name: str,
71
+ note_confidence_feature_name: str,
72
+ num_seconds: float
73
+ ) -> music_pb2.NoteSequence:
74
+ """Extract NoteSequence from example."""
75
+ if ns_feature_name:
76
+ ns = example[ns_feature_name]
77
+
78
+ else:
79
+ onset_times = np.array(example[note_onset_feature_name])
80
+ pitches = librosa.hz_to_midi(
81
+ example[note_frequency_feature_name]).round().astype(int)
82
+ assert len(onset_times) == len(pitches)
83
+
84
+ if note_offset_feature_name or note_confidence_feature_name:
85
+ offset_times = (
86
+ example[note_offset_feature_name]
87
+ if note_offset_feature_name
88
+ else onset_times + note_sequences.DEFAULT_NOTE_DURATION
89
+ )
90
+ assert len(onset_times) == len(offset_times)
91
+
92
+ confidences = (np.array(example[note_confidence_feature_name])
93
+ if note_confidence_feature_name else None)
94
+ velocities = np.ceil(
95
+ note_seq.MAX_MIDI_VELOCITY * confidences if confidences is not None
96
+ else note_sequences.DEFAULT_VELOCITY * np.ones_like(onset_times)
97
+ ).astype(int)
98
+ assert len(onset_times) == len(velocities)
99
+
100
+ ns = note_sequences.note_arrays_to_note_sequence(
101
+ onset_times=onset_times, offset_times=offset_times,
102
+ pitches=pitches, velocities=velocities)
103
+
104
+ else:
105
+ ns = note_sequences.note_arrays_to_note_sequence(
106
+ onset_times=onset_times, pitches=pitches)
107
+
108
+ return sequences_lib.trim_note_sequence(ns, 0, num_seconds)
109
+
110
+
111
+ def _synthesize_example_notes(
112
+ examples: Sequence[Mapping[str, Sequence[float]]],
113
+ ns_feature_name: str,
114
+ note_onset_feature_name: str,
115
+ note_offset_feature_name: str,
116
+ note_frequency_feature_name: str,
117
+ note_confidence_feature_name: str,
118
+ sample_rate: float,
119
+ num_seconds: float,
120
+ ) -> np.ndarray:
121
+ """Synthesize example notes to audio.
122
+
123
+ Args:
124
+ examples: List of example dictionaries, containing either serialized
125
+ NoteSequence protos or note onset times and pitches.
126
+ ns_feature_name: Name of serialized NoteSequence feature.
127
+ note_onset_feature_name: Name of note onset times feature.
128
+ note_offset_feature_name: Name of note offset times feature.
129
+ note_frequency_feature_name: Name of note frequencies feature.
130
+ note_confidence_feature_name: Name of note confidences (velocities) feature.
131
+ sample_rate: Sample rate at which to synthesize.
132
+ num_seconds: Number of seconds to synthesize for each example.
133
+
134
+ Returns:
135
+ An n-by-num_samples numpy array of samples.
136
+ """
137
+ if (ns_feature_name is not None) == (note_onset_feature_name is not None):
138
+ raise ValueError(
139
+ 'must specify exactly one of NoteSequence feature and onset feature')
140
+
141
+ n = len(examples)
142
+ num_samples = round(num_seconds * sample_rate)
143
+
144
+ all_samples = np.zeros([n, num_samples])
145
+
146
+ for i, ex in enumerate(examples):
147
+ ns = _example_to_note_sequence(
148
+ ex,
149
+ ns_feature_name=ns_feature_name,
150
+ note_onset_feature_name=note_onset_feature_name,
151
+ note_offset_feature_name=note_offset_feature_name,
152
+ note_frequency_feature_name=note_frequency_feature_name,
153
+ note_confidence_feature_name=note_confidence_feature_name,
154
+ num_seconds=num_seconds)
155
+ fluidsynth = midi_synth.fluidsynth
156
+ samples = fluidsynth(ns, sample_rate=sample_rate)
157
+ if len(samples) > num_samples:
158
+ samples = samples[:num_samples]
159
+ all_samples[i, :len(samples)] = samples
160
+
161
+ return all_samples
162
+
163
+
164
+ def _examples_to_pianorolls(
165
+ targets: Sequence[Mapping[str, Sequence[float]]],
166
+ predictions: Sequence[Mapping[str, Sequence[float]]],
167
+ ns_feature_suffix: str,
168
+ note_onset_feature_suffix: str,
169
+ note_offset_feature_suffix: str,
170
+ note_frequency_feature_suffix: str,
171
+ note_confidence_feature_suffix: str,
172
+ track_specs: Optional[Sequence[note_sequences.TrackSpec]],
173
+ num_seconds: float,
174
+ frames_per_second: float
175
+ ) -> Tuple[np.ndarray, np.ndarray]:
176
+ """Generate pianoroll images from example notes.
177
+
178
+ Args:
179
+ targets: List of target dictionaries, containing either serialized
180
+ NoteSequence protos or note onset times and pitches.
181
+ predictions: List of prediction dictionaries, containing either serialized
182
+ NoteSequence protos or note onset times and pitches.
183
+ ns_feature_suffix: Suffix of serialized NoteSequence feature.
184
+ note_onset_feature_suffix: Suffix of note onset times feature.
185
+ note_offset_feature_suffix: Suffix of note offset times feature.
186
+ note_frequency_feature_suffix: Suffix of note frequencies feature.
187
+ note_confidence_feature_suffix: Suffix of note confidences (velocities)
188
+ feature.
189
+ track_specs: Optional list of TrackSpec objects to indicate a set of tracks
190
+ into which each NoteSequence should be split. Tracks will be stacked
191
+ vertically in the pianorolls
192
+ num_seconds: Number of seconds to show for each example.
193
+ frames_per_second: Number of pianoroll frames per second.
194
+
195
+ Returns:
196
+ onset_pianorolls: An n-by-num_pitches-by-num_frames-by-4 numpy array of
197
+ pianoroll images showing only onsets.
198
+ full_pianorolls: An n-by-num_pitches-by-num_frames-by-4 numpy array of
199
+ pianoroll images.
200
+ """
201
+ if (ns_feature_suffix is not None) == (note_onset_feature_suffix is not None):
202
+ raise ValueError(
203
+ 'must specify exactly one of NoteSequence feature and onset feature')
204
+
205
+ def ex_to_ns(example, prefix):
206
+ return _example_to_note_sequence(
207
+ example=example,
208
+ ns_feature_name=(prefix + ns_feature_suffix
209
+ if ns_feature_suffix else None),
210
+ note_onset_feature_name=(prefix + note_onset_feature_suffix
211
+ if note_onset_feature_suffix else None),
212
+ note_offset_feature_name=(prefix + note_offset_feature_suffix
213
+ if note_offset_feature_suffix else None),
214
+ note_frequency_feature_name=(
215
+ prefix + note_frequency_feature_suffix
216
+ if note_frequency_feature_suffix else None),
217
+ note_confidence_feature_name=(
218
+ prefix + note_confidence_feature_suffix
219
+ if note_confidence_feature_suffix else None),
220
+ num_seconds=num_seconds)
221
+
222
+ n = len(targets)
223
+ num_pitches = note_seq.MAX_MIDI_PITCH - note_seq.MIN_MIDI_PITCH + 1
224
+ num_frames = round(num_seconds * frames_per_second)
225
+ num_tracks = len(track_specs) if track_specs else 1
226
+ pianoroll_height = num_tracks * num_pitches + (num_tracks - 1)
227
+
228
+ onset_images = np.zeros([n, pianoroll_height, num_frames, 3])
229
+ full_images = np.zeros([n, pianoroll_height, num_frames, 3])
230
+
231
+ for i, (target, pred) in enumerate(zip(targets, predictions)):
232
+ target_ns, pred_ns = [
233
+ ex_to_ns(ex, prefix)
234
+ for (ex, prefix) in [(target, 'ref_'), (pred, 'est_')]
235
+ ]
236
+
237
+ # Show lines at frame boundaries. To ensure that these lines are drawn with
238
+ # the same downsampling and frame selection logic as the real NoteSequences,
239
+ # use this hack to draw the lines with a NoteSequence that contains notes
240
+ # across all pitches at all frame start times.
241
+ start_times_ns = note_seq.NoteSequence()
242
+ start_times_ns.CopyFrom(target_ns)
243
+ del start_times_ns.notes[:]
244
+ for start_time in pred['start_times']:
245
+ if start_time < target_ns.total_time:
246
+ for pitch in range(
247
+ note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH + 1):
248
+ start_times_ns.notes.add(
249
+ pitch=pitch,
250
+ velocity=100,
251
+ start_time=start_time,
252
+ end_time=start_time + (1 / frames_per_second))
253
+
254
+ start_time_roll = sequences_lib.sequence_to_pianoroll(
255
+ start_times_ns,
256
+ frames_per_second=frames_per_second,
257
+ min_pitch=note_seq.MIN_MIDI_PITCH,
258
+ max_pitch=note_seq.MAX_MIDI_PITCH,
259
+ onset_mode='length_ms')
260
+ num_start_time_frames = min(len(start_time_roll.onsets), num_frames)
261
+
262
+ if track_specs is not None:
263
+ target_tracks = [note_sequences.extract_track(target_ns,
264
+ spec.program, spec.is_drum)
265
+ for spec in track_specs]
266
+ pred_tracks = [note_sequences.extract_track(pred_ns,
267
+ spec.program, spec.is_drum)
268
+ for spec in track_specs]
269
+ else:
270
+ target_tracks = [target_ns]
271
+ pred_tracks = [pred_ns]
272
+
273
+ for j, (target_track, pred_track) in enumerate(zip(target_tracks[::-1],
274
+ pred_tracks[::-1])):
275
+ target_roll = sequences_lib.sequence_to_pianoroll(
276
+ target_track,
277
+ frames_per_second=frames_per_second,
278
+ min_pitch=note_seq.MIN_MIDI_PITCH,
279
+ max_pitch=note_seq.MAX_MIDI_PITCH,
280
+ onset_mode='length_ms')
281
+ pred_roll = sequences_lib.sequence_to_pianoroll(
282
+ pred_track,
283
+ frames_per_second=frames_per_second,
284
+ min_pitch=note_seq.MIN_MIDI_PITCH,
285
+ max_pitch=note_seq.MAX_MIDI_PITCH,
286
+ onset_mode='length_ms')
287
+
288
+ num_target_frames = min(len(target_roll.onsets), num_frames)
289
+ num_pred_frames = min(len(pred_roll.onsets), num_frames)
290
+
291
+ start_offset = j * (num_pitches + 1)
292
+ end_offset = (j + 1) * (num_pitches + 1) - 1
293
+
294
+ # Onsets
295
+ onset_images[
296
+ i, start_offset:end_offset, :num_start_time_frames, 0
297
+ ] = start_time_roll.onsets[:num_start_time_frames, :].T
298
+ onset_images[
299
+ i, start_offset:end_offset, :num_target_frames, 1
300
+ ] = target_roll.onsets[:num_target_frames, :].T
301
+ onset_images[
302
+ i, start_offset:end_offset, :num_pred_frames, 2
303
+ ] = pred_roll.onsets[:num_pred_frames, :].T
304
+
305
+ # Full notes
306
+ full_images[
307
+ i, start_offset:end_offset, :num_start_time_frames, 0
308
+ ] = start_time_roll.onsets[:num_start_time_frames, :].T
309
+ full_images[
310
+ i, start_offset:end_offset, :num_target_frames, 1
311
+ ] = target_roll.active[:num_target_frames, :].T
312
+ full_images[
313
+ i, start_offset:end_offset, :num_pred_frames, 2
314
+ ] = pred_roll.active[:num_pred_frames, :].T
315
+
316
+ # Add separator between tracks.
317
+ if j < num_tracks - 1:
318
+ onset_images[i, end_offset, :, 0] = 1
319
+ full_images[i, end_offset, :, 0] = 1
320
+
321
+ return onset_images[:, ::-1, :, :], full_images[:, ::-1, :, :]
322
+
323
+
324
+ def prettymidi_pianoroll(
325
+ track_pianorolls: Mapping[str, Sequence[Tuple[np.ndarray, np.ndarray]]],
326
+ fps: float,
327
+ num_seconds=_DEFAULT_AUDIO_SECONDS
328
+ ) -> Mapping[str, seqio.metrics.MetricValue]:
329
+ """Create summary from given pianorolls."""
330
+ max_len = int(num_seconds * fps)
331
+ summaries = {}
332
+ for inst_name, all_prs in track_pianorolls.items():
333
+
334
+ est_prs, ref_prs = zip(*all_prs)
335
+
336
+ bs = len(ref_prs)
337
+ pianoroll_image_batch = np.zeros(shape=(bs, 128, max_len, 3))
338
+ for i in range(bs):
339
+ ref_pr = ref_prs[i][:, :max_len]
340
+ est_pr = est_prs[i][:, :max_len]
341
+
342
+ pianoroll_image_batch[i, :, :est_pr.shape[1], 2] = est_pr
343
+ pianoroll_image_batch[i, :, :ref_pr.shape[1], 1] = ref_pr
344
+ if not inst_name:
345
+ inst_name = 'all instruments'
346
+
347
+ summaries[f'{inst_name} pretty_midi pianoroll'] = seqio.metrics.Image(
348
+ image=pianoroll_image_batch, max_outputs=bs)
349
+
350
+ return summaries
351
+
352
+
353
+ def audio_summaries(
354
+ targets: Sequence[Mapping[str, Sequence[float]]],
355
+ predictions: Sequence[Mapping[str, Sequence[float]]],
356
+ spectrogram_config: spectrograms.SpectrogramConfig,
357
+ num_seconds: float = _DEFAULT_AUDIO_SECONDS
358
+ ) -> Mapping[str, seqio.metrics.MetricValue]:
359
+ """Compute audio summaries for a list of examples.
360
+
361
+ Args:
362
+ targets: List of targets, unused as we pass the input audio tokens via
363
+ predictions.
364
+ predictions: List of predictions, including input audio tokens.
365
+ spectrogram_config: Spectrogram configuration.
366
+ num_seconds: Number of seconds of audio to include in the summaries.
367
+ Longer audio will be cropped (from the beginning), shorter audio will be
368
+ padded with silence (at the end).
369
+
370
+ Returns:
371
+ A dictionary mapping "audio" to the audio summaries.
372
+ """
373
+ del targets
374
+ samples = _extract_example_audio(
375
+ examples=predictions,
376
+ sample_rate=spectrogram_config.sample_rate,
377
+ num_seconds=num_seconds)
378
+ return {
379
+ 'audio': seqio.metrics.Audio(
380
+ audiodata=samples[:, :, np.newaxis],
381
+ sample_rate=spectrogram_config.sample_rate,
382
+ max_outputs=samples.shape[0])
383
+ }
384
+
385
+
386
+ def transcription_summaries(
387
+ targets: Sequence[Mapping[str, Sequence[float]]],
388
+ predictions: Sequence[Mapping[str, Sequence[float]]],
389
+ spectrogram_config: spectrograms.SpectrogramConfig,
390
+ ns_feature_suffix: Optional[str] = None,
391
+ note_onset_feature_suffix: Optional[str] = None,
392
+ note_offset_feature_suffix: Optional[str] = None,
393
+ note_frequency_feature_suffix: Optional[str] = None,
394
+ note_confidence_feature_suffix: Optional[str] = None,
395
+ track_specs: Optional[Sequence[note_sequences.TrackSpec]] = None,
396
+ num_seconds: float = _DEFAULT_AUDIO_SECONDS,
397
+ pianoroll_frames_per_second: float = _DEFAULT_PIANOROLL_FRAMES_PER_SECOND,
398
+ ) -> Mapping[str, seqio.metrics.MetricValue]:
399
+ """Compute note transcription summaries for multiple examples.
400
+
401
+ Args:
402
+ targets: List of targets containing ground truth.
403
+ predictions: List of predictions, including raw input audio.
404
+ spectrogram_config: The spectrogram configuration.
405
+ ns_feature_suffix: Suffix of serialized NoteSequence feature.
406
+ note_onset_feature_suffix: Suffix of note onset times feature.
407
+ note_offset_feature_suffix: Suffix of note offset times feature.
408
+ note_frequency_feature_suffix: Suffix of note frequencies feature.
409
+ note_confidence_feature_suffix: Suffix of note confidences (velocities)
410
+ feature.
411
+ track_specs: Optional list of TrackSpec objects to indicate a set of tracks
412
+ into which each NoteSequence should be split.
413
+ num_seconds: Number of seconds of audio to include in the summaries.
414
+ Longer audio will be cropped (from the beginning), shorter audio will be
415
+ padded with silence (at the end).
416
+ pianoroll_frames_per_second: Temporal resolution of pianoroll images.
417
+
418
+ Returns:
419
+ A dictionary of input, ground truth, and transcription summaries.
420
+ """
421
+ audio_samples = _extract_example_audio(
422
+ examples=predictions,
423
+ sample_rate=spectrogram_config.sample_rate,
424
+ num_seconds=num_seconds)
425
+
426
+ def synthesize(examples, prefix):
427
+ return _synthesize_example_notes(
428
+ examples=examples,
429
+ ns_feature_name=(prefix + ns_feature_suffix
430
+ if ns_feature_suffix else None),
431
+ note_onset_feature_name=(prefix + note_onset_feature_suffix
432
+ if note_onset_feature_suffix else None),
433
+ note_offset_feature_name=(prefix + note_offset_feature_suffix
434
+ if note_offset_feature_suffix else None),
435
+ note_frequency_feature_name=(
436
+ prefix + note_frequency_feature_suffix
437
+ if note_frequency_feature_suffix else None),
438
+ note_confidence_feature_name=(
439
+ prefix + note_confidence_feature_suffix
440
+ if note_confidence_feature_suffix else None),
441
+ sample_rate=spectrogram_config.sample_rate,
442
+ num_seconds=num_seconds)
443
+
444
+ synthesized_predictions = synthesize(predictions, 'est_')
445
+
446
+ onset_pianoroll_images, full_pianoroll_images = _examples_to_pianorolls(
447
+ targets=targets,
448
+ predictions=predictions,
449
+ ns_feature_suffix=ns_feature_suffix,
450
+ note_onset_feature_suffix=note_onset_feature_suffix,
451
+ note_offset_feature_suffix=note_offset_feature_suffix,
452
+ note_frequency_feature_suffix=note_frequency_feature_suffix,
453
+ note_confidence_feature_suffix=note_confidence_feature_suffix,
454
+ track_specs=track_specs,
455
+ num_seconds=num_seconds,
456
+ frames_per_second=pianoroll_frames_per_second)
457
+
458
+ return {
459
+ 'input_with_transcription': seqio.metrics.Audio(
460
+ audiodata=np.stack([audio_samples, synthesized_predictions], axis=2),
461
+ sample_rate=spectrogram_config.sample_rate,
462
+ max_outputs=audio_samples.shape[0]),
463
+
464
+ 'pianoroll': seqio.metrics.Image(
465
+ image=full_pianoroll_images,
466
+ max_outputs=full_pianoroll_images.shape[0]),
467
+
468
+ 'onset_pianoroll': seqio.metrics.Image(
469
+ image=onset_pianoroll_images,
470
+ max_outputs=onset_pianoroll_images.shape[0]),
471
+ }
mt3/tasks.py ADDED
@@ -0,0 +1,402 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Transcription task definitions."""
16
+
17
+ import functools
18
+ from typing import Optional, Sequence
19
+
20
+ from mt3 import datasets
21
+ from mt3 import event_codec
22
+ from mt3 import metrics
23
+ from mt3 import mixing
24
+ from mt3 import preprocessors
25
+ from mt3 import run_length_encoding
26
+ from mt3 import spectrograms
27
+ from mt3 import vocabularies
28
+
29
+ import note_seq
30
+ import numpy as np
31
+ import seqio
32
+ import t5
33
+ import tensorflow as tf
34
+
35
+ # Split audio frame sequences into this length before the cache placeholder.
36
+ MAX_NUM_CACHED_FRAMES = 2000
37
+
38
+ seqio.add_global_cache_dirs(['gs://mt3/data/cache_tasks/'])
39
+
40
+
41
+ def construct_task_name(
42
+ task_prefix: str,
43
+ spectrogram_config=spectrograms.SpectrogramConfig(),
44
+ vocab_config=vocabularies.VocabularyConfig(),
45
+ task_suffix: Optional[str] = None
46
+ ) -> str:
47
+ """Construct task name from prefix, config, and optional suffix."""
48
+ fields = [task_prefix]
49
+ if spectrogram_config.abbrev_str:
50
+ fields.append(spectrogram_config.abbrev_str)
51
+ if vocab_config.abbrev_str:
52
+ fields.append(vocab_config.abbrev_str)
53
+ if task_suffix:
54
+ fields.append(task_suffix)
55
+ return '_'.join(fields)
56
+
57
+
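A minimal usage sketch of the naming scheme (not part of the diff). The dataset prefix is illustrative, and it assumes the default SpectrogramConfig abbreviation is empty; the 'vb1' part comes from VocabularyConfig.abbrev_str in mt3/vocabularies.py below.

```python
# Sketch: assemble a task name from prefix + config abbreviations + suffix.
from mt3 import spectrograms, vocabularies
from mt3.tasks import construct_task_name  # importing mt3.tasks also registers the tasks

name = construct_task_name(
    task_prefix='maestrov3_notes_ties',
    spectrogram_config=spectrograms.SpectrogramConfig(),
    vocab_config=vocabularies.VocabularyConfig(num_velocity_bins=1),
    task_suffix='train')
# name == 'maestrov3_notes_ties_vb1_train' (if the spectrogram abbreviation is empty)
```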
58
+ def trim_eos(tokens: Sequence[int]) -> np.ndarray:
59
+ """If EOS is present, remove it and everything after."""
60
+ tokens = np.array(tokens, np.int32)
61
+ if vocabularies.DECODED_EOS_ID in tokens:
62
+ tokens = tokens[:np.argmax(tokens == vocabularies.DECODED_EOS_ID)]
63
+ return tokens
64
+
65
+
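A tiny worked example of trim_eos (illustration only; DECODED_EOS_ID is -1, as defined in mt3/vocabularies.py below):

```python
# Everything from the first decoded EOS (-1) onward is dropped.
trim_eos([17, 42, -1, 5])   # -> array([17, 42], dtype=int32)
```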
66
+ def postprocess(tokens, example, is_target, codec):
67
+ """Transcription postprocessing function."""
68
+ tokens = trim_eos(tokens)
69
+
70
+ if is_target:
71
+ return {
72
+ 'unique_id': example['unique_id'][0],
73
+ 'ref_ns': (note_seq.NoteSequence.FromString(example['sequence'][0])
74
+ if example['sequence'][0] else None),
75
+ 'ref_tokens': tokens,
76
+ }
77
+
78
+ start_time = example['input_times'][0]
79
+ # Round down to nearest symbolic token step.
80
+ start_time -= start_time % (1 / codec.steps_per_second)
81
+
82
+ return {
83
+ 'unique_id': example['unique_id'][0],
84
+ 'raw_inputs': example['raw_inputs'],
85
+ 'est_tokens': tokens,
86
+ 'start_time': start_time
87
+ }
88
+
89
+
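A worked example of the start-time rounding in postprocess (illustration only; assumes codec.steps_per_second == 100):

```python
# Snap a chunk start time down to the previous symbolic token step (10 ms here).
steps_per_second = 100
start_time = 12.3456
start_time -= start_time % (1 / steps_per_second)
# start_time is now ~12.34 (subject to float rounding)
```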
90
+ def add_transcription_task_to_registry(
91
+ dataset_config: datasets.DatasetConfig,
92
+ spectrogram_config: spectrograms.SpectrogramConfig,
93
+ vocab_config: vocabularies.VocabularyConfig,
94
+ tokenize_fn, # TODO(iansimon): add type signature
95
+ onsets_only: bool,
96
+ include_ties: bool,
97
+ skip_too_long: bool = False
98
+ ) -> None:
99
+ """Add note transcription task to seqio.TaskRegistry."""
100
+ codec = vocabularies.build_codec(vocab_config)
101
+ vocabulary = vocabularies.vocabulary_from_codec(codec)
102
+
103
+ output_features = {
104
+ 'targets': seqio.Feature(vocabulary=vocabulary),
105
+ 'inputs': seqio.ContinuousFeature(dtype=tf.float32, rank=2)
106
+ }
107
+
108
+ task_name = 'onsets' if onsets_only else 'notes'
109
+ if include_ties:
110
+ task_name += '_ties'
111
+ task_prefix = f'{dataset_config.name}_{task_name}'
112
+
113
+ train_task_name = construct_task_name(
114
+ task_prefix=task_prefix,
115
+ spectrogram_config=spectrogram_config,
116
+ vocab_config=vocab_config,
117
+ task_suffix='train')
118
+
119
+ mixture_task_names = []
120
+
121
+ tie_token = codec.encode_event(event_codec.Event('tie', 0))
122
+ track_specs = (dataset_config.track_specs
123
+ if dataset_config.track_specs else None)
124
+
125
+ # Add transcription training task.
126
+ seqio.TaskRegistry.add(
127
+ train_task_name,
128
+ source=seqio.TFExampleDataSource(
129
+ split_to_filepattern={
130
+ 'train': dataset_config.paths[dataset_config.train_split],
131
+ 'eval': dataset_config.paths[dataset_config.train_eval_split]
132
+ },
133
+ feature_description=dataset_config.features),
134
+ output_features=output_features,
135
+ preprocessors=[
136
+ functools.partial(
137
+ tokenize_fn,
138
+ spectrogram_config=spectrogram_config, codec=codec,
139
+ is_training_data=True, onsets_only=onsets_only,
140
+ include_ties=include_ties),
141
+ functools.partial(
142
+ t5.data.preprocessors.split_tokens,
143
+ max_tokens_per_segment=MAX_NUM_CACHED_FRAMES,
144
+ feature_key='inputs',
145
+ additional_feature_keys=[
146
+ 'input_event_start_indices', 'input_event_end_indices',
147
+ 'input_state_event_indices'
148
+ ],
149
+ passthrough_feature_keys=['targets', 'state_events']),
150
+ seqio.CacheDatasetPlaceholder(),
151
+ functools.partial(
152
+ t5.data.preprocessors.select_random_chunk,
153
+ feature_key='inputs',
154
+ additional_feature_keys=[
155
+ 'input_event_start_indices', 'input_event_end_indices',
156
+ 'input_state_event_indices'
157
+ ],
158
+ passthrough_feature_keys=['targets', 'state_events'],
159
+ uniform_random_start=True),
160
+ functools.partial(
161
+ run_length_encoding.extract_target_sequence_with_indices,
162
+ state_events_end_token=tie_token if include_ties else None),
163
+ functools.partial(preprocessors.map_midi_programs, codec=codec),
164
+ run_length_encoding.run_length_encode_shifts_fn(
165
+ codec,
166
+ feature_key='targets'),
167
+ functools.partial(
168
+ mixing.mix_transcription_examples,
169
+ codec=codec,
170
+ targets_feature_keys=['targets']),
171
+ run_length_encoding.remove_redundant_state_changes_fn(
172
+ feature_key='targets', codec=codec,
173
+ state_change_event_types=['velocity', 'program']),
174
+ functools.partial(
175
+ preprocessors.compute_spectrograms,
176
+ spectrogram_config=spectrogram_config),
177
+ functools.partial(preprocessors.handle_too_long, skip=skip_too_long),
178
+ functools.partial(
179
+ seqio.preprocessors.tokenize_and_append_eos,
180
+ copy_pretokenized=False)
181
+ ],
182
+ postprocess_fn=None,
183
+ metric_fns=[],
184
+ )
185
+
186
+ # Add transcription eval tasks.
187
+ for split in dataset_config.infer_eval_splits:
188
+ eval_task_name = construct_task_name(
189
+ task_prefix=task_prefix,
190
+ spectrogram_config=spectrogram_config,
191
+ vocab_config=vocab_config,
192
+ task_suffix=split.suffix)
193
+
194
+ if split.include_in_mixture:
195
+ mixture_task_names.append(eval_task_name)
196
+
197
+ seqio.TaskRegistry.add(
198
+ eval_task_name,
199
+ source=seqio.TFExampleDataSource(
200
+ split_to_filepattern={'eval': dataset_config.paths[split.name]},
201
+ feature_description=dataset_config.features),
202
+ output_features=output_features,
203
+ preprocessors=[
204
+ functools.partial(
205
+ tokenize_fn,
206
+ spectrogram_config=spectrogram_config, codec=codec,
207
+ is_training_data='train' in split.name, onsets_only=onsets_only,
208
+ include_ties=include_ties),
209
+ seqio.CacheDatasetPlaceholder(),
210
+ preprocessors.add_unique_id,
211
+ preprocessors.pad_notesequence_array,
212
+ functools.partial(
213
+ t5.data.preprocessors.split_tokens_to_inputs_length,
214
+ feature_key='inputs',
215
+ additional_feature_keys=['input_times', 'sequence'],
216
+ passthrough_feature_keys=['unique_id']),
217
+ # Add dummy targets as they are dropped during the above split to
218
+ # avoid memory blowups, but are expected to be present by seqio; the
219
+ # evaluation metrics currently only use the target NoteSequence.
220
+ preprocessors.add_dummy_targets,
221
+ functools.partial(
222
+ preprocessors.compute_spectrograms,
223
+ spectrogram_config=spectrogram_config),
224
+ functools.partial(preprocessors.handle_too_long, skip=False),
225
+ functools.partial(
226
+ seqio.preprocessors.tokenize_and_append_eos,
227
+ copy_pretokenized=False)
228
+ ],
229
+ postprocess_fn=functools.partial(postprocess, codec=codec),
230
+ metric_fns=[
231
+ functools.partial(
232
+ metrics.transcription_metrics,
233
+ codec=codec,
234
+ spectrogram_config=spectrogram_config,
235
+ onsets_only=onsets_only,
236
+ use_ties=include_ties,
237
+ track_specs=track_specs)
238
+ ],
239
+ )
240
+
241
+ seqio.MixtureRegistry.add(
242
+ construct_task_name(
243
+ task_prefix=task_prefix, spectrogram_config=spectrogram_config,
244
+ vocab_config=vocab_config, task_suffix='eval'),
245
+ mixture_task_names,
246
+ default_rate=1)
247
+
248
+
249
+ # Just use default spectrogram config.
250
+ SPECTROGRAM_CONFIG = spectrograms.SpectrogramConfig()
251
+
252
+ # Create two vocabulary configs, one default and one with only on-off velocity.
253
+ VOCAB_CONFIG_FULL = vocabularies.VocabularyConfig()
254
+ VOCAB_CONFIG_NOVELOCITY = vocabularies.VocabularyConfig(num_velocity_bins=1)
255
+
256
+ # Transcribe MAESTRO v1.
257
+ add_transcription_task_to_registry(
258
+ dataset_config=datasets.MAESTROV1_CONFIG,
259
+ spectrogram_config=SPECTROGRAM_CONFIG,
260
+ vocab_config=VOCAB_CONFIG_FULL,
261
+ tokenize_fn=functools.partial(
262
+ preprocessors.tokenize_transcription_example,
263
+ audio_is_samples=False,
264
+ id_feature_key='id'),
265
+ onsets_only=False,
266
+ include_ties=False)
267
+
268
+ # Transcribe MAESTRO v3.
269
+ add_transcription_task_to_registry(
270
+ dataset_config=datasets.MAESTROV3_CONFIG,
271
+ spectrogram_config=SPECTROGRAM_CONFIG,
272
+ vocab_config=VOCAB_CONFIG_FULL,
273
+ tokenize_fn=functools.partial(
274
+ preprocessors.tokenize_transcription_example,
275
+ audio_is_samples=False,
276
+ id_feature_key='id'),
277
+ onsets_only=False,
278
+ include_ties=False)
279
+
280
+ # Transcribe MAESTRO v3 without velocities, with ties.
281
+ add_transcription_task_to_registry(
282
+ dataset_config=datasets.MAESTROV3_CONFIG,
283
+ spectrogram_config=SPECTROGRAM_CONFIG,
284
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
285
+ tokenize_fn=functools.partial(
286
+ preprocessors.tokenize_transcription_example,
287
+ audio_is_samples=False,
288
+ id_feature_key='id'),
289
+ onsets_only=False,
290
+ include_ties=True)
291
+
292
+ # Transcribe GuitarSet, with ties.
293
+ add_transcription_task_to_registry(
294
+ dataset_config=datasets.GUITARSET_CONFIG,
295
+ spectrogram_config=SPECTROGRAM_CONFIG,
296
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
297
+ tokenize_fn=preprocessors.tokenize_guitarset_example,
298
+ onsets_only=False,
299
+ include_ties=True)
300
+
301
+ # Transcribe URMP mixes, with ties.
302
+ add_transcription_task_to_registry(
303
+ dataset_config=datasets.URMP_CONFIG,
304
+ spectrogram_config=SPECTROGRAM_CONFIG,
305
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
306
+ tokenize_fn=functools.partial(
307
+ preprocessors.tokenize_example_with_program_lookup,
308
+ inst_name_to_program_fn=preprocessors.urmp_instrument_to_program,
309
+ id_feature_key='id'),
310
+ onsets_only=False,
311
+ include_ties=True)
312
+
313
+ # Transcribe MusicNet, with ties.
314
+ add_transcription_task_to_registry(
315
+ dataset_config=datasets.MUSICNET_CONFIG,
316
+ spectrogram_config=SPECTROGRAM_CONFIG,
317
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
318
+ tokenize_fn=functools.partial(
319
+ preprocessors.tokenize_transcription_example,
320
+ audio_is_samples=True,
321
+ id_feature_key='id'),
322
+ onsets_only=False,
323
+ include_ties=True)
324
+
325
+ # Transcribe MusicNetEM, with ties.
326
+ add_transcription_task_to_registry(
327
+ dataset_config=datasets.MUSICNET_EM_CONFIG,
328
+ spectrogram_config=SPECTROGRAM_CONFIG,
329
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
330
+ tokenize_fn=functools.partial(
331
+ preprocessors.tokenize_transcription_example,
332
+ audio_is_samples=True,
333
+ id_feature_key='id'),
334
+ onsets_only=False,
335
+ include_ties=True)
336
+
337
+ # Transcribe Cerberus4 (piano-guitar-bass-drums quartets), with ties.
338
+ add_transcription_task_to_registry(
339
+ dataset_config=datasets.CERBERUS4_CONFIG,
340
+ spectrogram_config=SPECTROGRAM_CONFIG,
341
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
342
+ tokenize_fn=functools.partial(
343
+ preprocessors.tokenize_slakh_example,
344
+ track_specs=datasets.CERBERUS4_CONFIG.track_specs,
345
+ ignore_pitch_bends=True),
346
+ onsets_only=False,
347
+ include_ties=True)
348
+
349
+ # Transcribe 10 random sub-mixes of each song from Slakh, with ties.
350
+ add_transcription_task_to_registry(
351
+ dataset_config=datasets.SLAKH_CONFIG,
352
+ spectrogram_config=SPECTROGRAM_CONFIG,
353
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
354
+ tokenize_fn=functools.partial(
355
+ preprocessors.tokenize_slakh_example,
356
+ track_specs=None,
357
+ ignore_pitch_bends=True),
358
+ onsets_only=False,
359
+ include_ties=True)
360
+
361
+
362
+ # Construct task names to include in transcription mixture.
363
+ MIXTURE_DATASET_NAMES = [
364
+ 'maestrov3', 'guitarset', 'urmp', 'musicnet_em', 'cerberus4', 'slakh'
365
+ ]
366
+ MIXTURE_TRAIN_TASK_NAMES = []
367
+ MIXTURE_EVAL_TASK_NAMES = []
368
+ MIXTURE_TEST_TASK_NAMES = []
369
+ for dataset_name in MIXTURE_DATASET_NAMES:
370
+ MIXTURE_TRAIN_TASK_NAMES.append(
371
+ construct_task_name(task_prefix=f'{dataset_name}_notes_ties',
372
+ spectrogram_config=SPECTROGRAM_CONFIG,
373
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
374
+ task_suffix='train'))
375
+ MIXTURE_EVAL_TASK_NAMES.append(
376
+ construct_task_name(task_prefix=f'{dataset_name}_notes_ties',
377
+ spectrogram_config=SPECTROGRAM_CONFIG,
378
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
379
+ task_suffix='validation'))
380
+ MIXING_TEMPERATURE = 10 / 3
381
+
382
+ # Add the mixture of all transcription tasks, with ties.
383
+ seqio.MixtureRegistry.add(
384
+ construct_task_name(
385
+ task_prefix='mega_notes_ties',
386
+ spectrogram_config=SPECTROGRAM_CONFIG,
387
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
388
+ task_suffix='train'),
389
+ MIXTURE_TRAIN_TASK_NAMES,
390
+ default_rate=functools.partial(
391
+ seqio.mixing_rate_num_examples,
392
+ temperature=MIXING_TEMPERATURE))
393
+ seqio.MixtureRegistry.add(
394
+ construct_task_name(
395
+ task_prefix='mega_notes_ties',
396
+ spectrogram_config=SPECTROGRAM_CONFIG,
397
+ vocab_config=VOCAB_CONFIG_NOVELOCITY,
398
+ task_suffix='eval'),
399
+ MIXTURE_EVAL_TASK_NAMES,
400
+ default_rate=functools.partial(
401
+ seqio.mixing_rate_num_examples,
402
+ temperature=MIXING_TEMPERATURE))
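A rough sketch of what the temperature-scaled mixture does (the example counts are made up; it assumes seqio.mixing_rate_num_examples makes each task's rate proportional to num_examples ** (1 / temperature), as in T5-style temperature mixing):

```python
import numpy as np

num_examples = np.array([1000.0, 100.0, 10.0])  # hypothetical per-task example counts
temperature = 10 / 3
rates = num_examples ** (1 / temperature)
print(rates / rates.sum())
# ~[0.57, 0.29, 0.14] instead of [0.90, 0.09, 0.01] for proportional-to-size mixing,
# i.e. small datasets are sampled far more often than their share of examples.
```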
mt3/version.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """MT3 version."""
16
+ __version__ = '0.0.1'
mt3/vocabularies.py ADDED
@@ -0,0 +1,282 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Model vocabulary."""
16
+
17
+ import dataclasses
18
+ import math
19
+
20
+ from typing import Callable, Optional, Sequence
21
+ from mt3 import event_codec
22
+
23
+ import note_seq
24
+ import seqio
25
+ import t5.data
26
+ import tensorflow as tf
27
+
28
+
29
+ DECODED_EOS_ID = -1
30
+ DECODED_INVALID_ID = -2
31
+
32
+ # defaults for vocabulary config
33
+ DEFAULT_STEPS_PER_SECOND = 100
34
+ DEFAULT_MAX_SHIFT_SECONDS = 10
35
+ DEFAULT_NUM_VELOCITY_BINS = 127
36
+
37
+
38
+ @dataclasses.dataclass
39
+ class VocabularyConfig:
40
+ """Vocabulary configuration parameters."""
41
+ steps_per_second: int = DEFAULT_STEPS_PER_SECOND
42
+ max_shift_seconds: int = DEFAULT_MAX_SHIFT_SECONDS
43
+ num_velocity_bins: int = DEFAULT_NUM_VELOCITY_BINS
44
+
45
+ @property
46
+ def abbrev_str(self):
47
+ s = ''
48
+ if self.steps_per_second != DEFAULT_STEPS_PER_SECOND:
49
+ s += 'ss%d' % self.steps_per_second
50
+ if self.max_shift_seconds != DEFAULT_MAX_SHIFT_SECONDS:
51
+ s += 'ms%d' % self.max_shift_seconds
52
+ if self.num_velocity_bins != DEFAULT_NUM_VELOCITY_BINS:
53
+ s += 'vb%d' % self.num_velocity_bins
54
+ return s
55
+
56
+
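A few values of the abbreviation, derived from the property above (only non-default fields contribute):

```python
from mt3.vocabularies import VocabularyConfig

assert VocabularyConfig().abbrev_str == ''
assert VocabularyConfig(num_velocity_bins=1).abbrev_str == 'vb1'
assert VocabularyConfig(steps_per_second=50, num_velocity_bins=1).abbrev_str == 'ss50vb1'
```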
57
+ def num_velocity_bins_from_codec(codec: event_codec.Codec):
58
+ """Get number of velocity bins from event codec."""
59
+ lo, hi = codec.event_type_range('velocity')
60
+ return hi - lo
61
+
62
+
63
+ def velocity_to_bin(velocity, num_velocity_bins):
64
+ if velocity == 0:
65
+ return 0
66
+ else:
67
+ return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY)
68
+
69
+
70
+ def bin_to_velocity(velocity_bin, num_velocity_bins):
71
+ if velocity_bin == 0:
72
+ return 0
73
+ else:
74
+ return int(note_seq.MAX_MIDI_VELOCITY * velocity_bin / num_velocity_bins)
75
+
76
+
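Worked examples of the quantization above (velocity bin 0 is reserved for note-off, and note_seq.MAX_MIDI_VELOCITY is 127):

```python
from mt3.vocabularies import bin_to_velocity, velocity_to_bin

velocity_to_bin(64, num_velocity_bins=1)      # -> 1 (any nonzero velocity)
bin_to_velocity(1, num_velocity_bins=1)       # -> 127
velocity_to_bin(100, num_velocity_bins=127)   # -> 100 (identity with 127 bins)
bin_to_velocity(100, num_velocity_bins=127)   # -> 100
```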
77
+ def drop_programs(tokens, codec: event_codec.Codec):
78
+ """Drops program change events from a token sequence."""
79
+ min_program_id, max_program_id = codec.event_type_range('program')
80
+ return tokens[(tokens < min_program_id) | (tokens > max_program_id)]
81
+
82
+
83
+ def programs_to_midi_classes(tokens, codec):
84
+ """Modifies program events to be the first program in the MIDI class."""
85
+ min_program_id, max_program_id = codec.event_type_range('program')
86
+ is_program = (tokens >= min_program_id) & (tokens <= max_program_id)
87
+ return tf.where(
88
+ is_program,
89
+ min_program_id + 8 * ((tokens - min_program_id) // 8),
90
+ tokens)
91
+
92
+
93
+ @dataclasses.dataclass
94
+ class ProgramGranularity:
95
+ # both tokens_map_fn and program_map_fn should be idempotent
96
+ tokens_map_fn: Callable[[Sequence[int], event_codec.Codec], Sequence[int]]
97
+ program_map_fn: Callable[[int], int]
98
+
99
+
100
+ PROGRAM_GRANULARITIES = {
101
+ # "flat" granularity; drop program change tokens and set NoteSequence
102
+ # programs to zero
103
+ 'flat': ProgramGranularity(
104
+ tokens_map_fn=drop_programs,
105
+ program_map_fn=lambda program: 0),
106
+
107
+ # map each program to the first program in its MIDI class
108
+ 'midi_class': ProgramGranularity(
109
+ tokens_map_fn=programs_to_midi_classes,
110
+ program_map_fn=lambda program: 8 * (program // 8)),
111
+
112
+ # leave programs as is
113
+ 'full': ProgramGranularity(
114
+ tokens_map_fn=lambda tokens, codec: tokens,
115
+ program_map_fn=lambda program: program)
116
+ }
117
+
118
+
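A worked example of the three granularities, using the program_map_fn side (General MIDI program 30 is a guitar, MIDI class 24-31):

```python
from mt3.vocabularies import PROGRAM_GRANULARITIES

PROGRAM_GRANULARITIES['flat'].program_map_fn(30)        # -> 0
PROGRAM_GRANULARITIES['midi_class'].program_map_fn(30)  # -> 24
PROGRAM_GRANULARITIES['full'].program_map_fn(30)        # -> 30
```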
119
+ def build_codec(vocab_config: VocabularyConfig):
120
+ """Build event codec."""
121
+ event_ranges = [
122
+ event_codec.EventRange('pitch', note_seq.MIN_MIDI_PITCH,
123
+ note_seq.MAX_MIDI_PITCH),
124
+ # velocity bin 0 is used for note-off
125
+ event_codec.EventRange('velocity', 0, vocab_config.num_velocity_bins),
126
+ # used to indicate that a pitch is present at the beginning of a segment
127
+ # (only has an "off" event as when using ties all pitch events until the
128
+ # "tie" event belong to the tie section)
129
+ event_codec.EventRange('tie', 0, 0),
130
+ event_codec.EventRange('program', note_seq.MIN_MIDI_PROGRAM,
131
+ note_seq.MAX_MIDI_PROGRAM),
132
+ event_codec.EventRange('drum', note_seq.MIN_MIDI_PITCH,
133
+ note_seq.MAX_MIDI_PITCH),
134
+ ]
135
+
136
+ return event_codec.Codec(
137
+ max_shift_steps=(vocab_config.steps_per_second *
138
+ vocab_config.max_shift_seconds),
139
+ steps_per_second=vocab_config.steps_per_second,
140
+ event_ranges=event_ranges)
141
+
142
+
143
+ def vocabulary_from_codec(codec: event_codec.Codec) -> seqio.Vocabulary:
144
+ return GenericTokenVocabulary(
145
+ codec.num_classes, extra_ids=t5.data.DEFAULT_EXTRA_IDS)
146
+
147
+
148
+ class GenericTokenVocabulary(seqio.Vocabulary):
149
+ """Vocabulary with pass-through encoding of tokens."""
150
+
151
+ def __init__(self, regular_ids: int, extra_ids: int = 0):
152
+ # The special tokens: 0=PAD, 1=EOS, and 2=UNK
153
+ self._num_special_tokens = 3
154
+ self._num_regular_tokens = regular_ids
155
+ super().__init__(extra_ids=extra_ids)
156
+
157
+ @property
158
+ def eos_id(self) -> Optional[int]:
159
+ return 1
160
+
161
+ @property
162
+ def unk_id(self) -> Optional[int]:
163
+ return 2
164
+
165
+ @property
166
+ def _base_vocab_size(self) -> int:
167
+ """Number of ids.
168
+
169
+ Returns:
170
+ an integer, the vocabulary size
171
+ """
172
+ return self._num_special_tokens + self._num_regular_tokens
173
+
174
+ def _encode(self, token_ids: Sequence[int]) -> Sequence[int]:
175
+ """Encode a list of tokens ids as a list of integers.
176
+
177
+ To keep the first few ids for special tokens, increase ids by the number
178
+ of special tokens.
179
+
180
+ Args:
181
+ token_ids: array of token ids.
182
+
183
+ Returns:
184
+ a list of integers (not terminated by EOS)
185
+ """
186
+ encoded = []
187
+ for token_id in token_ids:
188
+ if not 0 <= token_id < self._num_regular_tokens:
189
+ raise ValueError(
190
+ f'token_id {token_id} does not fall within valid range of '
191
+ f'[0, {self._num_regular_tokens})')
192
+ encoded.append(token_id + self._num_special_tokens)
193
+
194
+ return encoded
195
+
196
+ def _decode(self, ids: Sequence[int]) -> Sequence[int]:
197
+ """Decode a list of integers to a list of token ids.
198
+
199
+ The special tokens of PAD and UNK as well as extra_ids will be
200
+ replaced with DECODED_INVALID_ID in the output. If EOS is present, it will
201
+ be the final token in the decoded output and will be represented by
202
+ DECODED_EOS_ID.
203
+
204
+ Args:
205
+ ids: a list of integers
206
+
207
+ Returns:
208
+ a list of token ids.
209
+ """
210
+ # convert all the extra ids to INVALID_ID
211
+ def _decode_id(encoded_id):
212
+ if encoded_id == self.eos_id:
213
+ return DECODED_EOS_ID
214
+ elif encoded_id < self._num_special_tokens:
215
+ return DECODED_INVALID_ID
216
+ elif encoded_id >= self._base_vocab_size:
217
+ return DECODED_INVALID_ID
218
+ else:
219
+ return encoded_id - self._num_special_tokens
220
+ ids = [_decode_id(int(i)) for i in ids]
221
+ return ids
222
+
223
+ def _encode_tf(self, token_ids: tf.Tensor) -> tf.Tensor:
224
+ """Encode a list of tokens to a tf.Tensor.
225
+
226
+ Args:
227
+ token_ids: array of audio token ids.
228
+
229
+ Returns:
230
+ a 1d tf.Tensor with dtype tf.int32
231
+ """
232
+ with tf.control_dependencies(
233
+ [tf.debugging.assert_less(
234
+ token_ids, tf.cast(self._num_regular_tokens, token_ids.dtype)),
235
+ tf.debugging.assert_greater_equal(
236
+ token_ids, tf.cast(0, token_ids.dtype))
237
+ ]):
238
+ tf_ids = token_ids + self._num_special_tokens
239
+ return tf_ids
240
+
241
+ def _decode_tf(self, ids: tf.Tensor) -> tf.Tensor:
242
+ """Decode in TensorFlow.
243
+
244
+ The special tokens of PAD and UNK as well as extra_ids will be
245
+ replaced with DECODED_INVALID_ID in the output. If EOS is present, it and
246
+ all following tokens in the decoded output will be represented by
247
+ DECODED_EOS_ID.
248
+
249
+ Args:
250
+ ids: a 1d tf.Tensor with dtype tf.int32
251
+
252
+ Returns:
253
+ a 1d tf.Tensor with dtype tf.int32
254
+ """
255
+ # Create a mask that is true from the first EOS position onward.
256
+ # First, create an array that is True whenever there is an EOS, then cumsum
257
+ # that array so that every position after and including the first True is
258
+ # >1, then cast back to bool for the final mask.
259
+ eos_and_after = tf.cumsum(
260
+ tf.cast(tf.equal(ids, self.eos_id), tf.int32), exclusive=False, axis=-1)
261
+ eos_and_after = tf.cast(eos_and_after, tf.bool)
262
+
263
+ return tf.where(
264
+ eos_and_after,
265
+ DECODED_EOS_ID,
266
+ tf.where(
267
+ tf.logical_and(
268
+ tf.greater_equal(ids, self._num_special_tokens),
269
+ tf.less(ids, self._base_vocab_size)),
270
+ ids - self._num_special_tokens,
271
+ DECODED_INVALID_ID))
272
+
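A worked trace of the cumsum EOS-masking trick above (numpy used for illustration; the special tokens are 0=PAD, 1=EOS, 2=UNK, so regular ids are shifted down by 3):

```python
import numpy as np

ids = np.array([4, 1, 0, 5])               # eos_id is 1
eos_and_after = np.cumsum(ids == 1) > 0    # [False, True, True, True]
# Masked positions decode to DECODED_EOS_ID (-1); the remaining id 4 is a
# regular token and decodes to 4 - 3 = 1, giving [1, -1, -1, -1].
```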
273
+ def __eq__(self, other):
274
+ their_extra_ids = other.extra_ids
275
+ their_num_regular_tokens = other._num_regular_tokens
276
+ return (self.extra_ids == their_extra_ids and
277
+ self._num_regular_tokens == their_num_regular_tokens)
278
+
279
+
280
+ def num_embeddings(vocabulary: GenericTokenVocabulary) -> int:
281
+ """Vocabulary size as a multiple of 128 for TPU efficiency."""
282
+ return 128 * math.ceil(vocabulary.vocab_size / 128)
mt3/vocabularies_test.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for vocabularies."""
16
+
17
+ from absl.testing import absltest
18
+ from mt3 import vocabularies
19
+
20
+ import numpy as np
21
+ import tensorflow.compat.v2 as tf
22
+
23
+ tf.compat.v1.enable_eager_execution()
24
+
25
+
26
+ class VocabulariesTest(absltest.TestCase):
27
+
28
+ def test_velocity_quantization(self):
29
+ self.assertEqual(0, vocabularies.velocity_to_bin(0, num_velocity_bins=1))
30
+ self.assertEqual(0, vocabularies.velocity_to_bin(0, num_velocity_bins=127))
31
+ self.assertEqual(0, vocabularies.bin_to_velocity(0, num_velocity_bins=1))
32
+ self.assertEqual(0, vocabularies.bin_to_velocity(0, num_velocity_bins=127))
33
+
34
+ self.assertEqual(
35
+ 1,
36
+ vocabularies.velocity_to_bin(
37
+ vocabularies.bin_to_velocity(1, num_velocity_bins=1),
38
+ num_velocity_bins=1))
39
+
40
+ for velocity_bin in range(1, 128):
41
+ self.assertEqual(
42
+ velocity_bin,
43
+ vocabularies.velocity_to_bin(
44
+ vocabularies.bin_to_velocity(velocity_bin, num_velocity_bins=127),
45
+ num_velocity_bins=127))
46
+
47
+ def test_encode_decode(self):
48
+ vocab = vocabularies.GenericTokenVocabulary(32)
49
+ input_tokens = [1, 2, 3]
50
+ expected_encoded = [4, 5, 6]
51
+
52
+ # Encode
53
+ self.assertSequenceEqual(vocab.encode(input_tokens), expected_encoded)
54
+ np.testing.assert_array_equal(
55
+ vocab.encode_tf(tf.convert_to_tensor(input_tokens)).numpy(),
56
+ expected_encoded)
57
+
58
+ # Decode
59
+ self.assertSequenceEqual(vocab.decode(expected_encoded), input_tokens)
60
+ np.testing.assert_array_equal(
61
+ vocab.decode_tf(tf.convert_to_tensor(expected_encoded)).numpy(),
62
+ input_tokens)
63
+
64
+ def test_decode_invalid_ids(self):
65
+ vocab = vocabularies.GenericTokenVocabulary(32, extra_ids=4)
66
+ encoded = [0, 2, 3, 4, 34, 35]
67
+ expected_decoded = [-2, -2, 0, 1, 31, -2]
68
+ self.assertSequenceEqual(vocab.decode(encoded), expected_decoded)
69
+ np.testing.assert_array_equal(
70
+ vocab.decode_tf(tf.convert_to_tensor(encoded)).numpy(),
71
+ expected_decoded)
72
+
73
+ def test_decode_eos(self):
74
+ vocab = vocabularies.GenericTokenVocabulary(32)
75
+ encoded = [0, 2, 3, 4, 1, 0, 1, 0]
76
+ # Python decode function truncates everything after first EOS.
77
+ expected_decoded = [-2, -2, 0, 1, -1]
78
+ self.assertSequenceEqual(vocab.decode(encoded), expected_decoded)
79
+ # TF decode function preserves array length.
80
+ expected_decoded_tf = [-2, -2, 0, 1, -1, -1, -1, -1]
81
+ np.testing.assert_array_equal(
82
+ vocab.decode_tf(tf.convert_to_tensor(encoded)).numpy(),
83
+ expected_decoded_tf)
84
+
85
+ def test_encode_invalid_id(self):
86
+ vocab = vocabularies.GenericTokenVocabulary(32)
87
+ inputs = [0, 15, 31]
88
+ # No exception expected.
89
+ vocab.encode(inputs)
90
+ vocab.encode_tf(tf.convert_to_tensor(inputs))
91
+
92
+ inputs_too_low = [-1, 15, 31]
93
+ with self.assertRaises(ValueError):
94
+ vocab.encode(inputs_too_low)
95
+ with self.assertRaises(tf.errors.InvalidArgumentError):
96
+ vocab.encode_tf(tf.convert_to_tensor(inputs_too_low))
97
+
98
+ inputs_too_high = [0, 15, 32]
99
+ with self.assertRaises(ValueError):
100
+ vocab.encode(inputs_too_high)
101
+ with self.assertRaises(tf.errors.InvalidArgumentError):
102
+ vocab.encode_tf(tf.convert_to_tensor(inputs_too_high))
103
+
104
+ def test_encode_dtypes(self):
105
+ vocab = vocabularies.GenericTokenVocabulary(32)
106
+ inputs = [0, 15, 31]
107
+ encoded32 = vocab.encode_tf(tf.convert_to_tensor(inputs, tf.int32))
108
+ self.assertEqual(tf.int32, encoded32.dtype)
109
+ encoded64 = vocab.encode_tf(tf.convert_to_tensor(inputs, tf.int64))
110
+ self.assertEqual(tf.int64, encoded64.dtype)
111
+
112
+
113
+ if __name__ == '__main__':
114
+ absltest.main()
pytest.ini ADDED
@@ -0,0 +1,3 @@
1
+ [pytest]
2
+ python_files = *_test.py
3
+ log_level = INFO
setup.cfg ADDED
@@ -0,0 +1,2 @@
1
+ [aliases]
2
+ test=pytest
setup.py ADDED
@@ -0,0 +1,67 @@
1
+ # Copyright 2022 The MT3 Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Install mt3."""
16
+
17
+ import os
18
+ import sys
19
+ import setuptools
20
+
21
+ # To enable importing version.py directly, we add its path to sys.path.
22
+ version_path = os.path.join(os.path.dirname(__file__), 'mt3')
23
+ sys.path.append(version_path)
24
+ from version import __version__ # pylint: disable=g-import-not-at-top
25
+
26
+ setuptools.setup(
27
+ name='mt3',
28
+ version=__version__,
29
+ description='Multi-Task Multitrack Music Transcription',
30
+ author='Google Inc.',
31
+ author_email='no-reply@google.com',
32
+ url='http://github.com/magenta/mt3',
33
+ license='Apache 2.0',
34
+ packages=setuptools.find_packages(),
35
+ package_data={
36
+ '': ['*.gin'],
37
+ },
38
+ scripts=[],
39
+ install_requires=[
40
+ 'absl-py == 1.1.0',
41
+ 'ddsp == 3.4.4',
42
+ 'flax == 0.5.2',
43
+ 'gin-config == 0.5.0',
44
+ 'immutabledict == 2.2.1',
45
+ 'librosa == 0.9.2',
46
+ 'mir_eval == 0.7',
47
+ 'note_seq == 0.0.3',
48
+ 'numpy == 1.21.6',
49
+ 'pretty_midi == 0.2.9',
50
+ 'scikit-learn == 1.0.2',
51
+ 'scipy == 1.7.3',
52
+ 'seqio == 0.0.8',
53
+ 't5 == 0.9.3',
54
+ 'tensorflow',
55
+ 'tensorflow-datasets == 4.6.0',
56
+ ],
57
+ classifiers=[
58
+ 'Development Status :: 4 - Beta',
59
+ 'Intended Audience :: Developers',
60
+ 'Intended Audience :: Science/Research',
61
+ 'License :: OSI Approved :: Apache Software License',
62
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
63
+ ],
64
+ tests_require=['pytest'],
65
+ setup_requires=['pytest-runner'],
66
+ keywords='music transcription machinelearning audio',
67
+ )
t5x/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Import API modules."""
16
+
17
+ import t5x.adafactor
18
+ import t5x.checkpoints
19
+ import t5x.decoding
20
+ import t5x.gin_utils
21
+ import t5x.losses
22
+ import t5x.models
23
+ import t5x.partitioning
24
+ import t5x.state_utils
25
+ import t5x.train_state
26
+ import t5x.trainer
27
+ import t5x.utils
28
+
29
+ # Version number.
30
+ from t5x.version import __version__
31
+
32
+ # TODO(adarob): Move clients to t5x.checkpointing and rename
33
+ # checkpoints.py to checkpointing.py
34
+ checkpointing = t5x.checkpoints
t5x/adafactor.py ADDED
@@ -0,0 +1,608 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Adafactor Optimizer.
16
+
17
+ Specialized Adafactor implementation for T5X with:
18
+ - custom factorization specification rules.
19
+ - support for stacked parameters from scanned layers and parameter fusions.
20
+
21
+ Why do we need custom factorization? In the Adafactor paper, scalar, vector and
22
+ matrix parameters are considered. This is sufficiently general because higher
23
+ dimensional parameters can be reshaped. In practice, there are situations where
24
+ higher dimensional parameters are desirable. For example, consider the
25
+ multi-headed attention. It has projection kernels. This is naturally
26
+ represented as a 3-dimensional array [d_model, num_head, head_dim]. Keeping the
27
+ 3-dimensional structure can be beneficial for performance optimization, e.g., by
28
+ giving compilers an additional degree of freedom to do layout optimization.
29
+
30
+ The default heuristic behavior for the second-moment estimator can lead to an
31
+ unexpected result because it assumes that the parameters are matrices (vectors
32
+ and scalars are not factored). The dimensions are sorted and the smaller
33
+ dimension is assigned to the row dim and the larger dim to the col dim (unless
34
+ the two largest dims have an equal size and then the original ordering of the
35
+ dimensions is used). Then `v_row` (i.e., the optimizer state for the row) is
36
+ obtained by removing the col dim. In other words, `rank(v_row) = rank(v) - 1`.
37
+ If the parameter is higher dimensional, v_row and v_col are higher dimensional.
38
+ Therefore, the outer product of v_row and v_col does not necessarily correspond
39
+ to the low-rank approximation that minimizes the generalized Kullback-Leibler
40
+ divergence (the original Adafactor formulation).
41
+
42
+ This Adafactor implementation generalizes the default behavior such that we
43
+ obtain the correct second moment estimator even for higher dimensional
44
+ parameters.
45
+
46
+ """
47
+ import enum
48
+ import re
49
+ from typing import Any, Mapping, Optional, Sequence, Tuple, Union
50
+
51
+ from absl import logging
52
+ from flax import struct
53
+ from flax.core import freeze
54
+ from flax.core import FrozenDict
55
+ from flax.core import unfreeze
56
+ from flax.serialization import from_state_dict
57
+ from flax.serialization import to_state_dict
58
+ from flax.traverse_util import flatten_dict
59
+ from flax.traverse_util import unflatten_dict
60
+ import jax
61
+ import jax.numpy as jnp
62
+ import numpy as np
63
+ from t5x import utils
64
+ from t5x.optimizers import OptimizerDef
65
+ from t5x.optimizers import OptimizerState
66
+
67
+ Dtype = Any
68
+
69
+
70
+ class FactorDim(enum.Enum):
71
+ # Don't factorize this dimension.
72
+ NONE = None
73
+ # A batch-like dimension that we should not average over.
74
+ BATCH = 1
75
+ ROW = 2
76
+ COLUMN = 3
77
+
78
+
79
+ # Sentinel value signifying the legacy heuristic factorization rule.
80
+ class HeuristicRule(enum.Enum):
81
+ token = 1
82
+
83
+
84
+ HEURISTIC_RULE = HeuristicRule.token
85
+ FactorRule = Union[HeuristicRule, Tuple[FactorDim]]
86
+
87
+
88
+ def _restore(target, flat):
89
+ state_dict = unflatten_dict({tuple(k.split('/')): v for k, v in flat.items()})
90
+ if isinstance(target, FrozenDict):
91
+ return freeze(state_dict)
92
+ else:
93
+ return state_dict
94
+
95
+
96
+ def _insert(tpl, idx, x):
97
+ tmp = list(tpl)
98
+ tmp.insert(idx, x)
99
+ return tuple(tmp)
100
+
101
+
102
+ def standard_logical_factor_rules():
103
+ return freeze({
104
+ 'vocab': FactorDim.COLUMN,
105
+ 'embed': FactorDim.ROW,
106
+ 'mlp': FactorDim.COLUMN,
107
+ 'heads': FactorDim.COLUMN,
108
+ 'kv': FactorDim.COLUMN,
109
+ 'joined_kv': FactorDim.COLUMN,
110
+ 'relpos_buckets': FactorDim.NONE,
111
+ 'layers': FactorDim.BATCH, # used in scanned layers
112
+ 'stack': FactorDim.BATCH, # used in stacked params
113
+ # 'batch', 'length' should not occur in parameters
114
+ 'q_wi_fused': FactorDim.COLUMN,
115
+ 'o_wo_fused': FactorDim.COLUMN,
116
+ 'multiquery_heads': FactorDim.COLUMN,
117
+ 'kv_fused': FactorDim.COLUMN,
118
+ 'layer_norm_scale': FactorDim.NONE,
119
+ 'mlp_activations': FactorDim.COLUMN,
120
+ })
121
+
122
+
123
+ def factor_name_to_factordim(name):
124
+ if not isinstance(name, str):
125
+ return name
126
+ name = name.lower()
127
+ return {
128
+ 'row': FactorDim.ROW,
129
+ 'col': FactorDim.COLUMN,
130
+ 'column': FactorDim.COLUMN,
131
+ 'batch': FactorDim.BATCH,
132
+ 'none': FactorDim.NONE,
133
+ 'unfactorized': FactorDim.NONE
134
+ }[name]
135
+
136
+
137
+ class HParamMap:
138
+ """Maps parameter path names to hparams.
139
+
140
+ Names of parameters nested in a PyTree (e.g., an Optimizer) are formed by
141
+ joining the names along the path to the parameter leaf with '/'.
142
+ """
143
+
144
+ def __init__(self, rules):
145
+ self._rules = [(re.compile(r), p) for r, p in rules]
146
+
147
+ def __getitem__(self, key: str) -> Any:
148
+ for r, p in self._rules:
149
+ if r.search(key):
150
+ return p
151
+ raise KeyError(f'No factor rule found for parameter: {key}')
152
+
153
+ def __call__(self, params):
154
+ """Returns a copy of the params with mapped hparams in leaves."""
155
+ flat_state_dict = flatten_dict(to_state_dict(params))
156
+ flat_rules_dict = {k: self['/'.join(k)] for k in flat_state_dict.keys()}
157
+ return from_state_dict(params, unflatten_dict(flat_rules_dict))
158
+
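A usage sketch (the parameter paths are hypothetical): hparams are resolved by regex search over the '/'-joined parameter path, and the first matching rule wins.

```python
from t5x.adafactor import Adafactor, HParamMap

scale_rules = HParamMap([
    (r'layer_norm_scale', False),  # don't rescale the LR by parameter norm here
    (r'.*', True),                 # default for all other parameters
])
scale_rules['encoder/layers_0/attention/query/kernel']  # -> True
scale_rules['decoder/decoder_norm/layer_norm_scale']    # -> False
# e.g. Adafactor(multiply_by_parameter_scale=scale_rules)
```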
159
+
160
+ @struct.dataclass
161
+ class _AdafactorHyperParams:
162
+ """Hparams for Adafactor optimizer."""
163
+ learning_rate: Optional[float]
164
+ factored: bool
165
+ multiply_by_parameter_scale: Union[bool, HParamMap]
166
+ beta1: Optional[float]
167
+ decay_rate: float
168
+ step_offset: int
169
+ clipping_threshold: Optional[float]
170
+ weight_decay_rate: Optional[float]
171
+ min_dim_size_to_factor: int
172
+ epsilon1: float
173
+ epsilon2: float
174
+ factor_map: Optional[HParamMap] = None
175
+ logical_factor_rules: Any = None
176
+ weight_decay_rate_lr_exponent: Optional[float] = None
177
+ global_norm_clip_threshold: Optional[float] = None
178
+ max_parameter_scale: Optional[float] = None
179
+ skip_nan_updates: Optional[bool] = False
180
+
181
+
182
+ @struct.dataclass
183
+ class _AdafactorParamState:
184
+ v_row: np.ndarray # used in normal factored version
185
+ v_col: np.ndarray
186
+ v: np.ndarray # only used without factoring
187
+ m: np.ndarray # only used with momentum
188
+
189
+
190
+ class Adafactor(OptimizerDef):
191
+ """Adafactor optimizer.
192
+
193
+ Adafactor is described in https://arxiv.org/abs/1804.04235.
194
+ """
195
+
196
+ def __init__(self,
197
+ learning_rate: Optional[float] = None,
198
+ factored: bool = True,
199
+ multiply_by_parameter_scale: Union[bool, HParamMap] = True,
200
+ beta1: Optional[float] = None,
201
+ decay_rate: float = 0.8,
202
+ step_offset: int = 0,
203
+ clipping_threshold: Optional[float] = 1.0,
204
+ weight_decay_rate: Optional[float] = None,
205
+ min_dim_size_to_factor: int = 128,
206
+ epsilon1: float = 1e-30,
207
+ epsilon2: float = 1e-3,
208
+ dtype_momentum: Dtype = jnp.float32,
209
+ factor_map: Optional[HParamMap] = None,
210
+ logical_factor_rules: Optional[Mapping[str, FactorDim]] = None,
211
+ weight_decay_rate_lr_exponent: Optional[float] = None,
212
+ global_norm_clip_threshold: Optional[float] = None,
213
+ max_parameter_scale: Optional[float] = None,
214
+ skip_nan_updates: Optional[bool] = False):
215
+ """Constructor for the Adafactor optimizer.
216
+
217
+
218
+ Args:
219
+ learning_rate: float: learning rate. NB: the natural scale for adafactor
220
+ LR is markedly different from Adam; one doesn't use the 1/sqrt(hidden)
221
+ correction for this optimizer with attention-based models.
222
+ factored: boolean: whether to use factored second-moment estimator for 2d
223
+ variables.
224
+ multiply_by_parameter_scale: boolean: if True, then scale provided
225
+ learning_rate by parameter norm. if False, provided learning_rate is
226
+ absolute step size.
227
+ beta1: an optional float value between 0 and 1, enables momentum and uses
228
+ extra memory if non-None! None by default.
229
+ decay_rate: float: controls second-moment exponential decay schedule.
230
+ step_offset: for finetuning, one may optionally set this to the starting
231
+ step-number of the finetuning phase to reset the second moment
232
+ accumulators after pretraining. Does not affect the momentum even if it
233
+ was used during pretraining.
234
+ clipping_threshold: an optional float >= 1, if None no update clipping.
235
+ weight_decay_rate: optional rate at which to decay weights.
236
+ min_dim_size_to_factor: only factor accumulator if two array dimensions
237
+ are at least this size.
238
+ epsilon1: Regularization constant for squared gradient.
239
+ epsilon2: Regularization constant for parameter scale.
240
+ dtype_momentum: dtype of momentum buffers.
241
+ factor_map: hparam-map from key path to manual factorization rules.
242
+ logical_factor_rules: factorization rules provided as a set of mappings
243
+ from logical axis name to ROW, COLUMN, BATCH, or NONE. Supersedes
244
+ factor_map if `set_param_axes` is called.
245
+ weight_decay_rate_lr_exponent: If present, weight decay rate is computed
246
+ as (learning_rate ** weight_decay_rate_lr_exponent). If
247
+ weight_decay_rate is also present, then multiply by it.
248
+ global_norm_clip_threshold: If set, will clip gradients by global norm
249
+ before Adafactor stats are applied.
250
+ max_parameter_scale: If set, clips the parameter scale to a maximum value,
251
+ which helps prevent parameters from growing without bound.
252
+ skip_nan_updates: If set, any parameter that would have been updated by a
253
+ NaN value after applying gradients will be kept with the earlier
254
+ value it had.
255
+ """
256
+ if not factored and factor_map is not None:
257
+ raise ValueError('Adafactor factored is False but factorization rules '
258
+ 'have been provided.')
259
+ if not isinstance(multiply_by_parameter_scale, (bool, HParamMap)):
260
+ raise TypeError(
261
+ '`multiply_by_parameter_scale` must be either bool or `HParamMap` '
262
+ f'type. Got {type(multiply_by_parameter_scale)}')
263
+
264
+ if not isinstance(factor_map, (type(None), HParamMap)):
265
+ raise TypeError(
266
+ '`factor_map` must be either None or `HParamMap` type. Got '
267
+ f'{type(factor_map)}')
268
+
269
+ hyper_params = _AdafactorHyperParams(
270
+ learning_rate, factored, multiply_by_parameter_scale, beta1, decay_rate,
271
+ step_offset, clipping_threshold, weight_decay_rate,
272
+ min_dim_size_to_factor, epsilon1, epsilon2, factor_map,
273
+ logical_factor_rules, weight_decay_rate_lr_exponent,
274
+ global_norm_clip_threshold, max_parameter_scale, skip_nan_updates)
275
+ self.dtype_momentum = jax.dtypes.canonicalize_dtype(dtype_momentum)
276
+ super().__init__(hyper_params)
277
+
278
+ @staticmethod
279
+ def _decay_rate_pow(i: int, exponent: float = 0.8) -> float:
280
+ """Default Adafactor second-moment decay schedule."""
281
+ t = jnp.array(i, jnp.float32) + 1.0
282
+ return 1.0 - t**(-exponent)
283
+
284
+ @staticmethod
285
+ def _parse_rule(
286
+ rule: Optional[FactorRule],
287
+ shape: Sequence[int],
288
+ path: str,
289
+ fallback_to_heuristics=True
290
+ ) -> Tuple[Tuple[int, ...], Optional[Union[HeuristicRule, Tuple[Tuple[
291
+ int, ...], Tuple[int, ...]]]]]:
292
+ """Parses specification and return factored dims and dims for averaging.
293
+
294
+ Adafactor needs to know the two largest dimensions to factorize along.
295
+ Traditionally it used a heuristic, but we want finer control over these
296
+ factorization dimensions. Additionally, there are situations where
297
+ parameters are batched together, e.g. for scanned layers and QKV fusion,
298
+ and we want to ensure that the scale updates and clipping thresholds are
299
+ calculated _within_ each array and not across the entire batched array.
300
+
301
+ Args:
302
+ rule: the rule is either None (default to heuristic behavior) or a tuple
303
+ of the same rank as the `param` array containing a FactorDim.ROW or
304
+ FactorDim.COLUMN to mark dimensions to factorize in two row and column
305
+ sets, and optionally dimensions marked FactorDim.BATCH to denote batched
306
+ dimensions that should not be averaged over. e.g. (BATCH, ROW, COLUMN,
307
+ COLUMN)
308
+ shape: shape of the variable
309
+ path: '/' joined parameter path.
310
+ fallback_to_heuristics: whether to fallback to heuristic factorization
311
+ rule. For most cases this should be set to `True`.
312
+
313
+ Returns:
314
+ tuple of: tuple of dimensions to average over, 2-tuple of dimensions to
315
+ factorize over.
316
+ """
317
+ param_ndim = len(shape)
318
+
319
+ if rule is None:
320
+ # No factorization.
321
+ return tuple(np.arange(param_ndim)), None
322
+
323
+ if rule is HEURISTIC_RULE:
324
+ if param_ndim > 2:
325
+ raise ValueError(
326
+ f'A parameter with rank strictly higher than 2 must have an '
327
+ f'explicit factorization rule: {path}, {shape}')
328
+ # Even if no explicit rule is provided for the param, we still want to
329
+ # average over all the dimensions for computing the RMS scale.
330
+ return tuple(np.arange(param_ndim)), HEURISTIC_RULE
331
+
332
+ if len(rule) != param_ndim:
333
+ raise ValueError(f'Factorization rule {rule} has incorrect rank '
334
+ f'for param of rank {param_ndim}: {path}, {shape}')
335
+
336
+ row_dims = tuple(idx for idx, d in enumerate(rule) if d == FactorDim.ROW)
337
+ col_dims = tuple(idx for idx, d in enumerate(rule) if d == FactorDim.COLUMN)
338
+ batched_dims = tuple(
339
+ idx for idx, d in enumerate(rule) if d == FactorDim.BATCH)
340
+ averaging_dims = tuple(np.delete(np.arange(param_ndim), batched_dims))
341
+ factor_dims = (row_dims, col_dims)
342
+ if factor_dims == ((), ()):
343
+ factor_dims = None
344
+
345
+ if fallback_to_heuristics and param_ndim <= 2 and not batched_dims:
346
+ logging.warning(
347
+ 'Since rank of parameter %s %d is less than or equal to 2, the '
348
+ 'factorization method falls back to heuristics and the provided '
349
+ 'factor rule %s is ignored.', path, param_ndim, rule)
350
+ return tuple(np.arange(param_ndim)), HEURISTIC_RULE
351
+
352
+ return averaging_dims, factor_dims
353
+
354
+ def _factored_dims(
355
+ self, shape: Sequence[int]) -> Optional[Tuple[Tuple[int], Tuple[int]]]:
356
+ """Whether to use a factored second moment estimator.
357
+
358
+ If there are not two dimensions of size >= min_dim_size_to_factor, then we
359
+ do not factor. If we do factor the accumulator, then this function returns a
360
+ tuple of the two largest axes to reduce over.
361
+
362
+ Args:
363
+ shape: a Shape
364
+
365
+ Returns:
366
+ None or a tuple of ints
367
+ """
368
+ if not self.hyper_params.factored or len(shape) < 2:
369
+ return None
370
+ sorted_dims = np.argsort(shape)
371
+ if shape[sorted_dims[-2]] < self.hyper_params.min_dim_size_to_factor:
372
+ return None
373
+ return (int(sorted_dims[-2]),), (int(sorted_dims[-1]),)
374
+
375
+ def init_param_state(self, param, path):
376
+ shape = param.shape
377
+ state = {k: jnp.zeros((1,)) for k in ['v_row', 'v_col', 'v', 'm']}
378
+ if self.hyper_params.factored:
379
+ factor_rule = (
380
+ self.hyper_params.factor_map[path]
381
+ if self.hyper_params.factor_map else HEURISTIC_RULE)
382
+ else:
383
+ factor_rule = None
384
+ _, factored_dims = self._parse_rule(factor_rule, param.shape, path)
385
+ if factored_dims is HEURISTIC_RULE:
386
+ factored_dims = self._factored_dims(shape)
387
+ if factored_dims is not None:
388
+ d1, d0 = factored_dims
389
+ vr_shape = np.delete(shape, d0)
390
+ vc_shape = np.delete(shape, d1)
391
+ state['v_row'] = jnp.zeros(vr_shape, dtype=jnp.float32)
392
+ state['v_col'] = jnp.zeros(vc_shape, dtype=jnp.float32)
393
+ else:
394
+ state['v'] = jnp.zeros(param.shape, dtype=jnp.float32)
395
+ if self.hyper_params.beta1 is not None:
396
+ state['m'] = jnp.zeros(param.shape, dtype=self.dtype_momentum)
397
+ return _AdafactorParamState(**state)
398
+
399
+ def init_state(self, params):
400
+ params_flat = utils.flatten_dict_string_keys(params)
401
+ param_states_flat = [
402
+ self.init_param_state(param, path)
403
+ for path, param in params_flat.items()
404
+ ]
405
+ param_states_flat = {
406
+ k: v for k, v in zip(params_flat.keys(), param_states_flat)
407
+ }
408
+ param_states = _restore(params, param_states_flat)
409
+ state = OptimizerState(jnp.asarray(0, dtype=jnp.int32), param_states)
410
+ return state
411
+
412
+ def apply_param_gradient(self, step, hyper_params, param, state, grad, path):
413
+ assert hyper_params.learning_rate is not None, 'no learning rate provided.'
414
+ learning_rate = hyper_params.learning_rate
415
+ beta1 = hyper_params.beta1
416
+ decay_rate = hyper_params.decay_rate
417
+ step_offset = hyper_params.step_offset
418
+ multiply_by_parameter_scale = hyper_params.multiply_by_parameter_scale
419
+ max_parameter_scale = hyper_params.max_parameter_scale
420
+ clipping_threshold = hyper_params.clipping_threshold
421
+ weight_decay_rate = hyper_params.weight_decay_rate
422
+ epsilon1 = hyper_params.epsilon1
423
+ epsilon2 = hyper_params.epsilon2
424
+ if hyper_params.weight_decay_rate_lr_exponent:
425
+ weight_decay_rate = (
426
+ (weight_decay_rate or 1.0) *
427
+ learning_rate**hyper_params.weight_decay_rate_lr_exponent)
428
+
429
+ if self.hyper_params.factored:
430
+ factor_rule = (
431
+ self.hyper_params.factor_map[path]
432
+ if self.hyper_params.factor_map else HEURISTIC_RULE)
433
+ else:
434
+ factor_rule = None
435
+ averaging_dims, factored_dims = self._parse_rule(factor_rule, param.shape,
436
+ path)
437
+
438
+ grad = grad.astype(jnp.float32)
439
+
440
+ updates = {k: jnp.zeros((1,)) for k in ['v_row', 'v_col', 'v', 'm']}
441
+ decay_rate = self._decay_rate_pow(step - step_offset, exponent=decay_rate)
442
+ update_scale = learning_rate
443
+
444
+ if isinstance(multiply_by_parameter_scale, HParamMap):
445
+ multiply_by_parameter_scale = multiply_by_parameter_scale[path]
446
+ if multiply_by_parameter_scale:
447
+ param_scale = jnp.sqrt(
448
+ jnp.mean(param * param, axis=averaging_dims, keepdims=True))
449
+ # Clip param_scale to a minimum value of epsilon2.
450
+ param_scale = jnp.maximum(param_scale, epsilon2)
451
+ # Clip param_scale to a maximum value, if specified.
452
+ if max_parameter_scale is not None:
453
+ param_scale = jnp.minimum(param_scale, max_parameter_scale)
454
+ update_scale *= param_scale
455
+ mixing_rate = 1.0 - decay_rate
456
+
457
+ grad_sqr = grad * grad + epsilon1
458
+ if factored_dims is HEURISTIC_RULE:
459
+ factored_dims = self._factored_dims(param.shape)
460
+ if factored_dims is not None:
461
+ d1, d0 = factored_dims
462
+ new_v_row = (
463
+ decay_rate * state.v_row + mixing_rate * jnp.mean(grad_sqr, axis=d0))
464
+ new_v_col = (
465
+ decay_rate * state.v_col + mixing_rate * jnp.mean(grad_sqr, axis=d1))
466
+ updates['v_row'] = new_v_row
467
+ updates['v_col'] = new_v_col
468
+ reduced_d1 = tuple(d - len([e for e in d0 if e < d]) for d in d1)
469
+
470
+ row_col_mean = jnp.mean(new_v_row, axis=reduced_d1, keepdims=True)
471
+ row_factor = (new_v_row / row_col_mean)**-0.5
472
+ col_factor = (new_v_col)**-0.5
473
+ y = (
474
+ grad * jnp.expand_dims(row_factor, axis=d0) *
475
+ jnp.expand_dims(col_factor, axis=d1))
476
+ else:
477
+ new_v = decay_rate * state.v + mixing_rate * grad_sqr
478
+ updates['v'] = new_v
479
+ y = grad * (new_v)**-0.5
480
+
481
+ if clipping_threshold is not None:
482
+ clipping_denom = (
483
+ jnp.maximum(
484
+ 1.0,
485
+ jnp.sqrt(jnp.mean(y * y, axis=averaging_dims, keepdims=True)) /
486
+ clipping_threshold))
487
+ y /= clipping_denom
488
+
489
+ subtrahend = update_scale * y
490
+ if beta1 is not None:
491
+ new_m = beta1 * state.m + (1.0 - beta1) * subtrahend
492
+ subtrahend = new_m
493
+ updates['m'] = new_m.astype(self.dtype_momentum)
494
+
495
+ if weight_decay_rate is not None:
496
+ new_param = (1.0 - weight_decay_rate) * param - subtrahend
497
+ else:
498
+ new_param = param - subtrahend
499
+
500
+ if hyper_params.skip_nan_updates:
501
+ updates['v_row'] = jnp.where(
502
+ jnp.isnan(updates['v_row']), state.v_row, updates['v_row'])
503
+ updates['v_col'] = jnp.where(
504
+ jnp.isnan(updates['v_col']), state.v_col, updates['v_col'])
505
+ updates['v'] = jnp.where(jnp.isnan(updates['v']), state.v, updates['v'])
506
+ updates['m'] = jnp.where(jnp.isnan(updates['m']), state.m, updates['m'])
507
+ new_param = jnp.where(jnp.isnan(new_param), param, new_param)
508
+ new_state = _AdafactorParamState(**updates)
509
+
510
+ return new_param.astype(param.dtype), new_state
511
+
512
+ def apply_gradient(self, hyper_params, params, state, grads):
513
+ """Applies a gradient for a set of parameters.
514
+
515
+ Args:
516
+ hyper_params: a named tuple of hyper parameters.
517
+ params: the parameters that should be updated.
518
+ state: a named tuple containing the state of the optimizer
519
+ grads: the gradient tensors for the parameters.
520
+
521
+ Returns:
522
+ A tuple containing the new parameters and the new optimizer state.
523
+ """
524
+ step = state.step
525
+ # We assume that params, param_states, and grads are all dict-like here.
526
+ params_flat_dict = utils.flatten_dict_string_keys(params)
527
+ params_paths = params_flat_dict.keys()
528
+ params_flat = params_flat_dict.values()
529
+ # extra paranoia to guarantee identical value ordering
530
+ states_flat = utils.flatten_dict_string_keys(state.param_states)
531
+ states_flat = [states_flat[k] for k in params_paths]
532
+ grads_flat = utils.flatten_dict_string_keys(grads)
533
+ grads_flat = [grads_flat[k] for k in params_paths]
534
+
535
+ if hyper_params.global_norm_clip_threshold:
536
+ # Paper: http://proceedings.mlr.press/v28/pascanu13.pdf
537
+ # TF: https://www.tensorflow.org/api_docs/python/tf/clip_by_global_norm
538
+ squared_l2_norms = [jnp.sum(jnp.square(g)) for g in grads_flat]
539
+ global_norm = jnp.sqrt(jnp.sum(jnp.array(squared_l2_norms)))
540
+ scale = hyper_params.global_norm_clip_threshold * jnp.minimum(
541
+ 1.0 / hyper_params.global_norm_clip_threshold, 1.0 / global_norm)
542
+ grads_flat = [g * scale for g in grads_flat]
543
+
544
+ out = [
545
+ self.apply_param_gradient(step, hyper_params, param, state, grad, path)
546
+ for param, state, grad, path in zip(params_flat, states_flat,
547
+ grads_flat, params_paths)
548
+ ]
549
+
550
+ new_params_flat, new_states_flat = list(zip(*out)) if out else ((), ())
551
+ new_params_flat = {k: v for k, v in zip(params_paths, new_params_flat)}
552
+ new_states_flat = {k: v for k, v in zip(params_paths, new_states_flat)}
553
+ new_params = _restore(params, new_params_flat)
554
+ new_param_states = _restore(params, new_states_flat)
555
+ new_state = OptimizerState(step + 1, new_param_states)
556
+
557
+ return new_params, new_state
558
+
559
+ def set_param_axes(self, param_logical_axes):
560
+ """Sets Adafactor factorization map from logical axis names tree."""
561
+ logical_factor_rules = self.hyper_params.logical_factor_rules
562
+ if logical_factor_rules is None:
563
+ return
564
+
565
+ # pylint:disable=invalid-name
566
+ NONE = FactorDim.NONE
567
+ COLUMN = FactorDim.COLUMN
568
+ ROW = FactorDim.ROW
569
+
570
+ # pylint:enable=invalid-name
571
+
572
+ def apply_rules(axes):
573
+ # Partially factorized params are marked as unfactorized, preserving
574
+ # only BATCH axis annotations. We also check for incompletely factorized
575
+ # params that have ROW, COLUMN but also accidental NONE dimensions and
576
+ # raise an error in that case.
577
+ axis_rules = tuple(logical_factor_rules[x] for x in axes)
578
+ axis_rules = tuple(factor_name_to_factordim(x) for x in axis_rules)
579
+ if ROW in axis_rules and COLUMN in axis_rules and NONE in axis_rules:
580
+ raise ValueError(f'Incomplete adafactor spec {axis_rules} for {axes}!')
581
+ if ROW not in axis_rules or COLUMN not in axis_rules:
582
+ axis_rules = tuple(
583
+ NONE if x in (ROW, COLUMN) else x for x in axis_rules)
584
+ return axis_rules
585
+
586
+ factor_map = jax.tree_map(apply_rules, param_logical_axes)
587
+ factor_map = utils.flatten_dict_string_keys(factor_map)
588
+
589
+ self.hyper_params = self.hyper_params.replace(factor_map=factor_map)
590
+
591
+ def derive_logical_axes(self, optimizer_state, param_logical_axes):
592
+ """Derives optimizer logical partitioning from model logical partitions."""
593
+ optimizer_logical_axes = jax.tree_map(lambda x: None,
594
+ optimizer_state.state_dict())
595
+ optimizer_logical_axes['target'] = param_logical_axes
596
+
597
+ def factor_rule(logical_axes, adafactor_leaf):
598
+ return dict(
599
+ v_row=None,
600
+ v_col=None,
601
+ v=logical_axes if adafactor_leaf['v'].shape != (1,) else None,
602
+ m=logical_axes if self.hyper_params.beta1 else None)
603
+
604
+ optimizer_logical_axes['state']['param_states'] = jax.tree_map(
605
+ factor_rule, unfreeze(param_logical_axes),
606
+ optimizer_state.state_dict()['state']['param_states'])
607
+
608
+ return optimizer_state.restore_state(unfreeze(optimizer_logical_axes))
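Note (editor's sketch, not part of the committed diff): the explicit factorization rules above are passed via `factor_map`; the snippet below mirrors the usage pattern in the tests that follow, with a made-up parameter name and shape.

import jax.numpy as jnp
from t5x import adafactor

_ROW = adafactor.FactorDim.ROW
_COL = adafactor.FactorDim.COLUMN
_BATCH = adafactor.FactorDim.BATCH

# A rank-3 parameter (e.g. fused heads x input x output) requires an explicit rule.
params = {'qkv': jnp.ones((3, 128, 256))}
# Map the parameter-path regex 'qkv' to one FactorDim entry per array dimension.
factor_map = adafactor.HParamMap(((r'qkv', (_BATCH, _ROW, _COL)),))
opt_def = adafactor.Adafactor(learning_rate=0.1, factor_map=factor_map)
optimizer = opt_def.create(params)
# The v_row/v_col accumulators keep the BATCH axis, so RMS scale and clipping
# statistics are computed within each fused sub-array rather than across them.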
t5x/adafactor_test.py ADDED
@@ -0,0 +1,527 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for t5x.adafactor."""
16
+
17
+ import functools
18
+ import operator
19
+ from typing import Sequence
20
+
21
+ from absl.testing import absltest
22
+ from absl.testing import parameterized
23
+
24
+ import flax
25
+ from flax import optim # used for equivalence testing only
26
+ from flax import traverse_util
27
+ import jax
28
+ from jax import numpy as jnp
29
+ from jax import random
30
+ import numpy as np
31
+
32
+ from t5x import adafactor
33
+ from t5x import optimizers
34
+
35
+ OptimizerState = optimizers.OptimizerState
36
+
37
+ _AdafactorHyperParams = adafactor._AdafactorHyperParams
38
+ _AdafactorParamState = adafactor._AdafactorParamState
39
+
40
+ _BATCH = adafactor.FactorDim.BATCH
41
+ _ROW = adafactor.FactorDim.ROW
42
+ _COL = adafactor.FactorDim.COLUMN
43
+
44
+ # Testing helpers
45
+
46
+
47
+ def _assert_numpy_allclose(a, b, atol=None, rtol=None):
48
+ a, b = jnp.array(a), jnp.array(b)
49
+ a = a.astype(np.float32) if a.dtype == jnp.bfloat16 else a
50
+ b = b.astype(np.float32) if b.dtype == jnp.bfloat16 else b
51
+ kw = {}
52
+ if atol:
53
+ kw['atol'] = atol
54
+ if rtol:
55
+ kw['rtol'] = rtol
56
+ np.testing.assert_allclose(a, b, **kw)
57
+
58
+
59
+ def check_eq(xs, ys, atol=None, rtol=None):
60
+ xs_leaves, xs_tree = jax.tree_flatten(xs)
61
+ ys_leaves, ys_tree = jax.tree_flatten(ys)
62
+ assert xs_tree == ys_tree, f"Tree shapes don't match. \n{xs_tree}\n{ys_tree}"
63
+ assert jax.tree_util.tree_all(
64
+ jax.tree_multimap(lambda x, y: np.array(x).shape == np.array(y).shape,
65
+ xs_leaves, ys_leaves)), "Leaves' shapes don't match."
66
+ assert jax.tree_multimap(
67
+ functools.partial(_assert_numpy_allclose, atol=atol, rtol=rtol),
68
+ xs_leaves, ys_leaves)
69
+
70
+
71
+ def flattened_state_dict(x):
72
+ s = flax.serialization.to_state_dict(x)
73
+ return flax.traverse_util.flatten_dict(s, sep='/')
74
+
75
+
76
+ def tree_shape(x):
77
+ return jax.tree_map(jnp.shape, x)
78
+
79
+
80
+ def tree_equals(x, y):
81
+ return jax.tree_util.tree_all(jax.tree_multimap(operator.eq, x, y))
82
+
83
+
84
+ def _get_multi_adafactor(
85
+ learning_rate: float, step_offset: int,
86
+ adafactor_exclude_from_parameter_scale: Sequence[str]
87
+ ) -> optim.MultiOptimizer:
88
+ """Get adafactor with support for excluding some parameters from scaling."""
89
+
90
+ def _should_not_scale(path):
91
+ return any([s in path for s in adafactor_exclude_from_parameter_scale])
92
+
93
+ scaled_vars = traverse_util.ModelParamTraversal(
94
+ lambda path, _: not _should_not_scale(path))
95
+ unscaled_vars = traverse_util.ModelParamTraversal(
96
+ lambda path, _: _should_not_scale(path))
97
+ scaled_opt = optim.Adafactor(
98
+ learning_rate, decay_rate=0.8, step_offset=step_offset)
99
+ unscaled_opt = optim.Adafactor(
100
+ learning_rate,
101
+ decay_rate=0.8,
102
+ step_offset=step_offset,
103
+ multiply_by_parameter_scale=False)
104
+ return optim.MultiOptimizer((scaled_vars, scaled_opt),
105
+ (unscaled_vars, unscaled_opt))
106
+
107
+
108
+ # Inline test data
109
+
110
+ MODEL_SHAPE = {
111
+ 'decoder': {
112
+ 'decoder_norm': {'scale': [128]},
113
+ 'layers_0': {
114
+ 'encoder_decoder_attention': {
115
+ 'key': {'kernel': [128, 256]},
116
+ 'out': {'kernel': [256, 128]},
117
+ 'query': {'kernel': [128, 256]},
118
+ 'value': {'kernel': [128, 256]}},
119
+ 'mlp': {
120
+ 'wi': {'kernel': [128, 512]},
121
+ 'wo': {'kernel': [512, 128]}},
122
+ 'pre_cross_attention_layer_norm': {'scale': [128]},
123
+ 'pre_mlp_layer_norm': {'scale': [128]},
124
+ 'pre_self_attention_layer_norm': {'scale': [128]},
125
+ 'self_attention': {
126
+ 'key': {'kernel': [128, 256]},
127
+ 'out': {'kernel': [256, 128]},
128
+ 'query': {'kernel': [128, 256]},
129
+ 'value': {'kernel': [128, 256]}}},
130
+ 'layers_1': {
131
+ 'encoder_decoder_attention': {
132
+ 'key': {'kernel': [128, 128]},
133
+ 'out': {'kernel': [128, 128]},
134
+ 'query': {'kernel': [128, 128]},
135
+ 'value': {'kernel': [128, 128]}},
136
+ 'mlp': {
137
+ 'wi': {'kernel': [128, 512]},
138
+ 'wo': {'kernel': [512, 128]}},
139
+ 'pre_cross_attention_layer_norm': {'scale': [128]},
140
+ 'pre_mlp_layer_norm': {'scale': [128]},
141
+ 'pre_self_attention_layer_norm': {'scale': [128]},
142
+ 'self_attention': {
143
+ 'key': {'kernel': [128, 256]},
144
+ 'out': {'kernel': [256, 128]},
145
+ 'query': {'kernel': [128, 256]},
146
+ 'value': {'kernel': [128, 256]}}},
147
+ 'relpos_bias': {'rel_embedding': [2, 32]}},
148
+ 'encoder': {
149
+ 'encoder_norm': {'scale': [128]},
150
+ 'layers_0': {
151
+ 'attention': {
152
+ 'key': {'kernel': [128, 256]},
153
+ 'out': {'kernel': [256, 128]},
154
+ 'query': {'kernel': [128, 256]},
155
+ 'value': {'kernel': [128, 256]}},
156
+ 'mlp': {
157
+ 'wi': {'kernel': [128, 512]},
158
+ 'wo': {'kernel': [512, 128]}},
159
+ 'pre_attention_layer_norm': {'scale': [128]},
160
+ 'pre_mlp_layer_norm': {'scale': [128]}},
161
+ 'layers_1': {
162
+ 'attention': {
163
+ 'key': {'kernel': [128, 256]},
164
+ 'out': {'kernel': [256, 128]},
165
+ 'query': {'kernel': [128, 256]},
166
+ 'value': {'kernel': [128, 256]}},
167
+ 'mlp': {
168
+ 'wi': {'kernel': [128, 512]},
169
+ 'wo': {'kernel': [512, 128]}},
170
+ 'pre_attention_layer_norm': {'scale': [128]},
171
+ 'pre_mlp_layer_norm': {'scale': [128]}},
172
+ 'relpos_bias': {'rel_embedding': [2, 32]}},
173
+ 'token_embedder': {'embedding': [32128, 128]}} # pyformat: disable
174
+
175
+
176
+ class AdafactorTest(parameterized.TestCase):
177
+
178
+ # Classic Adafactor Behavior Tests
179
+
180
+ def test_2D_simple(self):
181
+ x = {'a': jnp.ones((24, 16))}
182
+ opt_def = adafactor.Adafactor(min_dim_size_to_factor=8)
183
+ optimizer = opt_def.create(x)
184
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
185
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (24,), 'a/v_row': (16,)}
186
+ self.assertTrue(tree_equals(shapes, ref))
187
+
188
+ def test_2D_simple_nofactor(self):
189
+ x = {'a': jnp.ones((24, 16))}
190
+ opt_def = adafactor.Adafactor(min_dim_size_to_factor=32)
191
+ optimizer = opt_def.create(x)
192
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
193
+ ref = {'a/m': (1,), 'a/v': (24, 16), 'a/v_col': (1,), 'a/v_row': (1,)}
194
+ self.assertTrue(tree_equals(shapes, ref))
195
+
196
+ def test_2D_simple_nofactor_momentum(self):
197
+ x = {'a': jnp.ones((24, 16))}
198
+ opt_def = adafactor.Adafactor(min_dim_size_to_factor=32, beta1=0.1)
199
+ optimizer = opt_def.create(x)
200
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
201
+ ref = {'a/m': (24, 16), 'a/v': (24, 16), 'a/v_col': (1,), 'a/v_row': (1,)}
202
+ self.assertTrue(tree_equals(shapes, ref))
203
+
204
+ def test_3D_simple(self):
205
+ x = {'a': jnp.ones((24, 4, 16))}
206
+ factor_map = adafactor.HParamMap((('a', (_COL, _BATCH, _ROW)),))
207
+ opt_def = adafactor.Adafactor(
208
+ min_dim_size_to_factor=8, factor_map=factor_map)
209
+ optimizer = opt_def.create(x)
210
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
211
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (24, 4), 'a/v_row': (4, 16)}
212
+ self.assertTrue(tree_equals(shapes, ref))
213
+
214
+ def test_init_state(self):
215
+ params = {'x': np.zeros((3, 2))}
216
+ optimizer_def = adafactor.Adafactor(
217
+ learning_rate=0.1, decay_rate=0.8, beta1=None, min_dim_size_to_factor=0)
218
+ state = optimizer_def.init_state(params)
219
+
220
+ expected_hyper_params = _AdafactorHyperParams(0.1, True, True, None, 0.8, 0,
221
+ 1.0, None, 0, 1e-30, 1e-3)
222
+ self.assertEqual(optimizer_def.hyper_params, expected_hyper_params)
223
+ expected_state = OptimizerState(
224
+ 0, {
225
+ 'x':
226
+ _AdafactorParamState(
227
+ np.zeros((2,)), np.zeros((3,)), np.zeros(
228
+ (1,)), np.zeros((1,)))
229
+ })
230
+ check_eq(state, expected_state)
231
+
232
+ # unfactorized
233
+ optimizer_def = adafactor.Adafactor(
234
+ learning_rate=0.1, decay_rate=0.8, beta1=0.0, min_dim_size_to_factor=32)
235
+ state = optimizer_def.init_state(params)
236
+
237
+ expected_hyper_params = _AdafactorHyperParams(0.1, True, True, 0.0, 0.8, 0,
238
+ 1.0, None, 32, 1e-30, 1e-3)
239
+ self.assertEqual(optimizer_def.hyper_params, expected_hyper_params)
240
+ expected_state = OptimizerState(
241
+ 0, {
242
+ 'x':
243
+ _AdafactorParamState(
244
+ np.zeros((1,)), np.zeros((1,)), np.zeros(
245
+ (3, 2)), np.zeros((3, 2)))
246
+ })
247
+ check_eq(state, expected_state)
248
+
249
+ def test_apply_gradient(self):
250
+ optimizer_def = adafactor.Adafactor(
251
+ learning_rate=0.1, decay_rate=0.8, min_dim_size_to_factor=0)
252
+ params = {'x': np.ones((3, 2), np.float32)}
253
+ state = OptimizerState(
254
+ 1, {
255
+ 'x':
256
+ _AdafactorParamState(
257
+ np.array([0.9, 0.9]), np.array([0.1, 0.1, 0.1]),
258
+ np.zeros((1,)), np.zeros((1,)))
259
+ })
260
+ grads = {'x': np.ones((3, 2), np.float32)}
261
+ new_params, new_state = optimizer_def.apply_gradient(
262
+ optimizer_def.hyper_params, params, state, grads)
263
+ expected_new_state = OptimizerState(
264
+ 2, {
265
+ 'x':
266
+ _AdafactorParamState(
267
+ np.array([0.9574349, 0.9574349]),
268
+ np.array([0.6169143, 0.6169143, 0.6169143]), np.zeros(
269
+ (1,)), np.zeros((1,)))
270
+ })
271
+ expected_new_params = {'x': 0.9 * np.ones((3, 2))}
272
+ check_eq(new_params, expected_new_params)
273
+ check_eq(new_state, expected_new_state, rtol=1e-6)
274
+
275
+ # unfactored w momentum
276
+ optimizer_def = adafactor.Adafactor(
277
+ learning_rate=0.1, beta1=0.0, decay_rate=0.8, min_dim_size_to_factor=32)
278
+ params = {'x': np.ones((3, 2), np.float32)}
279
+ state = OptimizerState(
280
+ 1, {
281
+ 'x':
282
+ _AdafactorParamState(
283
+ np.zeros(1,), np.zeros(1,), 0.5 * np.ones(
284
+ (3, 2)), np.zeros((3, 2)))
285
+ })
286
+ grads = {'x': np.ones((3, 2), np.float32)}
287
+ new_params, new_state = optimizer_def.apply_gradient(
288
+ optimizer_def.hyper_params, params, state, grads)
289
+ expected_new_params = {'x': 0.9 * np.ones((3, 2))}
290
+ check_eq(new_params, expected_new_params)
291
+ expected_new_state = OptimizerState(
292
+ 2, {
293
+ 'x':
294
+ _AdafactorParamState(
295
+ np.array([0.0]), np.array([0.0]), 0.787174 * np.ones(
296
+ (3, 2)), 0.1 * np.ones((3, 2)))
297
+ })
298
+ check_eq(new_state, expected_new_state, rtol=1e-6)
299
+
300
+ def test_apply_gradient_with_global_norm_clipping(self):
301
+ optimizer_def = adafactor.Adafactor(
302
+ learning_rate=0.1,
303
+ decay_rate=0.8,
304
+ min_dim_size_to_factor=0,
305
+ global_norm_clip_threshold=1.0)
306
+ params = {'x': np.ones((3, 2), np.float32)}
307
+ state = OptimizerState(
308
+ 1, {
309
+ 'x':
310
+ _AdafactorParamState(
311
+ np.array([0.9, 0.9]), np.array([0.1, 0.1, 0.1]),
312
+ np.zeros((1,)), np.zeros((1,)))
313
+ })
314
+ grads = {'x': np.ones((3, 2), np.float32)}
315
+ new_params, new_state = optimizer_def.apply_gradient(
316
+ optimizer_def.hyper_params, params, state, grads)
317
+ expected_new_state = OptimizerState(
318
+ 2, {
319
+ 'x':
320
+ _AdafactorParamState(
321
+ np.array([0.478811, 0.478811]),
322
+ np.array([0.13829, 0.13829, 0.13829]), np.zeros(
323
+ (1,)), np.zeros((1,)))
324
+ })
325
+ expected_new_params = {'x': 0.9 * np.ones((3, 2))}
326
+ check_eq(new_params, expected_new_params)
327
+ check_eq(new_state, expected_new_state, rtol=1e-6)
328
+
329
+ def test_factorizes(self):
330
+ params = {'x': np.zeros((64, 64))}
331
+ optimizer_def = adafactor.Adafactor(
332
+ learning_rate=0.1,
333
+ decay_rate=0.8,
334
+ beta1=None,
335
+ min_dim_size_to_factor=32)
336
+ state = optimizer_def.init_state(params)
337
+ self.assertEqual(state.param_states['x'].v.shape, (1,))
338
+ self.assertEqual(state.param_states['x'].m.shape, (1,))
339
+ self.assertEqual(state.param_states['x'].v_row.shape, (64,))
340
+ self.assertEqual(state.param_states['x'].v_col.shape, (64,))
341
+
342
+ params = {'x': np.zeros((31, 64))}
343
+ optimizer_def = adafactor.Adafactor(
344
+ learning_rate=0.1,
345
+ decay_rate=0.8,
346
+ beta1=None,
347
+ min_dim_size_to_factor=32)
348
+ state = optimizer_def.init_state(params)
349
+ self.assertEqual(state.param_states['x'].v.shape, (31, 64))
350
+ self.assertEqual(state.param_states['x'].m.shape, (1,))
351
+ self.assertEqual(state.param_states['x'].v_row.shape, (1,))
352
+ self.assertEqual(state.param_states['x'].v_col.shape, (1,))
353
+
354
+ # Manually specified factorization rules tests.
355
+
356
+ @parameterized.parameters(
357
+ {'rule': (_ROW, _COL)},
358
+ {'rule': (_COL, _ROW)},
359
+ )
360
+ def test_2D_ignore_specified_factor_rule(self, rule):
361
+ x = {'a': jnp.ones((24, 16))}
362
+ factor_map = adafactor.HParamMap((('a', rule),))
363
+ opt_def = adafactor.Adafactor(
364
+ min_dim_size_to_factor=8, factor_map=factor_map)
365
+ optimizer = opt_def.create(x)
366
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
367
+ # Since the param is 2D, the explicit factor rule is ignored and we fall
368
+ # back to heuristics where v_row corresponds to the smaller dim.
369
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (24,), 'a/v_row': (16,)}
370
+ self.assertTrue(tree_equals(shapes, ref))
371
+
372
+ def test_3D_simple_manual_rules(self):
373
+ x = {'a': jnp.ones((24, 4, 16))}
374
+
375
+ factor_map = adafactor.HParamMap((('a', (_COL, _BATCH, _ROW)),))
376
+ opt_def = adafactor.Adafactor(
377
+ min_dim_size_to_factor=8, factor_map=factor_map)
378
+ optimizer = opt_def.create(x)
379
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
380
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (24, 4), 'a/v_row': (4, 16)}
381
+ self.assertTrue(tree_equals(shapes, ref))
382
+
383
+ factor_map = adafactor.HParamMap((('a', (_ROW, _BATCH, _COL)),))
384
+ opt_def = adafactor.Adafactor(
385
+ min_dim_size_to_factor=8, factor_map=factor_map)
386
+ optimizer = opt_def.create(x)
387
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
388
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (4, 16), 'a/v_row': (24, 4)}
389
+ self.assertTrue(tree_equals(shapes, ref))
390
+
391
+ factor_map = adafactor.HParamMap((('a', (_COL, _ROW, _ROW)),))
392
+ opt_def = adafactor.Adafactor(
393
+ min_dim_size_to_factor=8, factor_map=factor_map)
394
+ optimizer = opt_def.create(x)
395
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
396
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (24,), 'a/v_row': (4, 16)}
397
+ self.assertTrue(tree_equals(shapes, ref))
398
+
399
+ factor_map = adafactor.HParamMap((('a', (_COL, _COL, _ROW)),))
400
+ opt_def = adafactor.Adafactor(
401
+ min_dim_size_to_factor=8, factor_map=factor_map)
402
+ optimizer = opt_def.create(x)
403
+ shapes = tree_shape(flattened_state_dict(optimizer.state.param_states))
404
+ ref = {'a/m': (1,), 'a/v': (1,), 'a/v_col': (24, 4), 'a/v_row': (16,)}
405
+ self.assertTrue(tree_equals(shapes, ref))
406
+
407
+ def test_standard_factor_rules(self):
408
+ # one-off test to double-check that we're following the previous
409
+ # heuristic convention for rows/columns.
410
+ def test_standard_factor_rules():
411
+ token_embedding = (_COL, _ROW)
412
+ attn_qkv = (_ROW, _COL)
413
+ attn_out = (_COL, _ROW)
414
+ mlp_in = (_ROW, _COL)
415
+ mlp_out = (_COL, _ROW)
416
+ return ((r'_layer_norm/(bias|scale)',
417
+ None), (r'(encoder|decoder)_norm/(bias|scale)', None),
418
+ (r'(encoder_decoder_|self_|\b)attention/(query|key|value)/kernel',
419
+ attn_qkv), (r'(encoder_decoder_|self_|\b)attention/out/kernel',
420
+ attn_out), (r'mlp/DenseGeneral_\d+/bias', None),
421
+ (r'mlp/wi(_\d+)?/kernel', mlp_in), (r'mlp/wo/kernel', mlp_out),
422
+ (r'\brelpos_bias', None), (r'token_embedder', token_embedding),
423
+ (r'.*', adafactor.HEURISTIC_RULE))
424
+
425
+ # create fake model parameters
426
+ k = jax.random.PRNGKey(0)
427
+ params = jax.tree_map(
428
+ lambda shape: jax.random.uniform(k, shape),
429
+ MODEL_SHAPE,
430
+ is_leaf=lambda x: isinstance(x, list))
431
+ # make traditional adafactor state with heuristic
432
+ factor_map1 = adafactor.HParamMap(((r'.*', adafactor.HEURISTIC_RULE),))
433
+ optimizer_def1 = adafactor.Adafactor(
434
+ 0.1,
435
+ decay_rate=0.8,
436
+ step_offset=0,
437
+ multiply_by_parameter_scale=True,
438
+ factor_map=factor_map1)
439
+ optimizer1 = optimizer_def1.create(params)
440
+ # make traditional adafactor state with explicit rules
441
+ factor_map2 = adafactor.HParamMap(test_standard_factor_rules())
442
+ optimizer_def2 = adafactor.Adafactor(
443
+ 0.1,
444
+ decay_rate=0.8,
445
+ step_offset=0,
446
+ multiply_by_parameter_scale=True,
447
+ factor_map=factor_map2)
448
+ optimizer2 = optimizer_def2.create(params)
449
+ # are they the same?
450
+ check_eq(optimizer1.state.param_states, optimizer2.state.param_states)
451
+
452
+ @parameterized.parameters(
453
+ {'shape': (64, 64)},
454
+ {'shape': (64, 132)},
455
+ {'shape': (132, 64)},
456
+ {'shape': (132, 132)},
457
+ {'shape': (132, 140)},
458
+ {'shape': (140, 132)},
459
+ )
460
+ def test_no_factor_map_equivalence(self, shape):
461
+ k = random.PRNGKey(0)
462
+ k1, k2 = random.split(k)
463
+ p = {'a': random.uniform(k1, shape)}
464
+ g = {'a': random.uniform(k2, shape)}
465
+
466
+ orig_opt = optim.Adafactor(0.1).create(p)
467
+ new_opt = adafactor.Adafactor(0.1, factor_map=None).create(p)
468
+ check_eq(orig_opt.state_dict(), new_opt.state_dict())
469
+
470
+ orig_opt1 = orig_opt.apply_gradient(g)
471
+ new_opt1 = new_opt.apply_gradient(g)
472
+ check_eq(orig_opt1.state_dict(), new_opt1.state_dict())
473
+
474
+ @parameterized.parameters({
475
+ 'shape': (128, 128),
476
+ 'rule': (_ROW, _COL)
477
+ }, {
478
+ 'shape': (132, 128),
479
+ 'rule': (_COL, _ROW)
480
+ }, {
481
+ 'shape': (128, 132),
482
+ 'rule': (_ROW, _COL)
483
+ })
484
+ def test_simple_equivalence(self, shape, rule):
485
+ k = random.PRNGKey(0)
486
+ k1, k2 = random.split(k)
487
+ k3, k4 = random.split(k1)
488
+ k5, k6 = random.split(k2)
489
+
490
+ p = {'a': random.uniform(k3, shape), 'b': random.uniform(k4, shape)}
491
+ g = {'a': random.uniform(k5, shape), 'b': random.uniform(k6, shape)}
492
+
493
+ orig_opt = optim.Adafactor(0.1).create(p)
494
+ factor_map = adafactor.HParamMap(
495
+ rules=((('a'), rule), ('.*', adafactor.HEURISTIC_RULE)))
496
+ new_opt = adafactor.Adafactor(0.1, factor_map=factor_map).create(p)
497
+ check_eq(orig_opt.state_dict(), new_opt.state_dict())
498
+
499
+ orig_opt1 = orig_opt.apply_gradient(g)
500
+ new_opt1 = new_opt.apply_gradient(g)
501
+ check_eq(orig_opt1.state_dict(), new_opt1.state_dict())
502
+
503
+ @parameterized.parameters({'shape': (64, 64)}, {'shape': (132, 132)})
504
+ def test_multiply_by_parameter_scale_equivalence(self, shape):
505
+ # Use large parameter values to magnify the parameter scaling effect.
506
+ p = {'a': np.random.randn(*shape) * 100, 'b': np.random.randn(*shape) * 100}
507
+ g = {'a': np.random.randn(*shape), 'b': np.random.randn(*shape)}
508
+ orig_opt = _get_multi_adafactor(
509
+ 3.0, 0, adafactor_exclude_from_parameter_scale=('a',)).create(p)
510
+ scaling_map = adafactor.HParamMap([('a', False), ('.*', True)])
511
+ new_opt = adafactor.Adafactor(
512
+ 3.0, multiply_by_parameter_scale=scaling_map).create(p)
513
+ check_eq(orig_opt.state_dict(), new_opt.state_dict())
514
+
515
+ orig_opt1 = orig_opt.apply_gradient(g)
516
+ new_opt1 = new_opt.apply_gradient(g)
517
+ check_eq(orig_opt1.state_dict(), new_opt1.state_dict())
518
+
519
+ def test_3d_without_factor_map(self):
520
+ x = {'a': jnp.ones((24, 4, 16))}
521
+ opt_def = adafactor.Adafactor(factor_map=None)
522
+ with self.assertRaises(ValueError):
523
+ _ = opt_def.create(x)
524
+
525
+
526
+ if __name__ == '__main__':
527
+ absltest.main()
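For reference, the global-norm clipping performed in `apply_gradient` when `global_norm_clip_threshold` is set can be reproduced in isolation. This is a standalone illustration, not part of the diff:

import jax.numpy as jnp

threshold = 1.0
grads_flat = [jnp.ones((3, 2)), 2.0 * jnp.ones((4,))]
# Global norm over all gradient leaves.
global_norm = jnp.sqrt(sum(jnp.sum(jnp.square(g)) for g in grads_flat))
# Rescale so the global norm never exceeds the threshold (a no-op when it is
# already below the threshold).
scale = threshold * jnp.minimum(1.0 / threshold, 1.0 / global_norm)
grads_flat = [g * scale for g in grads_flat]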
t5x/checkpoint_importer.py ADDED
@@ -0,0 +1,485 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """T5 Checkpoint Importer."""
16
+
17
+ import asyncio
18
+ from concurrent.futures import thread
19
+ import re
20
+ from typing import Any, Callable, Mapping, MutableMapping, Optional, Union
21
+
22
+ from flax import traverse_util
23
+ import jax
24
+ from jax import numpy as jnp
25
+ import numpy as np
26
+ import orbax.checkpoint
27
+ import tensorflow as tf
28
+ import tensorstore as ts
29
+
30
+ # TODO(b/233659813): Cleanup clients depending on t5x.checkpoint_importer for
31
+ # LazyArray. Reconcile divergence in subclass implementation when possible.
32
+ LazyArray = orbax.checkpoint.lazy_array.LazyArray
33
+
34
+
35
+ # TODO(brianlester): The choice between using a `LazyThreadPoolArray` or a
36
+ # `LazyAwaitableArray` depends on whether the user-provided `get_fn` is blocking
37
+ # or async, respectively; if we could detect which it is, we could automatically
38
+ # proxy to the correct subclass. We cannot detect whether `get_fn` is a lambda
39
+ # that wraps an async call, so this isn't possible yet. Add this dispatch once we
40
+ # are able to detect that; python3.8+ can detect async for partial'ed functions
41
+ # but not lambdas.
42
+ class LazyThreadPoolArray(LazyArray):
43
+ """Lazily and asynchronously loads an array when the `get_fn` blocks."""
44
+
45
+ # Uses a global threadpool to enable asynchronous loading.
46
+ executor = thread.ThreadPoolExecutor()
47
+
48
+ def get_async(self) -> asyncio.Future:
49
+ return asyncio.wrap_future(self.executor.submit(self.get))
50
+
51
+ def get(self) -> np.ndarray:
52
+ arr = self._get_fn()
53
+ if arr.dtype != self.dtype:
54
+ arr = arr.astype(self.dtype)
55
+ return arr
56
+
57
+
58
+ class LazyAwaitableArray(LazyArray):
59
+ """Lazily and asynchronously loads an array when the `get_fn` is async.
60
+
61
+ Note:
62
+ The synchronous load method `.get` requires the asyncio event loop and
63
+ calling `.run_until_complete`. This is not supported when the event loop is
64
+ already running (for example, from inside another async function).
65
+
66
+ Note:
67
+ Currently, this class has a few helper methods for creating a
68
+ LazyAwaitableArray when the input could be either an array, or a TensorStore
69
+ spec. Most people use async code when dealing with TensorStore so the
70
+ classmethods have been placed here. When someone eventually uses a blocking
71
+ function to read from TensorStore they can be moved to the LazyArray base
72
+ class.
73
+ """
74
+
75
+ def get_async(self) -> asyncio.Future:
76
+
77
+ async def _get_and_cast():
78
+ # Pytype has a false positive here, where it treats our _get_fn (_read_ts
79
+ # in this case) as having a return type of `np.ndarray` instead of
80
+ # wrapping it in an Awaitable. Related to this bug
81
+ # https://github.com/google/pytype/issues/527
82
+ arr = await self._get_fn() # pytype: disable=bad-return-type
83
+ if arr.dtype != self.dtype:
84
+ arr = arr.astype(self.dtype)
85
+ return arr
86
+
87
+ return asyncio.ensure_future(_get_and_cast())
88
+
89
+ def get(self) -> np.ndarray:
90
+ loop = asyncio.get_event_loop()
91
+ return loop.run_until_complete(self.get_async())
92
+
93
+ @classmethod
94
+ def from_tensor_store_spec(
95
+ cls,
96
+ ts_spec: ts.Spec,
97
+ get_fn: Callable[[], np.ndarray],
98
+ dtype: Optional[jnp.dtype] = None) -> 'LazyAwaitableArray':
99
+ """Create a LazyAwaitableArray based on a tensorstore.Spec."""
100
+ ts_spec = ts_spec.to_json()
101
+ shape = ts_spec['metadata']['shape']
102
+ if dtype is None:
103
+ dtype = jnp.dtype(ts_spec['dtype'])
104
+ else:
105
+ dtype = jnp.dtype(dtype)
106
+ # v2 T5X checkpoints use uint16 as the TensorStore datatype and then store
107
+ # the bfloat16 bytes in the 16 bits that uint16 provides (no actual cast).
108
+ # When reading the dtype from the TensorStore, if we keep the dtype of these
109
+ # v2 checkpoints as np.uint16 then the _get_fn (which has a possible cast to
110
+ # support the `restore_dtype` parameter for the checkpointer) will actually
111
+ # cast the bfloat16 values to uint16, generally resulting in an array of all
112
+ # zeros. This check avoids the actual cast to uint16 by replacing the dtype.
113
+ if dtype == np.uint16:
114
+ dtype = jnp.bfloat16
115
+ return cls(shape, dtype, get_fn)
116
+
117
+ @classmethod
118
+ def from_array(cls,
119
+ array: np.ndarray,
120
+ get_fn: Callable[[], np.ndarray],
121
+ dtype: Optional[jnp.dtype] = None) -> 'LazyAwaitableArray':
122
+ """Create a LazyAwaitableArray based on an array or python number."""
123
+ if dtype is None:
124
+ dtype = array.dtype
125
+ else:
126
+ dtype = jnp.dtype(dtype)
127
+ return cls(array.shape, dtype, get_fn)
128
+
129
+ @classmethod
130
+ def from_tensor_store_spec_or_array(
131
+ cls,
132
+ maybe_ts_spec: Union[ts.Spec, np.ndarray],
133
+ get_fn: Callable[[], np.ndarray],
134
+ dtype: Optional[jnp.dtype] = None) -> 'LazyAwaitableArray':
135
+ """Create a LazyAwaitableArray based on an array or a tensorstore.Spec."""
136
+ if isinstance(maybe_ts_spec, ts.Spec):
137
+ return cls.from_tensor_store_spec(maybe_ts_spec, get_fn, dtype=dtype)
138
+ return cls.from_array(maybe_ts_spec, get_fn, dtype=dtype)
139
+
140
+
141
+ class CheckpointTranslator:
142
+ """Utility class for defining mapping rules from one flatdict to another.
143
+
144
+ We assume a checkpoint is loaded as a dictionary with flattened keys of the
145
+ form: 'name0/name1/name2/.../nameN'
146
+
147
+ A rule is added with the 'add' decorator, which takes a regex matching rule
148
+ and wraps a conversion function, feeding it (opts, key, val, **regex_groups)
149
+ where opts is a dict containing apply-time keyword options for use by the
150
+ conversion functions.
151
+ """
152
+
153
+ def __init__(self):
154
+ self.rules = []
155
+
156
+ def add(self, pattern):
157
+ """Adds a new keyval conversion rule.
158
+
159
+ Args:
160
+ pattern: regex with capture groups for matching given sets of model
161
+ variables. We terminate all regexes with '$' to force complete matches.
162
+
163
+ Returns:
164
+ Translation function decorator for associating with the provided
165
+ pattern.
166
+ """
167
+
168
+ def register_translation_fn_decorator(fn):
169
+ # We force a complete match by adding end-of-string match.
170
+ self.rules.append((re.compile(pattern + '$'), fn))
171
+ return fn
172
+
173
+ return register_translation_fn_decorator
174
+
175
+ def apply(self, flatdict, **opts):
176
+ """Applies rules to a flattened dictionary.
177
+
178
+ Args:
179
+ flatdict: flat-key dictionary of variables.
180
+ **opts: additional config options for translation rules supplied at
181
+ application time.
182
+
183
+ Returns:
184
+ Checkpoint data with translated key/values in flat-key dict format.
185
+ """
186
+ new_dict = {}
187
+ unmatched = {}
188
+ for k, v in flatdict.items():
189
+ matched = False
190
+ for rule_pat, rule_fn in self.rules:
191
+ if rule_pat.match(k):
192
+ groups = rule_pat.match(k).groups()
193
+ new_k, new_v = rule_fn(opts, k, v, *groups)
194
+ if new_k is not None:
195
+ new_dict[new_k] = new_v
196
+ matched = True
197
+ break
198
+ if not matched:
199
+ unmatched[k] = v
200
+
201
+ # We force every key-value pair in the checkpoint to have a rule associated with
202
+ # it.
203
+ if unmatched:
204
+ raise ValueError('Unmapped tensor keys exist: %s' % unmatched)
205
+
206
+ return new_dict
207
+
208
+
209
+ # Create a translation rule set for importing T5 & T5.1.1 model checkpoints.
210
+ # -----------------------------------------------------------------------------
211
+ t5_importer = CheckpointTranslator()
212
+
213
+ # Name mappings.
214
+ SLOT_MAP = {'_slot_vc': 'v_col', '_slot_vr': 'v_row', '_slot_v': 'v'}
215
+ TOWER_MAP = {'transformer': 'decoder'}
216
+
217
+
218
+ @t5_importer.add(r'global_step')
219
+ def global_step(opts, key, val):
220
+ del opts, key
221
+ return 'state/step', val.astype(np.int32).get() if isinstance(
222
+ val, LazyArray) else val
223
+
224
+
225
+ @t5_importer.add(r'shared/embedding(\w*)')
226
+ def shared_embeddings(opts, key, val, slot):
227
+ del opts, key
228
+ prefix = 'state/param_states' if slot else 'target'
229
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
230
+ newkey = f'{prefix}/token_embedder/embedding{suffix}'
231
+ return newkey, val
232
+
233
+
234
+ @t5_importer.add(r'(encoder|decoder|transformer)/embedding(\w*)')
235
+ def separate_embeddings(opts, key, val, encdec, slot):
236
+ del opts, key
237
+ prefix = 'state/param_states' if slot else 'target'
238
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
239
+ encdec = TOWER_MAP.get(encdec, encdec)
240
+ newkey = f'{prefix}/{encdec}/token_embedder/embedding{suffix}'
241
+ return newkey, val
242
+
243
+
244
+ # In the Mesh TensorFlow T5 code, relative_attention_bias always occurs in layer
245
+ # 0 because SelfAttention precedes other sublayers within the same block.
246
+ @t5_importer.add(
247
+ r'(encoder|decoder|transformer)/block_(\d+)/layer_000/SelfAttention/relative_attention_bias(\w*)'
248
+ )
249
+ def rel_embeddings(opts, key, val, encdec, blocknum, slot):
250
+ """Process relpos bias assuming that they are not shared across layers."""
251
+ del opts, key
252
+ prefix = 'state/param_states' if slot else 'target'
253
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
254
+ blocknum = int(blocknum)
255
+ encdec = TOWER_MAP.get(encdec, encdec)
256
+ # At this point, we can't determine whether the relpos bias was shared across
257
+ # layers or not. We first assume that it was not shared. During post
258
+ # processing, we remove the layers_0 scope if it was shared.
259
+ newkey = f'{prefix}/{encdec}/layers_{blocknum}/relpos_bias/rel_embedding{suffix}'
260
+ return newkey, val
261
+
262
+
263
+ @t5_importer.add(
264
+ r'(encoder|decoder|transformer)/block_(\d+)/layer_\d+/(SelfAttention|EncDecAttention)/(q|k|v|o)(\w*)'
265
+ )
266
+ def attention_layers(opts, key, val, encdec, blocknum, attntype, qkvo, slot):
267
+ """Process attention layers."""
268
+ del opts, key
269
+ prefix = 'state/param_states' if slot else 'target'
270
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
271
+ blocknum = int(blocknum)
272
+ encdec = TOWER_MAP.get(encdec, encdec)
273
+ matrix = {'q': 'query', 'k': 'key', 'v': 'value', 'o': 'out'}[qkvo]
274
+
275
+ if encdec == 'encoder':
276
+ attntype = 'attention'
277
+ else:
278
+ attntype = {
279
+ 'SelfAttention': 'self_attention',
280
+ 'EncDecAttention': 'encoder_decoder_attention'
281
+ }[attntype]
282
+ newkey = f'{prefix}/{encdec}/layers_{blocknum}/{attntype}/{matrix}/kernel{suffix}'
283
+ return newkey, val
284
+
285
+
286
+ @t5_importer.add(
287
+ r'(encoder|decoder|transformer)/block_(\d+)/layer_\d+/DenseReluDense/(wi|wo)(?:_(\d+))?/kernel(\w*)'
288
+ )
289
+ def mlpblock(opts, key, val, encdec, blocknum, io_name, io_num, slot):
290
+ """Process MLP blocks."""
291
+ del opts, key
292
+ prefix = 'state/param_states' if slot else 'target'
293
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
294
+ blocknum = int(blocknum)
295
+ encdec = TOWER_MAP.get(encdec, encdec)
296
+ io_num = f'_{io_num}' if io_num else ''
297
+ newkey = f'{prefix}/{encdec}/layers_{blocknum}/mlp/{io_name}{io_num}/kernel{suffix}'
298
+ return newkey, val
299
+
300
+
301
+ @t5_importer.add(
302
+ r'(encoder|decoder|transformer)/block_(\d+)/layer_(\d+)/(?:layer|rms)_norm/scale(\w*)'
303
+ )
304
+ def layernorms(opts, key, val, encdec, blocknum, lyrnum, slot):
305
+ """Process layer norms assuming that they are pre-layernorms."""
306
+ del opts, key
307
+ prefix = 'state/param_states' if slot else 'target'
308
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
309
+ lyrnum = int(lyrnum)
310
+
311
+ if encdec == 'transformer':
312
+ layernorm_type = ['pre_self_attention_layer_norm',
313
+ 'pre_mlp_layer_norm'][lyrnum]
314
+
315
+ elif encdec == 'encoder':
316
+ layernorm_type = ['pre_attention_layer_norm', 'pre_mlp_layer_norm'][lyrnum]
317
+ else: # decoder
318
+ layernorm_type = [
319
+ 'pre_self_attention_layer_norm', 'pre_cross_attention_layer_norm',
320
+ 'pre_mlp_layer_norm'
321
+ ][lyrnum]
322
+
323
+ encdec = TOWER_MAP.get(encdec, encdec)
324
+ newkey = f'{prefix}/{encdec}/layers_{int(blocknum)}/{layernorm_type}/scale{suffix}'
325
+ return newkey, val
326
+
327
+
328
+ @t5_importer.add(
329
+ r'(encoder|decoder|transformer)/(?:final_layer|rms)_norm/scale(\w*)')
330
+ def final_layernorms(opts, key, val, encdec, slot):
331
+ """Process final layer norms."""
332
+ del opts, key
333
+ prefix = 'state/param_states' if slot else 'target'
334
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
335
+ norm = {
336
+ 'encoder': 'encoder_norm',
337
+ 'decoder': 'decoder_norm',
338
+ 'transformer': 'decoder_norm'
339
+ }[encdec]
340
+ encdec = TOWER_MAP.get(encdec, encdec)
341
+ newkey = f'{prefix}/{encdec}/{norm}/scale{suffix}'
342
+ return newkey, val
343
+
344
+
345
+ @t5_importer.add(r'(?:decoder|transformer)/logits/kernel(\w*)')
346
+ def final_logits(opts, key, val, slot):
347
+ del opts, key
348
+ prefix = 'state/param_states' if slot else 'target'
349
+ suffix = '/' + SLOT_MAP[slot] if slot else ''
350
+ newkey = f'{prefix}/decoder/logits_dense/kernel{suffix}'
351
+ return newkey, val
352
+
353
+
354
+ def _add_missing_param_states(t5_data):
355
+ """Add dummy slots that Flax Adafactor requires but TF does not."""
356
+ updates = {}
357
+ for k in t5_data:
358
+ if k.startswith('target'):
359
+ state_leaf = 'state/param_states' + k[len('target'):]
360
+ updates[state_leaf + '/m'] = np.zeros((1,), np.float32)
361
+ if state_leaf + '/v' in t5_data:
362
+ updates[state_leaf + '/v_row'] = np.zeros((1,), np.float32)
363
+ updates[state_leaf + '/v_col'] = np.zeros((1,), np.float32)
364
+ elif state_leaf + '/v_row' in t5_data:
365
+ updates[state_leaf + '/v'] = np.zeros((1,), np.float32)
366
+ t5_data.update(**updates)
367
+ return t5_data
368
+
369
+
370
+ def _maybe_correct_relpos_bias(t5_data):
371
+ """Correct the relpos_bias format if it is shared across layers."""
372
+ max_layer_ind = 0
373
+ for k, v in t5_data.items():
374
+ match = re.search(r'layers_(\d+)/relpos_bias', k)
375
+ if match:
376
+ layer_ind = int(match.groups()[0])
377
+ max_layer_ind = max(max_layer_ind, layer_ind)
378
+
379
+ modified_dict = {}
380
+ if max_layer_ind == 0:
381
+ # Relative position biases are shared across layers
382
+ for k, v in t5_data.items():
383
+ new_k = re.sub(r'layers_\d+/relpos_bias', 'relpos_bias', k)
384
+ modified_dict[new_k] = v
385
+ else:
386
+ # Relative position biases are unique in each layer. No more processing is
387
+ # necessary.
388
+ modified_dict = t5_data
389
+
390
+ return modified_dict
391
+
392
+
393
+ # Load checkpoint, translate, and update flax optimizer and model.
394
+ # -----------------------------------------------------------------------------
395
+ def load_tf_ckpt(path):
396
+ """Load a TF checkpoint as a flat dictionary of numpy arrays."""
397
+ ckpt_reader = tf.train.load_checkpoint(path)
398
+ ckpt_shape_map = ckpt_reader.get_variable_to_shape_map()
399
+ ckpt_dtype_map = ckpt_reader.get_variable_to_dtype_map()
400
+ datamap = { # pylint: disable=g-complex-comprehension
401
+ k: LazyThreadPoolArray(
402
+ s,
403
+ jnp.dtype(ckpt_dtype_map[k].as_numpy_dtype),
404
+ lambda x=k: ckpt_reader.get_tensor(x))
405
+ for k, s in ckpt_shape_map.items()
406
+ }
407
+ return datamap
408
+
409
+
410
+ def _update_state_dict(state_dict: Mapping[str, Any],
411
+ t5_data: MutableMapping[str, LazyArray],
412
+ strict: bool = True) -> Mapping[str, Any]:
413
+ """Update flax optimizer for T5 model.
414
+
415
+ Args:
416
+ state_dict: Optimizer to update with T5 parameters.
417
+ t5_data: T5 model parameters, typically loaded from a checkpoint.
418
+ strict: If True requires that optimizer and t5_data mappings contain the
419
+ same set of names (variables). If False, updating will succeed even if
420
+ t5_data contains variables not in the optimizer. If the optimizer has
421
+ variables not in t5_data, this function will still fail.
422
+
423
+ Returns:
424
+ Updated optimizer.
425
+ """
426
+ flat_state_dict = traverse_util.flatten_dict(state_dict, sep='/')
427
+
428
+ # Remove parameters from the checkpoint not found in the optimizer (this
429
+ # allows us to load checkpoints that contain more parameters than our current
430
+ # model).
431
+ if not strict:
432
+ for k in list(t5_data):
433
+ if k not in flat_state_dict:
434
+ t5_data.pop(k)
435
+
436
+ # Shape check.
437
+ for k, v in t5_data.items():
438
+ if flat_state_dict[k].shape != v.shape:
439
+ raise ValueError(
440
+ f'Variable {k} has shape {v.shape} != {flat_state_dict[k].shape}')
441
+ flat_state_dict = t5_data
442
+ state_dict = traverse_util.unflatten_dict(
443
+ {tuple(k.split('/')): v for k, v in flat_state_dict.items()})
444
+ return state_dict
445
+
446
+
447
+ def restore_from_t5_checkpoint(
448
+ state_dict: Mapping[str, Any],
449
+ path: str,
450
+ lazy_parameters: bool = False,
451
+ strict: bool = True,
452
+ translator: Optional[CheckpointTranslator] = None) -> Mapping[str, Any]:
453
+ """Load T5 checkpoint and update Adafactor optimizer and T5 model from it.
454
+
455
+ We require that the final translated checkpoint structure exactly matches
456
+ that of the Flax Adafactor + Transformer data, up to shape agreement of
457
+ the leaves.
458
+
459
+ Args:
460
+ state_dict: Flax Adafactor Optimizer for T5 transformer encoder-decoder.
461
+ path: a path to checkpoint file or directory.
462
+ lazy_parameters: whether to leave the parameters as LazyArrays to preserve
463
+ memory.
464
+ strict: If True requires that optimizer and t5_data mappings contain the
465
+ same set of names (variables). If False, updating will succeed even if
466
+ t5_data contains variables not in the optimizer. If the optimizer has
467
+ variables not in t5_data, this function will still fail.
468
+ translator: The mapping rules for conversion. If None, then default T5
469
+ conversion rules will be used.
470
+
471
+ Returns:
472
+ Adafactor optimizer updated with parameters and optimizer state from
473
+ T5 checkpoint.
474
+ """
475
+ if translator is None:
476
+ translator = t5_importer
477
+ ckpt_data = load_tf_ckpt(path)
478
+ t5_data = translator.apply(ckpt_data)
479
+ t5_data = _add_missing_param_states(t5_data)
480
+ t5_data = _maybe_correct_relpos_bias(t5_data)
481
+ state_dict = _update_state_dict(state_dict, t5_data, strict=strict)
482
+ if not lazy_parameters:
483
+ state_dict = jax.tree_map(
484
+ lambda x: x.get() if isinstance(x, LazyArray) else x, state_dict)
485
+ return state_dict
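The `CheckpointTranslator` above is driven entirely by regex rules. A minimal sketch of registering and applying a custom rule (the key names are hypothetical, not taken from a real checkpoint):

from t5x import checkpoint_importer

translator = checkpoint_importer.CheckpointTranslator()

@translator.add(r'my_scope/(\w+)/kernel')
def my_kernels(opts, key, val, name):
  # Rename 'my_scope/<name>/kernel' to 'target/<name>/kernel'.
  del opts, key
  return f'target/{name}/kernel', val

flat_ckpt = {'my_scope/dense/kernel': 1}
print(translator.apply(flat_ckpt))  # {'target/dense/kernel': 1}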
t5x/checkpoint_importer_test.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright 2022 The T5X Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Tests for t5x.checkpoint_importer."""
+
+ import json
+ import os
+
+ from absl import flags
+ from absl.testing import absltest
+ import jax
+ import numpy as np
+ from t5x import checkpoint_importer
+ import tensorflow as tf
+
+
+ class CheckpointImporterTest(absltest.TestCase):
+
+   def test_rel_embeddings_shared_layers(self):
+     # This represents a ckpt where the Mesh TensorFlow's
+     # transformer_layers.SelfAttention.relative_attention_type = "bias_shared",
+     # i.e., the same relative attention parameters are shared by all layers
+     # within the (en|de)coder.
+     ckpt_data = {
+         'encoder/block_000/layer_000/SelfAttention/relative_attention_bias':
+             1,
+         'decoder/block_000/layer_000/SelfAttention/relative_attention_bias':
+             2,
+         'decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v':
+             3,
+     }
+     t5_data = checkpoint_importer.t5_importer.apply(ckpt_data)
+     t5_data = checkpoint_importer._maybe_correct_relpos_bias(t5_data)
+     expected = {
+         'target/encoder/relpos_bias/rel_embedding': 1,
+         'target/decoder/relpos_bias/rel_embedding': 2,
+         'state/param_states/decoder/relpos_bias/rel_embedding/v': 3,
+     }
+     self.assertEqual(t5_data, expected)
+
+   def test_rel_embeddings_per_layer(self):
+     # This represents a ckpt where the Mesh TensorFlow's
+     # transformer_layers.SelfAttention.relative_attention_type = "bias", i.e.,
+     # each layer has its own relative attention parameters.
+     ckpt_data = {
+         'encoder/block_000/layer_000/SelfAttention/relative_attention_bias':
+             1,
+         'encoder/block_001/layer_000/SelfAttention/relative_attention_bias':
+             2,
+         'decoder/block_000/layer_000/SelfAttention/relative_attention_bias':
+             3,
+         'decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v':
+             4,
+         'decoder/block_011/layer_000/SelfAttention/relative_attention_bias':
+             5
+     }
+     t5_data = checkpoint_importer.t5_importer.apply(ckpt_data)
+     t5_data = checkpoint_importer._maybe_correct_relpos_bias(t5_data)
+     expected = {
+         'target/encoder/layers_0/relpos_bias/rel_embedding': 1,
+         'target/encoder/layers_1/relpos_bias/rel_embedding': 2,
+         'target/decoder/layers_0/relpos_bias/rel_embedding': 3,
+         'state/param_states/decoder/layers_0/relpos_bias/rel_embedding/v': 4,
+         'target/decoder/layers_11/relpos_bias/rel_embedding': 5,
+     }
+     self.assertEqual(t5_data, expected)
+
+
+ if __name__ == '__main__':
+   absltest.main()
t5x/checkpoint_utils.py ADDED
@@ -0,0 +1,91 @@
+ # Copyright 2022 The T5X Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Checkpoint helper functions for managing checkpoints.
+
+ Supports marking checkpoints as pinned to exclude them from the checkpointer
+ removal process.
+ """
+
+ import os
+
+ from absl import logging
+
+ from tensorflow.io import gfile
+
+ # A PINNED file in the checkpoint directory indicates that the checkpoint
+ # should not be removed during the automatic pruning of old checkpoints.
+ _PINNED_CHECKPOINT_FILENAME = 'PINNED'
+
+
+ def pinned_checkpoint_filepath(ckpt_dir: str) -> str:
+   """Full path of the pinned checkpoint file."""
+   return os.path.join(ckpt_dir, _PINNED_CHECKPOINT_FILENAME)
+
+
+ def is_pinned_checkpoint(ckpt_dir: str) -> bool:
+   """Returns whether the checkpoint is pinned, and should NOT be removed."""
+   pinned_ckpt_file = pinned_checkpoint_filepath(ckpt_dir)
+   if gfile.exists(pinned_ckpt_file):
+     return True
+   return False
+
+
+ def pin_checkpoint(ckpt_dir: str, txt: str = '1') -> None:
+   """Pins a checkpoint so it does not get deleted by the normal pruning process.
+
+   Creates a PINNED file in the checkpoint directory to indicate the checkpoint
+   should be excluded from the deletion of old checkpoints.
+
+   Args:
+     ckpt_dir: The checkpoint step dir that is to be always kept.
+     txt: Text to be written into the checkpoint's PINNED file.
+   """
+   pinned_ckpt_file = pinned_checkpoint_filepath(ckpt_dir)
+   with gfile.GFile(pinned_ckpt_file, 'w') as f:
+     logging.debug('Write %s file : %s.', pinned_ckpt_file, txt)
+     f.write(txt)
+
+
+ def unpin_checkpoint(ckpt_dir: str) -> None:
+   """Removes the pinned status of the checkpoint so it is open for deletion."""
+   if not is_pinned_checkpoint(ckpt_dir):
+     logging.debug('%s is not PINNED. Nothing to do here.', ckpt_dir)
+     return
+   try:
+     pinned_ckpt_file = pinned_checkpoint_filepath(ckpt_dir)
+     logging.debug('Remove %s file.', pinned_ckpt_file)
+     gfile.rmtree(pinned_ckpt_file)
+   except IOError:
+     logging.exception('Failed to unpin %s', ckpt_dir)
+
+
+ def remove_checkpoint_dir(ckpt_dir: str) -> None:
+   """Removes the checkpoint dir if it is not pinned."""
+   if not is_pinned_checkpoint(ckpt_dir):
+     logging.info('Deleting checkpoint: %s', ckpt_dir)
+     gfile.rmtree(ckpt_dir)
+   else:
+     logging.info('Keeping pinned checkpoint: %s', ckpt_dir)
+
+
+ def remove_dataset_checkpoint(ckpt_dir: str, train_ds_prefix: str) -> None:
+   """Removes dataset checkpoints if the checkpoint is not pinned."""
+   if not is_pinned_checkpoint(ckpt_dir):
+     train_ds_pattern = os.path.join(ckpt_dir, train_ds_prefix + '*')
+     logging.info('Deleting dataset checkpoint: %s', train_ds_pattern)
+     for file in gfile.glob(train_ds_pattern):
+       gfile.remove(file)
+   else:
+     logging.info('Keeping pinned checkpoint: %s', ckpt_dir)
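As a rough usage sketch under assumed paths (the step directory below is hypothetical, not part of this commit), a pinned checkpoint survives the pruning helpers while an unpinned one is deleted:

from t5x import checkpoint_utils

ckpt_dir = '/tmp/model_dir/checkpoint_10000'      # hypothetical checkpoint step dir
checkpoint_utils.pin_checkpoint(ckpt_dir)         # writes the PINNED marker file
checkpoint_utils.remove_checkpoint_dir(ckpt_dir)  # kept: directory is pinned
checkpoint_utils.unpin_checkpoint(ckpt_dir)       # removes the PINNED marker
checkpoint_utils.remove_checkpoint_dir(ckpt_dir)  # now deletes the directory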
t5x/checkpoint_utils_test.py ADDED
@@ -0,0 +1,149 @@
+ # Copyright 2022 The T5X Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Tests for t5x.checkpoint_utils."""
+
+ import os
+ import traceback
+
+ from absl.testing import absltest
+ from t5x import checkpoint_utils
+ from tensorflow.io import gfile
+
+ TESTDATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")
+
+
+ class CheckpointsUtilsTest(absltest.TestCase):
+
+   def setUp(self):
+     super().setUp()
+     self.checkpoints_dir = self.create_tempdir()
+     self.ckpt_dir_path = self.checkpoints_dir.full_path
+     self.pinned_ckpt_file = os.path.join(self.ckpt_dir_path, "PINNED")
+     self.checkpoints_dir.create_file("checkpoint")
+     # Create a `train_ds` file representing the dataset checkpoint.
+     train_ds_basename = "train_ds-00000-of-00001"
+     self.train_ds_file = os.path.join(self.ckpt_dir_path, train_ds_basename)
+     self.checkpoints_dir.create_file(train_ds_basename)
+
+   def test_always_keep_checkpoint_file(self):
+     self.assertEqual(
+         "/path/to/ckpt/dir/PINNED",
+         checkpoint_utils.pinned_checkpoint_filepath("/path/to/ckpt/dir"))
+
+   def test_is_pinned_checkpoint_false_by_default(self):
+     # Ensure regular checkpoint without PINNED file.
+     self.assertFalse(gfile.exists(os.path.join(self.ckpt_dir_path, "PINNED")))
+
+     # Validate checkpoints are not pinned by default.
+     self.assertFalse(checkpoint_utils.is_pinned_checkpoint(self.ckpt_dir_path))
+
+   def test_is_pinned_checkpoint(self):
+     # Ensure the testdata checkpoint directory is marked as pinned.
+     pinned_ckpt_testdata = os.path.join(TESTDATA, "pinned_ckpt_dir")
+     pinned_file = os.path.join(pinned_ckpt_testdata, "PINNED")
+     self.assertTrue(gfile.exists(pinned_file))
+
+     # Test and validate.
+     self.assertTrue(checkpoint_utils.is_pinned_checkpoint(pinned_ckpt_testdata))
+
+   def test_is_pinned_missing_ckpt(self):
+     self.assertFalse(
+         checkpoint_utils.is_pinned_checkpoint(
+             os.path.join(self.ckpt_dir_path, "ckpt_does_not_exist")))
+
+   def test_pin_checkpoint(self):
+     # Ensure directory isn't already pinned.
+     self.assertFalse(gfile.exists(self.pinned_ckpt_file))
+
+     # Test.
+     checkpoint_utils.pin_checkpoint(self.ckpt_dir_path)
+
+     # Validate.
+     self.assertTrue(gfile.exists(self.pinned_ckpt_file))
+     with open(self.pinned_ckpt_file) as f:
+       self.assertEqual("1", f.read())
+
+   def test_pin_checkpoint_txt(self):
+     checkpoint_utils.pin_checkpoint(self.ckpt_dir_path, "TEXT_IN_PINNED")
+     self.assertTrue(os.path.exists(os.path.join(self.ckpt_dir_path, "PINNED")))
+     with open(self.pinned_ckpt_file) as f:
+       self.assertEqual("TEXT_IN_PINNED", f.read())
+
+   def test_unpin_checkpoint(self):
+     # Mark the checkpoint directory as pinned.
+     self.checkpoints_dir.create_file("PINNED")
+     self.assertTrue(checkpoint_utils.is_pinned_checkpoint(self.ckpt_dir_path))
+
+     # Test.
+     checkpoint_utils.unpin_checkpoint(self.ckpt_dir_path)
+
+     # Validate the "PINNED" checkpoint file got removed.
+     self.assertFalse(gfile.exists(os.path.join(self.ckpt_dir_path, "PINNED")))
+
+   def test_unpin_checkpoint_does_not_exist(self):
+     missing_ckpt_path = os.path.join(self.ckpt_dir_path, "ckpt_does_not_exist")
+     self.assertFalse(gfile.exists(missing_ckpt_path))
+
+     # Test. Assert does not raise error.
+     try:
+       checkpoint_utils.unpin_checkpoint(missing_ckpt_path)
+     except IOError:
+       # TODO(b/172262005): Remove traceback.format_exc() from the error message.
+       self.fail("Unpin checkpoint failed with: %s" % traceback.format_exc())
+
+   def test_remove_checkpoint_dir(self):
+     # Ensure the checkpoint directory is setup.
+     assert gfile.exists(self.ckpt_dir_path)
+
+     # Test.
+     checkpoint_utils.remove_checkpoint_dir(self.ckpt_dir_path)
+
+     # Validate the checkpoint directory got removed.
+     self.assertFalse(gfile.exists(self.ckpt_dir_path))
+
+   def test_remove_checkpoint_dir_pinned(self):
+     # Mark the checkpoint directory as pinned so it does not get removed.
+     self.checkpoints_dir.create_file("PINNED")
+
+     # Test.
+     checkpoint_utils.remove_checkpoint_dir(self.ckpt_dir_path)
+
+     # Validate the checkpoint directory still exists.
+     self.assertTrue(gfile.exists(self.ckpt_dir_path))
+
+   def test_remove_dataset_checkpoint(self):
+     # Ensure the checkpoint directory is setup.
+     assert gfile.exists(self.ckpt_dir_path)
+
+     # Test.
+     checkpoint_utils.remove_dataset_checkpoint(self.ckpt_dir_path, "train_ds")
+
+     # Validate the dataset checkpoint got removed, but the directory remains.
+     self.assertFalse(gfile.exists(self.train_ds_file))
+     self.assertTrue(gfile.exists(self.ckpt_dir_path))
+
+   def test_remove_dataset_checkpoint_pinned(self):
+     # Mark the checkpoint directory as pinned so it does not get removed.
+     self.checkpoints_dir.create_file("PINNED")
+
+     # Test.
+     checkpoint_utils.remove_dataset_checkpoint(self.ckpt_dir_path, "train_ds")
+
+     # Validate the dataset checkpoint and the directory still exist.
+     self.assertTrue(gfile.exists(self.train_ds_file))
+     self.assertTrue(gfile.exists(self.ckpt_dir_path))
+
+ if __name__ == "__main__":
+   absltest.main()