diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4b34d62ea594d97cf0191a52d12050ce61dddd62 --- /dev/null +++ b/.gitignore @@ -0,0 +1,131 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea/* diff --git a/.models/.gitattributes b/.models/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..ec4a626fbb7799f6a25b45fb86344b2bf7b37e64 --- /dev/null +++ b/.models/.gitattributes @@ -0,0 +1 @@ +*.pth filter=lfs diff=lfs merge=lfs -text diff --git a/.models/autoregressive.pth b/.models/autoregressive.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a2459b466008daaf2daa0477056dfc59232a39c --- /dev/null +++ b/.models/autoregressive.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c0c801fc6a7841d87e055e42c254c1f27703647af719d78f9e1002e90386db +size 1716988501 diff --git a/.models/classifier.pth b/.models/classifier.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d2a4de92f2739d340bd9f2f21e5758c9ff38562 --- /dev/null +++ b/.models/classifier.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ab946010be0a963b5039e8fca74bbb8a6eebcf366c761db21ae7e94cd6ada3 +size 60938957 diff --git a/.models/clvp2.pth b/.models/clvp2.pth new file mode 100644 index 0000000000000000000000000000000000000000..20889175364b1f45c31b2bd90ace1fa3e3e56c96 --- /dev/null +++ b/.models/clvp2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6097e708cf692eb93bd770880660953935e87e8995eb864819bbe51b7d91342c +size 975620731 diff --git a/.models/cvvp.pth b/.models/cvvp.pth new file 
mode 100644 index 0000000000000000000000000000000000000000..f79ec235416408adefb20513303fe72f905e6a69 --- /dev/null +++ b/.models/cvvp.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d050e32592ad4a318e03a4f99b09c9c26baf68d78a9d7503ff2bc3883e897100 +size 151223901 diff --git a/.models/diffusion_decoder.pth b/.models/diffusion_decoder.pth new file mode 100644 index 0000000000000000000000000000000000000000..8022931cbba56260c1c1b79a16fc0596bc5316fd --- /dev/null +++ b/.models/diffusion_decoder.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e089edfbbd73a7a23180068c86676400ff83809651070581302ef500d4bca99 +size 1169472627 diff --git a/.models/rlg_auto.pth b/.models/rlg_auto.pth new file mode 100644 index 0000000000000000000000000000000000000000..63c7b503388d11b0c265d12ce257945bb408c642 --- /dev/null +++ b/.models/rlg_auto.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4473c125482e2a3322a5ea762025a0c6ec657955c3002cf099c0635d79967551 +size 25193729 diff --git a/.models/rlg_diffuser.pth b/.models/rlg_diffuser.pth new file mode 100644 index 0000000000000000000000000000000000000000..78b6d713fe2ad526675d18f0f8f326f574ac798e --- /dev/null +++ b/.models/rlg_diffuser.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e84b1ce60631c56dc8dec3d27c131993dd99d3060e7919cc351857457dbfdac +size 100715777 diff --git a/.models/vocoder.pth b/.models/vocoder.pth new file mode 100644 index 0000000000000000000000000000000000000000..d7e52501e80b7bcb058be79b18cf73243d71a848 --- /dev/null +++ b/.models/vocoder.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e8153e9f8ffb00b116f7f67833df2802fcf81e6bc173acc3b3b4bf9f04189d +size 391384715 diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..e155703e1460a49886d2e3dfda1962993f07184b --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,10 @@ +cff-version: 1.3.0 +message: "If you use this software, please cite it as below." +authors: +- family-names: "Betker" + given-names: "James" + orcid: "https://orcid.org/my-orcid?orcid=0000-0003-3259-4862" +title: "TorToiSe text-to-speech" +version: 2.0 +date-released: 2022-04-28 +url: "https://github.com/neonbjb/tortoise-tts" \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..15f8d32d1bff9bac03e9fc423d91b5a043b6c723 --- /dev/null +++ b/README.md @@ -0,0 +1,260 @@ +# TorToiSe + +Tortoise is a text-to-speech program built with the following priorities: + +1. Strong multi-voice capabilities. +2. Highly realistic prosody and intonation. 
+ +This repo contains all the code needed to run Tortoise TTS in inference mode. + +### New features + +#### v2.1; 2022/5/2 +- Added ability to produce totally random voices. +- Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent. +- Added ability to use your own pretrained models. +- Refactored directory structures. +- Performance improvements & bug fixes. + +## What's in a name? + +I'm naming my speech-related repos after Mojave desert flora and fauna. Tortoise is a bit tongue in cheek: this model +is insanely slow. It leverages both an autoregressive decoder **and** a diffusion decoder; both known for their low +sampling rates. On a K80, expect to generate a medium sized sentence every 2 minutes. + +## Demos + +See [this page](http://nonint.com/static/tortoise_v2_examples.html) for a large list of example outputs. + +## Usage guide + +### Colab + +Colab is the easiest way to try this out. I've put together a notebook you can use here: +https://colab.research.google.com/drive/1wVVqUPqwiDBUVeWWOUNglpGhU3hg_cbR?usp=sharing + +### Installation + +If you want to use this on your own computer, you must have an NVIDIA GPU. First, install pytorch using these +instructions: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) + +Then: + +```shell +git clone https://github.com/neonbjb/tortoise-tts.git +cd tortoise-tts +python setup.py install +``` + +### do_tts.py + +This script allows you to speak a single phrase with one or more voices. +```shell +python tortoise/do_tts.py --text "I'm going to speak this" --voice random --preset fast +``` + +### read.py + +This script provides tools for reading large amounts of text. + +```shell +python tortoise/read.py --textfile --voice random +``` + +This will break up the textfile into sentences, and then convert them to speech one at a time. It will output a series +of spoken clips as they are generated. Once all the clips are generated, it will combine them into a single file and +output that as well. + +Sometimes Tortoise screws up an output. You can re-generate any bad clips by re-running `read.py` with the --regenerate +argument. + +### API + +Tortoise can be used programmatically, like so: + +```python +reference_clips = [utils.audio.load_audio(p, 22050) for p in clips_paths] +tts = api.TextToSpeech() +pcm_audio = tts.tts_with_preset("your text here", reference_clips, preset='fast') +``` + +## Voice customization guide + +Tortoise was specifically trained to be a multi-speaker model. It accomplishes this by consulting reference clips. + +These reference clips are recordings of a speaker that you provide to guide speech generation. These clips are used to determine many properties of the output, such as the pitch and tone of the voice, speaking speed, and even speaking defects like a lisp or stuttering. The reference clip is also used to determine non-voice related aspects of the audio output like volume, background noise, recording quality and reverb. + +### Random voice + +I've included a feature which randomly generates a voice. These voices don't actually exist and will be random every time you run +it. The results are quite fascinating and I recommend you play around with it! + +You can use the random voice by passing in 'random' as the voice name. Tortoise will take care of the rest. + +For the those in the ML space: this is created by projecting a random vector onto the voice conditioning latent space. 
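+As a rough illustration of that idea, the sketch below samples a random vector and projects it into a stand-in latent space. The dimensions and the projection layer here are made-up placeholders rather than Tortoise's actual internals; in normal use you simply pass 'random' as the voice name and let the packaged scripts handle it.
+
+```python
+import torch
+import torch.nn as nn
+
+# Conceptual sketch only: the dimensions and the Linear projection are hypothetical
+# placeholders for Tortoise's learned conditioning machinery, not its real internals.
+noise_dim, latent_dim = 512, 1024
+projection = nn.Linear(noise_dim, latent_dim)    # stand-in for the learned projection
+random_vector = torch.randn(1, noise_dim)        # isotropic Gaussian sample
+random_voice_latent = projection(random_vector)  # a "voice" that belongs to no real speaker
+```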
+ +### Provided voices + +This repo comes with several pre-packaged voices. You will be familiar with many of them. :) + +Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set +produce more realistic outputs than those outside of the training set. Any voice prepended with "train" came from the +training set. + +### Adding a new voice + +To add new voices to Tortoise, you will need to do the following: + +1. Gather audio clips of your speaker(s). Good sources are YouTube interviews (you can use youtube-dl to fetch the audio), audiobooks or podcasts. Guidelines for good clips are in the next section. +2. Cut your clips into ~10 second segments. You want at least 3 clips. More is better, but I only experimented with up to 5 in my testing. +3. Save the clips as WAV files with floating point format and a 22,050 sample rate. +4. Create a subdirectory in voices/ +5. Put your clips in that subdirectory. +6. Run tortoise utilities with --voice=. + +### Picking good reference clips + +As mentioned above, your reference clips have a profound impact on the output of Tortoise. Following are some tips for picking +good clips: + +1. Avoid clips with background music, noise or reverb. These clips were removed from the training dataset. Tortoise is unlikely to do well with them. +2. Avoid speeches. These generally have distortion caused by the amplification system. +3. Avoid clips from phone calls. +4. Avoid clips that have excessive stuttering, stammering or words like "uh" or "like" in them. +5. Try to find clips that are spoken in the way you wish your output to sound. For example, if you want to hear your target voice read an audiobook, try to find clips of them reading a book. +6. The text being spoken in the clips does not matter, but diverse text does seem to perform better. + +## Advanced Usage + +### Generation settings + +Tortoise is primarily an autoregressive decoder model combined with a diffusion model. Both of these have a lot of knobs +that can be turned, which I've abstracted away for the sake of ease of use. I did this by generating thousands of clips using +various permutations of the settings and using a metric for voice realism and intelligibility to measure their effects. I've +set the defaults to the best overall settings I was able to find. For specific use-cases, it might be effective to play with +these settings (and it's very likely that I missed something!). + +These settings are not available in the normal scripts packaged with Tortoise. They are available, however, in the API. See +```api.tts``` for a full list. + +### Prompt engineering + +Some people have discovered that it is possible to do prompt engineering with Tortoise! For example, you can evoke emotion +by including things like "I am really sad," before your text. I've built an automated redaction system that you can use to +take advantage of this. It works by attempting to redact any text in the prompt surrounded by brackets. For example, the +prompt "\[I am really sad,\] Please feed me." will only speak the words "Please feed me" (with a sad tonality). + +### Playing with the voice latent + +Tortoise ingests reference clips by feeding them individually through a small submodel that produces a point latent, +then taking the mean of all of the produced latents. The experimentation I have done has indicated that these point latents +are quite expressive, affecting everything from tone to speaking rate to speech abnormalities.
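+To make that averaging step concrete, here is a minimal sketch of the idea. `encode_clip` is a hypothetical stand-in for the small conditioning submodel and is not part of the actual Tortoise API.
+
+```python
+import torch
+
+def average_voice_latent(reference_clips, encode_clip):
+    """Collapse per-clip point latents into a single voice latent.
+
+    `encode_clip` is a hypothetical placeholder for the conditioning submodel
+    described above: it maps one reference clip (a waveform tensor) to one point latent.
+    """
+    point_latents = [encode_clip(clip) for clip in reference_clips]  # one latent per clip
+    return torch.stack(point_latents, dim=0).mean(dim=0)             # mean over all clips
+```
+
+Because the final voice is just a mean in latent space, clips from different speakers can be mixed freely.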
+ +This lends itself to some neat tricks. For example, you can feed two different voices to Tortoise and it will output +what it thinks the "average" of those two voices sounds like. + +#### Generating conditioning latents from voices + +Use the script `get_conditioning_latents.py` to extract conditioning latents for a voice you have installed. This script +will dump the latents to a .pth pickle file. The file will contain a single tuple, (autoregressive_latent, diffusion_latent). + +Alternatively, use the api.TextToSpeech.get_conditioning_latents() method to fetch the latents. + +#### Using raw conditioning latents to generate speech + +After you've played with them, you can use them to generate speech by creating a subdirectory in voices/ with a single +".pth" file containing the pickled conditioning latents as a tuple (autoregressive_latent, diffusion_latent). A minimal sketch of this layout appears near the end of this README. + +### Send me feedback! + +Probabilistic models like Tortoise are best thought of as an "augmented search" - in this case, through the space of possible +utterances of a specific string of text. The impact of community involvement in perusing these spaces (such as is being done with +GPT-3 or CLIP) has really surprised me. If you find something neat that you can do with Tortoise that isn't documented here, +please report it to me! I would be glad to publish it to this page. + +## Tortoise-detect + +Out of concerns that this model might be misused, I've built a classifier that tells the likelihood that an audio clip +came from Tortoise. + +This classifier can be run on any computer; usage is as follows: + +```commandline +python tortoise/is_this_from_tortoise.py --clip= +``` + +This model has 100% accuracy on the contents of the results/ and voices/ folders in this repo. Still, treat this classifier +as a "strong signal". Classifiers can be fooled and it is likewise not impossible for this classifier to exhibit false +positives. + +## Model architecture + +Tortoise TTS is inspired by OpenAI's DALLE, applied to speech data and using a better decoder. It is made up of 5 separate +models that work together. I've assembled a write-up of the system architecture here: +[https://nonint.com/2022/04/25/tortoise-architectural-design-doc/](https://nonint.com/2022/04/25/tortoise-architectural-design-doc/) + +## Training + +These models were trained on my "homelab" server with 8 RTX 3090s over the course of several months. They were trained on a dataset consisting of +~50k hours of speech data, most of which was transcribed by [ocotillo](http://www.github.com/neonbjb/ocotillo). Training was done on my own +[DLAS](https://github.com/neonbjb/DL-Art-School) trainer. + +I currently do not have plans to release the training configurations or methodology. See the next section. + +## Ethical Considerations + +Tortoise v2 works considerably better than I had planned. When I began hearing some of the outputs of the last few versions, I began +wondering whether or not I had an ethically unsound project on my hands. The ways in which a voice-cloning text-to-speech system +could be misused are many. It doesn't take much creativity to think up how. + +After some thought, I have decided to go forward with releasing this. Following are the reasons for this choice: + +1. It is primarily good at reading books and speaking poetry. Other forms of speech do not work well. +2. It was trained on a dataset which does not have the voices of public figures.
While it will attempt to mimic these voices if they are provided as references, it does not do so in such a way that most humans would be fooled. +3. The above points could likely be resolved by scaling up the model and the dataset. For this reason, I am currently withholding details on how I trained the model, pending community feedback. +4. I am releasing a separate classifier model which will tell you whether a given audio clip was generated by Tortoise or not. See `tortoise-detect` above. +5. If I, a tinkerer with a BS in computer science and a ~$15k computer, can build this, then any motivated corporation or state can as well. I would prefer that it be in the open and that everyone know the kinds of things ML can do. + +### Diversity + +The diversity expressed by ML models is strongly tied to the datasets they were trained on. + +Tortoise was trained primarily on a dataset consisting of audiobooks. I made no effort to +balance diversity in this dataset. For this reason, Tortoise will be particularly poor at generating the voices of minorities +or of people who speak with strong accents. + +## Looking forward + +Tortoise v2 is about as good as I think I can do in the TTS world with the resources I have access to. A phenomenon that happens when +training very large models is that as parameter count increases, the communication bandwidth needed to support distributed training +of the model increases multiplicatively. On enterprise-grade hardware, this is not an issue: GPUs are attached together with +exceptionally wide buses that can accommodate this bandwidth. I cannot afford enterprise hardware, though, so I am stuck. + +I want to mention here +that I think Tortoise could be a **lot** better. The three major components of Tortoise are either vanilla Transformer Encoder stacks +or Decoder stacks. Both of these types of models have a rich experimental history with scaling in the NLP realm. I see no reason +to believe that the same is not true of TTS. + +The largest model in Tortoise v2 is considerably smaller than GPT-2 large. It is 20x smaller than the original DALLE transformer. +Imagine what a TTS model trained at or near GPT-3 or DALLE scale could achieve. + +If you are an ethical organization with computational resources to spare and an interest in seeing what this model could do +if properly scaled out, please reach out to me! I would love to collaborate on this. + +## Acknowledgements + +This project has garnered more praise than I expected. I am standing on the shoulders of giants, though, and I want to +credit a few of the amazing folks in the community that have helped make this happen: + +- Hugging Face, who wrote the GPT model and the generate API used by Tortoise, and who hosts the model weights. +- [Ramesh et al](https://arxiv.org/pdf/2102.12092.pdf) who authored the DALLE paper, which is the inspiration behind Tortoise. +- [Nichol and Dhariwal](https://arxiv.org/pdf/2102.09672.pdf) who authored the revision of the code that drives the diffusion model. +- [Jang et al](https://arxiv.org/pdf/2106.07889.pdf) who developed and open-sourced univnet, the vocoder this repo uses. +- [lucidrains](https://github.com/lucidrains) who writes awesome open source pytorch models, many of which are used here. +- [Patrick von Platen](https://huggingface.co/patrickvonplaten) whose guides on setting up wav2vec were invaluable to building my dataset. + +## Notice + +Tortoise was built entirely by me using my own hardware. My employer was not involved in any facet of Tortoise's development.
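+Following up on the "Using raw conditioning latents to generate speech" section above, here is a minimal sketch of that directory layout. The voice name and tensor shapes below are placeholders, not values taken from this repo; use the tuple produced by `get_conditioning_latents.py` rather than random tensors.
+
+```python
+import os
+import torch
+
+# Placeholder latents: substitute the (autoregressive_latent, diffusion_latent) tuple
+# dumped by get_conditioning_latents.py or api.TextToSpeech.get_conditioning_latents().
+autoregressive_latent = torch.randn(1, 1024)   # hypothetical shape
+diffusion_latent = torch.randn(1, 2048)        # hypothetical shape
+
+voice_dir = os.path.join('voices', 'my_custom_voice')  # any new subdirectory under voices/
+os.makedirs(voice_dir, exist_ok=True)
+torch.save((autoregressive_latent, diffusion_latent),
+           os.path.join(voice_dir, 'latents.pth'))      # a single .pth file holding the tuple
+```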
+ +If you use this repo or the ideas therein for your research, please cite it! A bibtex entree can be found in the right pane on GitHub. \ No newline at end of file diff --git a/api.py b/api.py new file mode 100644 index 0000000000000000000000000000000000000000..6aa94cf693effabbd192db1e6f6fd3e2dc546702 --- /dev/null +++ b/api.py @@ -0,0 +1,373 @@ +import argparse +import os +import random +from urllib import request + +import torch +import torch.nn.functional as F +import progressbar +import torchaudio + +from models.classifier import AudioMiniEncoderWithClassifierHead +from models.cvvp import CVVP +from models.diffusion_decoder import DiffusionTts +from models.autoregressive import UnifiedVoice +from tqdm import tqdm + +from models.arch_util import TorchMelSpectrogram +from models.clvp import CLVP +from models.vocoder import UnivNetGenerator +from utils.audio import load_audio, wav_to_univnet_mel, denormalize_tacotron_mel +from utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule +from utils.tokenizer import VoiceBpeTokenizer, lev_distance + + +pbar = None + + +def download_models(specific_models=None): + """ + Call to download all the models that Tortoise uses. + """ + MODELS = { + 'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth', + 'classifier.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/classifier.pth', + 'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth', + 'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/cvvp.pth', + 'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/diffusion_decoder.pth', + 'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/vocoder.pth', + } + os.makedirs('.models', exist_ok=True) + def show_progress(block_num, block_size, total_size): + global pbar + if pbar is None: + pbar = progressbar.ProgressBar(maxval=total_size) + pbar.start() + + downloaded = block_num * block_size + if downloaded < total_size: + pbar.update(downloaded) + else: + pbar.finish() + pbar = None + for model_name, url in MODELS.items(): + if specific_models is not None and model_name not in specific_models: + continue + if os.path.exists(f'.models/{model_name}'): + continue + print(f'Downloading {model_name} from {url}...') + request.urlretrieve(url, f'.models/{model_name}', show_progress) + print('Done.') + + +def pad_or_truncate(t, length): + """ + Utility function for forcing to have the specified sequence length, whether by clipping it or padding it with 0s. + """ + if t.shape[-1] == length: + return t + elif t.shape[-1] < length: + return F.pad(t, (0, length-t.shape[-1])) + else: + return t[..., :length] + + +def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True, cond_free_k=1): + """ + Helper function to load a GaussianDiffusion instance configured for use as a vocoder. + """ + return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), model_mean_type='epsilon', + model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps), + conditioning_free=cond_free, conditioning_free_k=cond_free_k) + + +def format_conditioning(clip, cond_length=132300): + """ + Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models. 
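+    Clips shorter than cond_length (132,300 samples, i.e. six seconds at 22,050 Hz) are zero-padded;
+    clips longer than that are randomly cropped to the target length before the mel spectrogram is computed.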
+ """ + gap = clip.shape[-1] - cond_length + if gap < 0: + clip = F.pad(clip, pad=(0, abs(gap))) + elif gap > 0: + rand_start = random.randint(0, gap) + clip = clip[:, rand_start:rand_start + cond_length] + mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0) + return mel_clip.unsqueeze(0).cuda() + + +def fix_autoregressive_output(codes, stop_token, complain=True): + """ + This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was + trained on and what the autoregressive code generator creates (which has no padding or end). + This is highly specific to the DVAE being used, so this particular coding will not necessarily work if used with + a different DVAE. This can be inferred by feeding a audio clip padded with lots of zeros on the end through the DVAE + and copying out the last few codes. + + Failing to do this padding will produce speech with a harsh end that sounds like "BLAH" or similar. + """ + # Strip off the autoregressive stop token and add padding. + stop_token_indices = (codes == stop_token).nonzero() + if len(stop_token_indices) == 0: + if complain: + print("No stop tokens found, enjoy that output of yours!") + return codes + else: + codes[stop_token_indices] = 83 + stm = stop_token_indices.min().item() + codes[stm:] = 83 + if stm - 3 < codes.shape[0]: + codes[-3] = 45 + codes[-2] = 45 + codes[-1] = 248 + + return codes + + +def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_samples, temperature=1, verbose=True): + """ + Uses the specified diffusion model to convert discrete codes into a spectrogram. + """ + with torch.no_grad(): + cond_mels = [] + for sample in conditioning_samples: + # The diffuser operates at a sample rate of 24000 (except for the latent inputs) + sample = torchaudio.functional.resample(sample, 22050, 24000) + sample = pad_or_truncate(sample, 102400) + cond_mel = wav_to_univnet_mel(sample.to(latents.device), do_normalization=False) + cond_mels.append(cond_mel) + cond_mels = torch.stack(cond_mels, dim=1) + + output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. + output_shape = (latents.shape[0], 100, output_seq_len) + precomputed_embeddings = diffusion_model.timestep_independent(latents, cond_mels, output_seq_len, False) + + noise = torch.randn(output_shape, device=latents.device) * temperature + mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise, + model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, + progress=verbose) + return denormalize_tacotron_mel(mel)[:,:,:output_seq_len] + + +def classify_audio_clip(clip): + """ + Returns whether or not Tortoises' classifier thinks the given clip came from Tortoise. + :param clip: torch tensor containing audio waveform data (get it from load_audio) + :return: True if the clip was classified as coming from Tortoise and false if it was classified as real. 
+ """ + download_models(['classifier.pth']) + classifier = AudioMiniEncoderWithClassifierHead(2, spec_dim=1, embedding_dim=512, depth=5, downsample_factor=4, + resnet_blocks=2, attn_blocks=4, num_attn_heads=4, base_channels=32, + dropout=0, kernel_size=5, distribute_zero_label=False) + classifier.load_state_dict(torch.load('.models/classifier.pth', map_location=torch.device('cpu'))) + clip = clip.cpu().unsqueeze(0) + results = F.softmax(classifier(clip), dim=-1) + return results[0][0] + + +class TextToSpeech: + """ + Main entry point into Tortoise. + :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing + GPU OOM errors. Larger numbers generates slightly faster. + """ + def __init__(self, autoregressive_batch_size=16): + self.autoregressive_batch_size = autoregressive_batch_size + self.tokenizer = VoiceBpeTokenizer() + download_models() + + self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30, + model_dim=1024, + heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False, + train_solo_embeddings=False, + average_conditioning_embeddings=True).cpu().eval() + self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth')) + + self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12, + text_seq_len=350, text_heads=8, + num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430, + use_xformers=True).cpu().eval() + self.clvp.load_state_dict(torch.load('.models/clvp.pth')) + + self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0, + speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval() + self.cvvp.load_state_dict(torch.load('.models/cvvp.pth')) + + self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200, + in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16, + layer_drop=0, unconditioned_percentage=0).cpu().eval() + self.diffusion.load_state_dict(torch.load('.models/diffusion_decoder.pth')) + + self.vocoder = UnivNetGenerator().cpu() + self.vocoder.load_state_dict(torch.load('.models/vocoder.pth')['model_g']) + self.vocoder.eval(inference=True) + + def tts_with_preset(self, text, voice_samples, preset='fast', **kwargs): + """ + Calls TTS with one of a set of preset generation parameters. Options: + 'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest). + 'fast': Decent quality speech at a decent inference rate. A good choice for mass inference. + 'standard': Very good quality. This is generally about as good as you are going to get. + 'high_quality': Use if you want the absolute best. This is not really worth the compute, though. + """ + # Use generally found best tuning knobs for generation. + kwargs.update({'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0, + #'typical_sampling': True, + 'top_p': .8, + 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}) + # Presets are defined here. 
+ presets = { + 'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False}, + 'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32}, + 'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128}, + 'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024}, + } + kwargs.update(presets[preset]) + return self.tts(text, voice_samples, **kwargs) + + def tts(self, text, voice_samples, k=1, verbose=True, + # autoregressive generation parameters follow + num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500, + typical_sampling=False, typical_mass=.9, + # CLVP & CVVP parameters + clvp_cvvp_slider=.5, + # diffusion generation parameters follow + diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0, + **hf_generate_kwargs): + """ + Produces an audio clip of the given text being spoken with the given reference voice. + :param text: Text to be spoken. + :param voice_samples: List of 2 or more ~10 second reference clips which should be torch tensors containing 22.05kHz waveform data. + :param k: The number of returned clips. The most likely (as determined by Tortoises' CLVP and CVVP models) clips are returned. + :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true. + ~~AUTOREGRESSIVE KNOBS~~ + :param num_autoregressive_samples: Number of samples taken from the autoregressive model, all of which are filtered using CLVP+CVVP. + As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great". + :param temperature: The softmax temperature of the autoregressive model. + :param length_penalty: A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs. + :param repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence + of long silences or "uhhhhhhs", etc. + :param top_p: P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely" (aka boring) outputs. + :param max_mel_tokens: Restricts the output length. (0,600] integer. Each unit is 1/20 of a second. + :param typical_sampling: Turns typical sampling on or off. This sampling mode is discussed in this paper: https://arxiv.org/abs/2202.00666 + I was interested in the premise, but the results were not as good as I was hoping. This is off by default, but + could use some tuning. + :param typical_mass: The typical_mass parameter from the typical_sampling algorithm. + ~~CLVP-CVVP KNOBS~~ + :param clvp_cvvp_slider: Controls the influence of the CLVP and CVVP models in selecting the best output from the autoregressive model. + [0,1]. Values closer to 1 will cause Tortoise to emit clips that follow the text more. Values closer to + 0 will cause Tortoise to emit clips that more closely follow the reference clip (e.g. the voice sounds more + similar). + ~~DIFFUSION KNOBS~~ + :param diffusion_iterations: Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine + the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, + however. + :param cond_free: Whether or not to perform conditioning-free diffusion. 
Conditioning-free diffusion performs two forward passes for + each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output + of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and + dramatically improves realism. + :param cond_free_k: Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf]. + As cond_free_k increases, the output becomes dominated by the conditioning-free signal. + Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k + :param diffusion_temperature: Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 + are the "mean" prediction of the diffusion network and will sound bland and smeared. + ~~OTHER STUFF~~ + :param hf_generate_kwargs: The huggingface Transformers generate API is used for the autoregressive transformer. + Extra keyword args fed to this function get forwarded directly to that API. Documentation + here: https://huggingface.co/docs/transformers/internal/generation_utils + :return: Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length. + Sample rate is 24kHz. + """ + text = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda() + text = F.pad(text, (0, 1)) # This may not be necessary. + + conds = [] + if not isinstance(voice_samples, list): + voice_samples = [voice_samples] + for vs in voice_samples: + conds.append(format_conditioning(vs)) + conds = torch.stack(conds, dim=1) + + diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k) + + with torch.no_grad(): + samples = [] + num_batches = num_autoregressive_samples // self.autoregressive_batch_size + stop_mel_token = self.autoregressive.stop_mel_token + calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output" + self.autoregressive = self.autoregressive.cuda() + if verbose: + print("Generating autoregressive samples..") + for b in tqdm(range(num_batches), disable=not verbose): + codes = self.autoregressive.inference_speech(conds, text, + do_sample=True, + top_p=top_p, + temperature=temperature, + num_return_sequences=self.autoregressive_batch_size, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + max_generate_length=max_mel_tokens, + **hf_generate_kwargs) + padding_needed = max_mel_tokens - codes.shape[1] + codes = F.pad(codes, (0, padding_needed), value=stop_mel_token) + samples.append(codes) + self.autoregressive = self.autoregressive.cpu() + + clip_results = [] + self.clvp = self.clvp.cuda() + self.cvvp = self.cvvp.cuda() + if verbose: + print("Computing best candidates using CLVP and CVVP") + for batch in tqdm(samples, disable=not verbose): + for i in range(batch.shape[0]): + batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) + clvp = self.clvp(text.repeat(batch.shape[0], 1), batch, return_loss=False) + cvvp_accumulator = 0 + for cl in range(conds.shape[1]): + cvvp_accumulator = cvvp_accumulator + self.cvvp(conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False ) + cvvp = cvvp_accumulator / conds.shape[1] + clip_results.append(clvp * clvp_cvvp_slider + cvvp * (1-clvp_cvvp_slider)) + clip_results = torch.cat(clip_results, dim=0) + samples = torch.cat(samples, dim=0) + best_results = samples[torch.topk(clip_results, k=k).indices] + self.clvp 
= self.clvp.cpu() + self.cvvp = self.cvvp.cpu() + del samples + + # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning + # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these + # results, but will increase memory usage. + self.autoregressive = self.autoregressive.cuda() + best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results, + torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device), + return_latent=True, clip_inputs=False) + self.autoregressive = self.autoregressive.cpu() + + if verbose: + print("Transforming autoregressive outputs into audio..") + wav_candidates = [] + self.diffusion = self.diffusion.cuda() + self.vocoder = self.vocoder.cuda() + for b in range(best_results.shape[0]): + codes = best_results[b].unsqueeze(0) + latents = best_latents[b].unsqueeze(0) + + # Find the first occurrence of the "calm" token and trim the codes to that. + ctokens = 0 + for k in range(codes.shape[-1]): + if codes[0, k] == calm_token: + ctokens += 1 + else: + ctokens = 0 + if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech. + latents = latents[:, :k] + break + + mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, voice_samples, temperature=diffusion_temperature, verbose=verbose) + wav = self.vocoder.inference(mel) + wav_candidates.append(wav.cpu()) + self.diffusion = self.diffusion.cpu() + self.vocoder = self.vocoder.cpu() + + if len(wav_candidates) > 1: + return wav_candidates + return wav_candidates[0] diff --git a/data/mel_norms.pth b/data/mel_norms.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed4d6e4f71fba223d920da25f1bbd0c8619433b5 --- /dev/null +++ b/data/mel_norms.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f69422a8a8f344c4fca2f0c6b8d41d2151d6615b7321e48e6bb15ae949b119c +size 1067 diff --git a/data/riding_hood.txt b/data/riding_hood.txt new file mode 100644 index 0000000000000000000000000000000000000000..2987bef78f92ecb327fc0f754b7ab1211a18542b --- /dev/null +++ b/data/riding_hood.txt @@ -0,0 +1,54 @@ +Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her. It suited the girl so extremely well that everybody called her Little Red Riding Hood. +One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter." + +Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. + +As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest. He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." + +"Does she live far off?" said the wolf + +"Oh I say," answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." + +"Well," said the wolf, "and I'll go and see her too. 
I'll go this way and go you that, and we shall see who will be there first." + +The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers. It was not long before the wolf arrived at the old woman's house. He knocked at the door: tap, tap. + +"Who's there?" + +"Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother." + +The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up." + +The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten. He then shut the door and got into the grandmother's bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. + +"Who's there?" + +Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you." + +The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." + +Little Red Riding Hood pulled the bobbin, and the door opened. + +The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." + +Little Red Riding Hood took off her clothes and got into bed. She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" + +"All the better to hug you with, my dear." + +"Grandmother, what big legs you have!" + +"All the better to run with, my child." + +"Grandmother, what big ears you have!" + +"All the better to hear with, my child." + +"Grandmother, what big eyes you have!" + +"All the better to see with, my child." + +"Grandmother, what big teeth you have got!" + +"All the better to eat you up with." + +And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up. 
\ No newline at end of file diff --git a/data/tokenizer.json b/data/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a128f273053e465a15c488e48d8106e0c8b0898e --- /dev/null +++ b/data/tokenizer.json @@ -0,0 +1 @@ +{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h 
is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} \ No newline at end of file diff --git a/do_tts.py b/do_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0347e64c587786a90eeb053f7efb388f323bf9 --- /dev/null +++ b/do_tts.py @@ -0,0 +1,34 @@ +import argparse +import os + +import torchaudio + +from api import TextToSpeech +from utils.audio import load_audio, get_voices + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.") + parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) ' + 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat') + parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard') + parser.add_argument('--voice_diversity_intelligibility_slider', type=float, + help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 
0 means highly diverse voice (not recommended), 1 means maximize intelligibility', + default=.5) + parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/') + args = parser.parse_args() + os.makedirs(args.output_path, exist_ok=True) + + tts = TextToSpeech() + + voices = get_voices() + selected_voices = args.voice.split(',') + for voice in selected_voices: + cond_paths = voices[voice] + conds = [] + for cond_path in cond_paths: + c = load_audio(cond_path, 22050) + conds.append(c) + gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider) + torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000) + diff --git a/eval_multiple.py b/eval_multiple.py new file mode 100644 index 0000000000000000000000000000000000000000..9defa525e790a0a53ceff9940ffe5a6cda228d79 --- /dev/null +++ b/eval_multiple.py @@ -0,0 +1,38 @@ +import os + +import torchaudio + +from api import TextToSpeech +from utils.audio import load_audio + +if __name__ == '__main__': + fname = 'Y:\\clips\\books2\\subset512-oco.tsv' + stop_after = 128 + outpath_base = 'D:\\tmp\\tortoise-tts-eval\\audiobooks' + outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real' + + os.makedirs(outpath_real, exist_ok=True) + with open(fname, 'r', encoding='utf-8') as f: + lines = [l.strip().split('\t') for l in f.readlines()] + + tts = TextToSpeech() + for k in range(3): + outpath = f'{outpath_base}_{k}' + os.makedirs(outpath, exist_ok=True) + recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8') + for e, line in enumerate(lines): + if e >= stop_after: + break + transcript = line[0] + path = os.path.join(os.path.dirname(fname), line[1]) + cond_audio = load_audio(path, 22050) + torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050) + sample = tts.tts_with_preset(transcript, [cond_audio, cond_audio], preset='standard') + + down = torchaudio.functional.resample(sample, 24000, 22050) + fout_path = os.path.join(outpath, os.path.basename(line[1])) + torchaudio.save(fout_path, down.squeeze(0), 22050) + + recorder.write(f'{transcript}\t{fout_path}\n') + recorder.flush() + recorder.close() \ No newline at end of file diff --git a/examples/.gitattributes b/examples/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..e935e3907d228c22ed6b3f5217c05fb7a7c2d938 --- /dev/null +++ b/examples/.gitattributes @@ -0,0 +1,5 @@ +favorite_riding_hood.mp3 filter=lfs diff=lfs merge=lfs -text +favorites filter=lfs diff=lfs merge=lfs -text +riding_hood filter=lfs diff=lfs merge=lfs -text +tacotron_comparison filter=lfs diff=lfs merge=lfs -text +various filter=lfs diff=lfs merge=lfs -text diff --git a/examples/favorites/atkins_mha.mp3 b/examples/favorites/atkins_mha.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..63c2eb9d687927e1e67202e5066346502725fddd Binary files /dev/null and b/examples/favorites/atkins_mha.mp3 differ diff --git a/examples/favorites/atkins_omicron.mp3 b/examples/favorites/atkins_omicron.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f105eb07663e304d153b4406eeadd1bb060e1872 Binary files /dev/null and b/examples/favorites/atkins_omicron.mp3 differ diff --git a/examples/favorites/atkins_value.mp3 b/examples/favorites/atkins_value.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a7ae9ee50ebac567c284806d43a0b54b5444b456 Binary files /dev/null and 
b/examples/favorites/atkins_value.mp3 differ diff --git a/examples/favorites/daniel_craig_dumbledore.mp3 b/examples/favorites/daniel_craig_dumbledore.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3429af991cec6d828021f35b0367f5cf4f3744fa Binary files /dev/null and b/examples/favorites/daniel_craig_dumbledore.mp3 differ diff --git a/examples/favorites/daniel_craig_training_ethics.mp3 b/examples/favorites/daniel_craig_training_ethics.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2fa81b20e0e0358f1a66fdd8731e4b5b452a6034 Binary files /dev/null and b/examples/favorites/daniel_craig_training_ethics.mp3 differ diff --git a/examples/favorites/dotrice_stop_for_death.mp3 b/examples/favorites/dotrice_stop_for_death.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..02a13febb1ecaec43a7d2e87871df44b40efdcf6 Binary files /dev/null and b/examples/favorites/dotrice_stop_for_death.mp3 differ diff --git a/examples/favorites/emma_stone_courage.mp3 b/examples/favorites/emma_stone_courage.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5b86d18b8902499570a9c23cb6b0a3163784cfe5 Binary files /dev/null and b/examples/favorites/emma_stone_courage.mp3 differ diff --git a/examples/favorites/emma_stone_training_ethics.mp3 b/examples/favorites/emma_stone_training_ethics.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ef74dda3ab91cbc40c989058dc88b8502bab56b7 Binary files /dev/null and b/examples/favorites/emma_stone_training_ethics.mp3 differ diff --git a/examples/favorites/halle_barry_dumbledore.mp3 b/examples/favorites/halle_barry_dumbledore.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1587002679357c17d1ba92b795bfbe1cad872379 Binary files /dev/null and b/examples/favorites/halle_barry_dumbledore.mp3 differ diff --git a/examples/favorites/halle_barry_oar_to_oar.mp3 b/examples/favorites/halle_barry_oar_to_oar.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d8c43a572349895cb3770f0e7ecb069f8d53aaa5 Binary files /dev/null and b/examples/favorites/halle_barry_oar_to_oar.mp3 differ diff --git a/examples/favorites/henry_cavill_metallic_hydrogen.mp3 b/examples/favorites/henry_cavill_metallic_hydrogen.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..545aa5442c731d188ed4aa29c33db0bd5b413a3c Binary files /dev/null and b/examples/favorites/henry_cavill_metallic_hydrogen.mp3 differ diff --git a/examples/favorites/kennard_road_not_taken.mp3 b/examples/favorites/kennard_road_not_taken.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..449cacc8a2fca8095d3c89136459e73cbe90e4d2 Binary files /dev/null and b/examples/favorites/kennard_road_not_taken.mp3 differ diff --git a/examples/favorites/morgan_freeman_metallic_hydrogen.mp3 b/examples/favorites/morgan_freeman_metallic_hydrogen.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2217e73679529691d9b54dda8b96b757e32e449b Binary files /dev/null and b/examples/favorites/morgan_freeman_metallic_hydrogen.mp3 differ diff --git a/examples/favorites/myself_gatsby.mp3 b/examples/favorites/myself_gatsby.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cedc6e18ef0ac4a04745c15a836c5c2b52679d55 Binary files /dev/null and b/examples/favorites/myself_gatsby.mp3 differ diff --git a/examples/favorites/patrick_stewart_omicron.mp3 b/examples/favorites/patrick_stewart_omicron.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..f3cd7240515cd0e7d81644974e5349e6cd529c15 Binary files /dev/null and b/examples/favorites/patrick_stewart_omicron.mp3 differ diff --git a/examples/favorites/patrick_stewart_secret_of_life.mp3 b/examples/favorites/patrick_stewart_secret_of_life.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1e073bdb903fd582a15ef1f07507847ec5b67c3f Binary files /dev/null and b/examples/favorites/patrick_stewart_secret_of_life.mp3 differ diff --git a/examples/favorites/robert_deniro_review.mp3 b/examples/favorites/robert_deniro_review.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5c1a8960a3d0bfcc62dcc88b43ef671f480dba95 Binary files /dev/null and b/examples/favorites/robert_deniro_review.mp3 differ diff --git a/examples/favorites/william_shatner_spacecraft_interview.mp3 b/examples/favorites/william_shatner_spacecraft_interview.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cdd7e07fef67c32c65576d354a09ca8dafe67e77 Binary files /dev/null and b/examples/favorites/william_shatner_spacecraft_interview.mp3 differ diff --git a/examples/riding_hood/angelina.mp3 b/examples/riding_hood/angelina.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2de43aca32ebc9e46952b256daa9ef1d646a0914 Binary files /dev/null and b/examples/riding_hood/angelina.mp3 differ diff --git a/examples/riding_hood/craig.mp3 b/examples/riding_hood/craig.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..adce4943c3da24a763450ea496034ac43c4c3e14 Binary files /dev/null and b/examples/riding_hood/craig.mp3 differ diff --git a/examples/riding_hood/deniro.mp3 b/examples/riding_hood/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a4257958b7ee8e7b55b590eaf54075507730c9c9 Binary files /dev/null and b/examples/riding_hood/deniro.mp3 differ diff --git a/examples/riding_hood/emma.mp3 b/examples/riding_hood/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d2ddaf3f5725086960e62df47cc06961479f4b14 Binary files /dev/null and b/examples/riding_hood/emma.mp3 differ diff --git a/examples/riding_hood/freeman.mp3 b/examples/riding_hood/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c0268d1aecd231a077cf16c1ab10c4dfe6e1302e Binary files /dev/null and b/examples/riding_hood/freeman.mp3 differ diff --git a/examples/riding_hood/geralt.mp3 b/examples/riding_hood/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..101faaa5faf841db886a67cd3a3a4814aa354ca0 Binary files /dev/null and b/examples/riding_hood/geralt.mp3 differ diff --git a/examples/riding_hood/halle.mp3 b/examples/riding_hood/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b2dca2845accc74cc0acd7b9ca24e97105405186 Binary files /dev/null and b/examples/riding_hood/halle.mp3 differ diff --git a/examples/riding_hood/jlaw.mp3 b/examples/riding_hood/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..08e40d198e4fe73dd9a39ff80665cadfabce6360 Binary files /dev/null and b/examples/riding_hood/jlaw.mp3 differ diff --git a/examples/riding_hood/lj.mp3 b/examples/riding_hood/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f6082dd1d78a9c1963da19ce36defc97c2ddb314 Binary files /dev/null and b/examples/riding_hood/lj.mp3 differ diff --git a/examples/riding_hood/myself.mp3 b/examples/riding_hood/myself.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..b5ec4762bfc978acbfd8584d8b9689b05a0759d5 Binary files /dev/null and b/examples/riding_hood/myself.mp3 differ diff --git a/examples/riding_hood/pat.mp3 b/examples/riding_hood/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..736a9d0a2f269c1d12871fd57578846fa1d937fc Binary files /dev/null and b/examples/riding_hood/pat.mp3 differ diff --git a/examples/riding_hood/snakes.mp3 b/examples/riding_hood/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b70f10189e57e6db4e2475cc07fc72f9eef49338 Binary files /dev/null and b/examples/riding_hood/snakes.mp3 differ diff --git a/examples/riding_hood/tom.mp3 b/examples/riding_hood/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0727a553dff12319c880a92100e4ef13ad4f570d Binary files /dev/null and b/examples/riding_hood/tom.mp3 differ diff --git a/examples/riding_hood/weaver.mp3 b/examples/riding_hood/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..adfc3229304e42730bb49b60d0413598d4edba3f Binary files /dev/null and b/examples/riding_hood/weaver.mp3 differ diff --git a/examples/riding_hood/william.mp3 b/examples/riding_hood/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5dd86bac9c17fe19643bac25eef33698f11f84e9 Binary files /dev/null and b/examples/riding_hood/william.mp3 differ diff --git a/examples/tacotron_comparison/2-tacotron2.mp3 b/examples/tacotron_comparison/2-tacotron2.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..161f3850692a9052de963d4ed29d9359ceb318af Binary files /dev/null and b/examples/tacotron_comparison/2-tacotron2.mp3 differ diff --git a/examples/tacotron_comparison/2-tortoise.mp3 b/examples/tacotron_comparison/2-tortoise.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..18cfbd7162356d1613d8db327d8449140fca818c Binary files /dev/null and b/examples/tacotron_comparison/2-tortoise.mp3 differ diff --git a/examples/tacotron_comparison/3-tacotron2.mp3 b/examples/tacotron_comparison/3-tacotron2.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3cd6ed85cc696b4acdb3196446baf0a8f7b3cfbf Binary files /dev/null and b/examples/tacotron_comparison/3-tacotron2.mp3 differ diff --git a/examples/tacotron_comparison/3-tortoise.mp3 b/examples/tacotron_comparison/3-tortoise.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..73d70d2877e5bc717c16564322fa05d6c75427e3 Binary files /dev/null and b/examples/tacotron_comparison/3-tortoise.mp3 differ diff --git a/examples/tacotron_comparison/4-tacotron2.mp3 b/examples/tacotron_comparison/4-tacotron2.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b4d5cd393fa503faecbb0082bb466db2b86d6e64 Binary files /dev/null and b/examples/tacotron_comparison/4-tacotron2.mp3 differ diff --git a/examples/tacotron_comparison/4-tortoise.mp3 b/examples/tacotron_comparison/4-tortoise.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d8632aedb1843a17079b3cc70e8c35371c4ebe2e Binary files /dev/null and b/examples/tacotron_comparison/4-tortoise.mp3 differ diff --git a/examples/various/autoregressive_ml/angie.mp3 b/examples/various/autoregressive_ml/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..06896ebfb6d0434c1386a8073e409f859ed717c6 Binary files /dev/null and b/examples/various/autoregressive_ml/angie.mp3 differ diff --git a/examples/various/autoregressive_ml/daniel.mp3 
b/examples/various/autoregressive_ml/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..debb102109b20fadf6699920a7fa8ae0e6078f1a Binary files /dev/null and b/examples/various/autoregressive_ml/daniel.mp3 differ diff --git a/examples/various/autoregressive_ml/deniro.mp3 b/examples/various/autoregressive_ml/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..656e889491eed2609140d53ecf284899cf4ebcce Binary files /dev/null and b/examples/various/autoregressive_ml/deniro.mp3 differ diff --git a/examples/various/autoregressive_ml/emma.mp3 b/examples/various/autoregressive_ml/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..19972c8751fc414b08e8f9bd590dda7b3f4ed496 Binary files /dev/null and b/examples/various/autoregressive_ml/emma.mp3 differ diff --git a/examples/various/autoregressive_ml/freeman.mp3 b/examples/various/autoregressive_ml/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..86c8fcb74d6a1d77ce851be7de2ab099cd6807f1 Binary files /dev/null and b/examples/various/autoregressive_ml/freeman.mp3 differ diff --git a/examples/various/autoregressive_ml/geralt.mp3 b/examples/various/autoregressive_ml/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..427c3407931f58ac504c4a725fc41ae8bbb10ef8 Binary files /dev/null and b/examples/various/autoregressive_ml/geralt.mp3 differ diff --git a/examples/various/autoregressive_ml/grace_train.mp3 b/examples/various/autoregressive_ml/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ed4c391d0c2dbf6f561a9a8158681334d94f953b Binary files /dev/null and b/examples/various/autoregressive_ml/grace_train.mp3 differ diff --git a/examples/various/autoregressive_ml/halle.mp3 b/examples/various/autoregressive_ml/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0905d8663c9ee1bc557d9c9f8c0986343d3b51b8 Binary files /dev/null and b/examples/various/autoregressive_ml/halle.mp3 differ diff --git a/examples/various/autoregressive_ml/jlaw.mp3 b/examples/various/autoregressive_ml/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..42dcb0b8dfdedf26dad8cea7016efc3f685015c9 Binary files /dev/null and b/examples/various/autoregressive_ml/jlaw.mp3 differ diff --git a/examples/various/autoregressive_ml/lj.mp3 b/examples/various/autoregressive_ml/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2de87936650f292747bc7017b84078b89ae1a945 Binary files /dev/null and b/examples/various/autoregressive_ml/lj.mp3 differ diff --git a/examples/various/autoregressive_ml/myself.mp3 b/examples/various/autoregressive_ml/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8819c195fca2e3df9f00b981b928c8b3e250b966 Binary files /dev/null and b/examples/various/autoregressive_ml/myself.mp3 differ diff --git a/examples/various/autoregressive_ml/pat.mp3 b/examples/various/autoregressive_ml/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..348955d1f64d296fdb4a19f73a5b6c22bebe30f4 Binary files /dev/null and b/examples/various/autoregressive_ml/pat.mp3 differ diff --git a/examples/various/autoregressive_ml/snakes.mp3 b/examples/various/autoregressive_ml/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..59efab9486e9c8b0138ae57dc8af37a04e4f855c Binary files /dev/null and b/examples/various/autoregressive_ml/snakes.mp3 differ diff --git 
a/examples/various/autoregressive_ml/tom.mp3 b/examples/various/autoregressive_ml/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3d9356928ec35f3cad27b5ebbe6308309e30369b Binary files /dev/null and b/examples/various/autoregressive_ml/tom.mp3 differ diff --git a/examples/various/autoregressive_ml/train_atkins.mp3 b/examples/various/autoregressive_ml/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..63c2eb9d687927e1e67202e5066346502725fddd Binary files /dev/null and b/examples/various/autoregressive_ml/train_atkins.mp3 differ diff --git a/examples/various/autoregressive_ml/train_dotrice.mp3 b/examples/various/autoregressive_ml/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8585635d6fdbd8cc42a4a07f2e559683d0265ad9 Binary files /dev/null and b/examples/various/autoregressive_ml/train_dotrice.mp3 differ diff --git a/examples/various/autoregressive_ml/train_kennard.mp3 b/examples/various/autoregressive_ml/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6d2b92c9c4fce7e87ed6e2250348a5f340c3f935 Binary files /dev/null and b/examples/various/autoregressive_ml/train_kennard.mp3 differ diff --git a/examples/various/autoregressive_ml/weaver.mp3 b/examples/various/autoregressive_ml/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..daf45f77f9103892dc22407975a2c87a8b2f5c7a Binary files /dev/null and b/examples/various/autoregressive_ml/weaver.mp3 differ diff --git a/examples/various/autoregressive_ml/william.mp3 b/examples/various/autoregressive_ml/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0af2fd826d935882506786d5799ad059bfd607c0 Binary files /dev/null and b/examples/various/autoregressive_ml/william.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/angie.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..68f5eeab4aea7f9f50fea219c12c2294262726c9 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/angie.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/daniel.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2fa81b20e0e0358f1a66fdd8731e4b5b452a6034 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/daniel.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/deniro.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..30a4ee95327add221de8c8487d83c0553fb165d1 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/deniro.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/emma.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ef74dda3ab91cbc40c989058dc88b8502bab56b7 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/emma.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/freeman.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e9ad46c976404abf055edc869793a061c3073471 Binary files /dev/null and 
b/examples/various/bengio_it_needs_to_know_what_is_bad/freeman.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/geralt.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ea327a078f971df195ac66888fcf8edf595d3148 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/geralt.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/grace_train.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3f23452b871be8a814a4d28bfc45f0757a23eb17 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/grace_train.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/halle.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9572c5923ad4c00dac2948475817a55891a85922 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/halle.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/jlaw.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1f424f9c345cb36e3694b6e121fb42faee3966d2 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/jlaw.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/lj.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..912951ab12f6a7018ff119d7c8cb253fdb444780 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/lj.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/myself.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c7ae71bd7ae51f434eff7e5d0ac1d794e6167eff Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/myself.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/pat.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..181f1f3a74e46fea519562d4ac614d1a8e9e16f3 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/pat.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/snakes.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9382d9a9f8301f196211866907adeb1d3677b37e Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/snakes.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/tom.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..13e886d86214488b071a2547d8041361804920a1 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/tom.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/train_atkins.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7fb8c98998c107d94dc05a589efbc7558808a3cf Binary files /dev/null and 
b/examples/various/bengio_it_needs_to_know_what_is_bad/train_atkins.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/train_dotrice.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5a5285420ba6a6a5e9172c6160b18981bbc41e0b Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/train_dotrice.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/train_kennard.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2015aec0ee6e68d2505fc545ea0e5e4da7b9ca5c Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/train_kennard.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/weaver.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..85d18da076408bb5fa14553b4fa942832f972096 Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/weaver.mp3 differ diff --git a/examples/various/bengio_it_needs_to_know_what_is_bad/william.mp3 b/examples/various/bengio_it_needs_to_know_what_is_bad/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ab5c04c42c1704156b62f2c45d3d018e988cb03c Binary files /dev/null and b/examples/various/bengio_it_needs_to_know_what_is_bad/william.mp3 differ diff --git a/examples/various/desktop.ini b/examples/various/desktop.ini new file mode 100644 index 0000000000000000000000000000000000000000..d957fd188dbc10a2a74cd31f9982e1cfcc0c4a28 --- /dev/null +++ b/examples/various/desktop.ini @@ -0,0 +1,4 @@ +[ViewState] +Mode= +Vid= +FolderType=Generic diff --git a/examples/various/dickinson_stop_for_death/angie.mp3 b/examples/various/dickinson_stop_for_death/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0524ee2cb629c3df80cb32b6c7f64b7e3a52aee7 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/angie.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/daniel.mp3 b/examples/various/dickinson_stop_for_death/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..55bca8fd0e2e562a073f25f797f42e131773b890 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/daniel.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/deniro.mp3 b/examples/various/dickinson_stop_for_death/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b4aa9f502c65f0787ca5de47bd07305dbf9d4aea Binary files /dev/null and b/examples/various/dickinson_stop_for_death/deniro.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/emma.mp3 b/examples/various/dickinson_stop_for_death/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6d59d3257f49052ca8cf23507596bac2762feaf0 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/emma.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/freeman.mp3 b/examples/various/dickinson_stop_for_death/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3c78a491cc7251dd21565ce9ecc4294331bf27e4 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/freeman.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/geralt.mp3 b/examples/various/dickinson_stop_for_death/geralt.mp3 new file 
mode 100644 index 0000000000000000000000000000000000000000..91bd5d7cc7fcd589698c75c0ce10bbc747bc8fdf Binary files /dev/null and b/examples/various/dickinson_stop_for_death/geralt.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/grace_train.mp3 b/examples/various/dickinson_stop_for_death/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7b0e7dc162d8d5deef599e01f6eab41be6313fd2 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/grace_train.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/halle.mp3 b/examples/various/dickinson_stop_for_death/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0f4dd03c43d53fd56ee2a4450c44eccc285de577 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/halle.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/jlaw.mp3 b/examples/various/dickinson_stop_for_death/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4bbeb322ff56008a932dd6e736bd25f370f1f9c1 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/jlaw.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/lj.mp3 b/examples/various/dickinson_stop_for_death/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..74a1586f6fe5d327b2759ed9737042ecfeca6ca9 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/lj.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/myself.mp3 b/examples/various/dickinson_stop_for_death/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a7a4438d21e34ccfb72aaf2c96fa406afe37d85d Binary files /dev/null and b/examples/various/dickinson_stop_for_death/myself.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/pat.mp3 b/examples/various/dickinson_stop_for_death/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c50b91268b176c934b48f20271373e6353e5162c Binary files /dev/null and b/examples/various/dickinson_stop_for_death/pat.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/snakes.mp3 b/examples/various/dickinson_stop_for_death/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f6e2bfe4998969eb7363444a749d2ced8cfa4d11 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/snakes.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/tom.mp3 b/examples/various/dickinson_stop_for_death/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8baa5bccf3d82cd783b14adc0cccacd486469cc4 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/tom.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/train_atkins.mp3 b/examples/various/dickinson_stop_for_death/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4c85f7c77341a1fd37a4a4e59ca15a78172ed29a Binary files /dev/null and b/examples/various/dickinson_stop_for_death/train_atkins.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/train_dotrice.mp3 b/examples/various/dickinson_stop_for_death/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..02a13febb1ecaec43a7d2e87871df44b40efdcf6 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/train_dotrice.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/train_kennard.mp3 b/examples/various/dickinson_stop_for_death/train_kennard.mp3 new file mode 
100644 index 0000000000000000000000000000000000000000..b7f7c8283a261045e97c93b6de68990e2cb25ec9 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/train_kennard.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/weaver.mp3 b/examples/various/dickinson_stop_for_death/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..db74c7879c933b85fb05184d37fc43a3fe18d7d5 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/weaver.mp3 differ diff --git a/examples/various/dickinson_stop_for_death/william.mp3 b/examples/various/dickinson_stop_for_death/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9a19e012c53681c5ec1ea807720fefee49bc2af7 Binary files /dev/null and b/examples/various/dickinson_stop_for_death/william.mp3 differ diff --git a/examples/various/espn_basketball/angie.mp3 b/examples/various/espn_basketball/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9734e5d64d546e6ab960dc234598ccf3b0d11f5f Binary files /dev/null and b/examples/various/espn_basketball/angie.mp3 differ diff --git a/examples/various/espn_basketball/daniel.mp3 b/examples/various/espn_basketball/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e2a3e7041defd8334d05ed1ae8571058811ef391 Binary files /dev/null and b/examples/various/espn_basketball/daniel.mp3 differ diff --git a/examples/various/espn_basketball/deniro.mp3 b/examples/various/espn_basketball/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c36bd7bff4cc5afe63d7ceb7fe5d6c1878b75fdd Binary files /dev/null and b/examples/various/espn_basketball/deniro.mp3 differ diff --git a/examples/various/espn_basketball/emma.mp3 b/examples/various/espn_basketball/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e72445d7653c0a228303d14ba60863279a198a29 Binary files /dev/null and b/examples/various/espn_basketball/emma.mp3 differ diff --git a/examples/various/espn_basketball/freeman.mp3 b/examples/various/espn_basketball/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5d2559d88e342eb834a2bb006b852e832ac3c327 Binary files /dev/null and b/examples/various/espn_basketball/freeman.mp3 differ diff --git a/examples/various/espn_basketball/geralt.mp3 b/examples/various/espn_basketball/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a6230ce48d28d77235ca2bd7d792f44392c65958 Binary files /dev/null and b/examples/various/espn_basketball/geralt.mp3 differ diff --git a/examples/various/espn_basketball/grace_train.mp3 b/examples/various/espn_basketball/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..eca5f6bfa0e52a175215cd39c504f4edd8d2b33f Binary files /dev/null and b/examples/various/espn_basketball/grace_train.mp3 differ diff --git a/examples/various/espn_basketball/halle.mp3 b/examples/various/espn_basketball/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..93bccd8f49e1e411a15a52faa10fe1935f0b2465 Binary files /dev/null and b/examples/various/espn_basketball/halle.mp3 differ diff --git a/examples/various/espn_basketball/jlaw.mp3 b/examples/various/espn_basketball/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ce7857cf618f45e209d62d6f7cd6cc40b3bfaedc Binary files /dev/null and b/examples/various/espn_basketball/jlaw.mp3 differ diff --git a/examples/various/espn_basketball/lj.mp3 
b/examples/various/espn_basketball/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..909f9293dc208b88dafb5c28bcbb909280bb7235 Binary files /dev/null and b/examples/various/espn_basketball/lj.mp3 differ diff --git a/examples/various/espn_basketball/myself.mp3 b/examples/various/espn_basketball/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..76e0d7da3fb5f016fa0ffcb12d94e5a2d7203913 Binary files /dev/null and b/examples/various/espn_basketball/myself.mp3 differ diff --git a/examples/various/espn_basketball/pat.mp3 b/examples/various/espn_basketball/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..41a03e9f848d2669bde519974aeb148f8dfd7273 Binary files /dev/null and b/examples/various/espn_basketball/pat.mp3 differ diff --git a/examples/various/espn_basketball/snakes.mp3 b/examples/various/espn_basketball/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..664ff621587137dd78555e88eee78a1f09e5d1cf Binary files /dev/null and b/examples/various/espn_basketball/snakes.mp3 differ diff --git a/examples/various/espn_basketball/tom.mp3 b/examples/various/espn_basketball/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8fb46560030a2ad5ab4f9b082d4b8e77e4a28a6b Binary files /dev/null and b/examples/various/espn_basketball/tom.mp3 differ diff --git a/examples/various/espn_basketball/train_atkins.mp3 b/examples/various/espn_basketball/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1270da139fcb39a398699d7a3343d4968afb17fd Binary files /dev/null and b/examples/various/espn_basketball/train_atkins.mp3 differ diff --git a/examples/various/espn_basketball/train_dotrice.mp3 b/examples/various/espn_basketball/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b2720f03e88b65a9ca22406fb01cf7761d6060b3 Binary files /dev/null and b/examples/various/espn_basketball/train_dotrice.mp3 differ diff --git a/examples/various/espn_basketball/train_kennard.mp3 b/examples/various/espn_basketball/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..13948c0b6870c7aa95a1c06c98dd0c6868cf7b3d Binary files /dev/null and b/examples/various/espn_basketball/train_kennard.mp3 differ diff --git a/examples/various/espn_basketball/weaver.mp3 b/examples/various/espn_basketball/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..672755f09adc21ae59aefd326f3cf1bc39a2ddf9 Binary files /dev/null and b/examples/various/espn_basketball/weaver.mp3 differ diff --git a/examples/various/espn_basketball/william.mp3 b/examples/various/espn_basketball/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..69368ab6165ea5999558863beeaa7ada611a028b Binary files /dev/null and b/examples/various/espn_basketball/william.mp3 differ diff --git a/examples/various/frost_oar_to_oar/angie.mp3 b/examples/various/frost_oar_to_oar/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a3e6b53635d28c716fd4068ba1611fcd303687f1 Binary files /dev/null and b/examples/various/frost_oar_to_oar/angie.mp3 differ diff --git a/examples/various/frost_oar_to_oar/daniel.mp3 b/examples/various/frost_oar_to_oar/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fa93d0d5155050b9f954f4a07615456bce4d024a Binary files /dev/null and b/examples/various/frost_oar_to_oar/daniel.mp3 differ diff --git 
a/examples/various/frost_oar_to_oar/deniro.mp3 b/examples/various/frost_oar_to_oar/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4502426d30f83dec1f7c096417c7de45467a7f77 Binary files /dev/null and b/examples/various/frost_oar_to_oar/deniro.mp3 differ diff --git a/examples/various/frost_oar_to_oar/emma.mp3 b/examples/various/frost_oar_to_oar/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f73f487c3b30c6e1f8e1264c2982646f6f0fb3c8 Binary files /dev/null and b/examples/various/frost_oar_to_oar/emma.mp3 differ diff --git a/examples/various/frost_oar_to_oar/freeman.mp3 b/examples/various/frost_oar_to_oar/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..71043d9681c2e4ee2dd762abd41d99ca11ca0ce9 Binary files /dev/null and b/examples/various/frost_oar_to_oar/freeman.mp3 differ diff --git a/examples/various/frost_oar_to_oar/geralt.mp3 b/examples/various/frost_oar_to_oar/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..66ebd2b9afcd88c525ed7c4e0d5492c8ef20f46c Binary files /dev/null and b/examples/various/frost_oar_to_oar/geralt.mp3 differ diff --git a/examples/various/frost_oar_to_oar/grace_train.mp3 b/examples/various/frost_oar_to_oar/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0b716e1891e4ec8ba3b5521d1cb38c01c6d7aff3 Binary files /dev/null and b/examples/various/frost_oar_to_oar/grace_train.mp3 differ diff --git a/examples/various/frost_oar_to_oar/halle.mp3 b/examples/various/frost_oar_to_oar/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d8c43a572349895cb3770f0e7ecb069f8d53aaa5 Binary files /dev/null and b/examples/various/frost_oar_to_oar/halle.mp3 differ diff --git a/examples/various/frost_oar_to_oar/jlaw.mp3 b/examples/various/frost_oar_to_oar/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d1af59531ba314f4a74073a35458cd5268c83007 Binary files /dev/null and b/examples/various/frost_oar_to_oar/jlaw.mp3 differ diff --git a/examples/various/frost_oar_to_oar/lj.mp3 b/examples/various/frost_oar_to_oar/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..50a1dca3bfdbb4a1ed63004c67accc43a2f91c3a Binary files /dev/null and b/examples/various/frost_oar_to_oar/lj.mp3 differ diff --git a/examples/various/frost_oar_to_oar/myself.mp3 b/examples/various/frost_oar_to_oar/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a31163be1bb327aa95d243e6c0e33dac18ad9d6e Binary files /dev/null and b/examples/various/frost_oar_to_oar/myself.mp3 differ diff --git a/examples/various/frost_oar_to_oar/pat.mp3 b/examples/various/frost_oar_to_oar/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..da4df4d4032380c54cce15da28412515d79296f1 Binary files /dev/null and b/examples/various/frost_oar_to_oar/pat.mp3 differ diff --git a/examples/various/frost_oar_to_oar/snakes.mp3 b/examples/various/frost_oar_to_oar/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..355c955b83b7e84a8875e7c4709231af26110a68 Binary files /dev/null and b/examples/various/frost_oar_to_oar/snakes.mp3 differ diff --git a/examples/various/frost_oar_to_oar/tom.mp3 b/examples/various/frost_oar_to_oar/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..960febe8f1a2db1ce2284a78b54d2f0c8b56ed43 Binary files /dev/null and b/examples/various/frost_oar_to_oar/tom.mp3 differ diff --git 
a/examples/various/frost_oar_to_oar/train_atkins.mp3 b/examples/various/frost_oar_to_oar/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4b515e664395f012e3bed6d9a4b26cf999edf630 Binary files /dev/null and b/examples/various/frost_oar_to_oar/train_atkins.mp3 differ diff --git a/examples/various/frost_oar_to_oar/train_dotrice.mp3 b/examples/various/frost_oar_to_oar/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..659d6fdfd77a53db5fd6e2e57d6423fb468674f4 Binary files /dev/null and b/examples/various/frost_oar_to_oar/train_dotrice.mp3 differ diff --git a/examples/various/frost_oar_to_oar/train_kennard.mp3 b/examples/various/frost_oar_to_oar/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2fc9cfd5236de3f5aa82f6960494d67c58606fcc Binary files /dev/null and b/examples/various/frost_oar_to_oar/train_kennard.mp3 differ diff --git a/examples/various/frost_oar_to_oar/weaver.mp3 b/examples/various/frost_oar_to_oar/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..14b3020f867a1e49a38a6de7ad0d2ed20a718660 Binary files /dev/null and b/examples/various/frost_oar_to_oar/weaver.mp3 differ diff --git a/examples/various/frost_oar_to_oar/william.mp3 b/examples/various/frost_oar_to_oar/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e60100cab27c7acd6b5da1ac8c5cd3957579f249 Binary files /dev/null and b/examples/various/frost_oar_to_oar/william.mp3 differ diff --git a/examples/various/frost_road_not_taken/angie.mp3 b/examples/various/frost_road_not_taken/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..600ccb3bf22cf93b85642b1681797a81e3c39d11 Binary files /dev/null and b/examples/various/frost_road_not_taken/angie.mp3 differ diff --git a/examples/various/frost_road_not_taken/daniel.mp3 b/examples/various/frost_road_not_taken/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..da5991e0e22f3d5421711356b08cdcc8653d5efb Binary files /dev/null and b/examples/various/frost_road_not_taken/daniel.mp3 differ diff --git a/examples/various/frost_road_not_taken/deniro.mp3 b/examples/various/frost_road_not_taken/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ca6ff51e9c656fd1159115362d23016a486e756c Binary files /dev/null and b/examples/various/frost_road_not_taken/deniro.mp3 differ diff --git a/examples/various/frost_road_not_taken/emma.mp3 b/examples/various/frost_road_not_taken/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3474e84dc0b8cfe570920531c1bebdc9b3bf8039 Binary files /dev/null and b/examples/various/frost_road_not_taken/emma.mp3 differ diff --git a/examples/various/frost_road_not_taken/freeman.mp3 b/examples/various/frost_road_not_taken/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3b8c93fd593bf9a879134af3f3615b0418ad783b Binary files /dev/null and b/examples/various/frost_road_not_taken/freeman.mp3 differ diff --git a/examples/various/frost_road_not_taken/geralt.mp3 b/examples/various/frost_road_not_taken/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..441140cae0eb88358c4cf639ed60e42ca19ae0c4 Binary files /dev/null and b/examples/various/frost_road_not_taken/geralt.mp3 differ diff --git a/examples/various/frost_road_not_taken/grace_train.mp3 b/examples/various/frost_road_not_taken/grace_train.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..45029f7bcfab270e6d38ca074b5443bc710cb6b5 Binary files /dev/null and b/examples/various/frost_road_not_taken/grace_train.mp3 differ diff --git a/examples/various/frost_road_not_taken/halle.mp3 b/examples/various/frost_road_not_taken/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5d3205ea8ff9e7b74ed314c33848c5cb1be80104 Binary files /dev/null and b/examples/various/frost_road_not_taken/halle.mp3 differ diff --git a/examples/various/frost_road_not_taken/jlaw.mp3 b/examples/various/frost_road_not_taken/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8a96f4727612884f54a76049f85235c1667587a4 Binary files /dev/null and b/examples/various/frost_road_not_taken/jlaw.mp3 differ diff --git a/examples/various/frost_road_not_taken/lj.mp3 b/examples/various/frost_road_not_taken/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..75f9fd0fb7e004f5499d0b09cf18ea76f5d2e651 Binary files /dev/null and b/examples/various/frost_road_not_taken/lj.mp3 differ diff --git a/examples/various/frost_road_not_taken/myself.mp3 b/examples/various/frost_road_not_taken/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..29294bbda04ccecd0ae88b69dc6742f413c47cc2 Binary files /dev/null and b/examples/various/frost_road_not_taken/myself.mp3 differ diff --git a/examples/various/frost_road_not_taken/pat.mp3 b/examples/various/frost_road_not_taken/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..90addd1ca131294180aba9e606dfa9f347b797e7 Binary files /dev/null and b/examples/various/frost_road_not_taken/pat.mp3 differ diff --git a/examples/various/frost_road_not_taken/snakes.mp3 b/examples/various/frost_road_not_taken/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..db2d8c6f669d665a18b2f26ed3a95ad96d71890a Binary files /dev/null and b/examples/various/frost_road_not_taken/snakes.mp3 differ diff --git a/examples/various/frost_road_not_taken/tom.mp3 b/examples/various/frost_road_not_taken/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2806f6549057f68af42fd17f54d1851f9d29b780 Binary files /dev/null and b/examples/various/frost_road_not_taken/tom.mp3 differ diff --git a/examples/various/frost_road_not_taken/train_atkins.mp3 b/examples/various/frost_road_not_taken/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6822821f864e49c3889ac11fc04b33cb0364c093 Binary files /dev/null and b/examples/various/frost_road_not_taken/train_atkins.mp3 differ diff --git a/examples/various/frost_road_not_taken/train_dotrice.mp3 b/examples/various/frost_road_not_taken/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e44066ed7b9ac2814cdf03f6ed92b7f6db03ef90 Binary files /dev/null and b/examples/various/frost_road_not_taken/train_dotrice.mp3 differ diff --git a/examples/various/frost_road_not_taken/train_kennard.mp3 b/examples/various/frost_road_not_taken/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..449cacc8a2fca8095d3c89136459e73cbe90e4d2 Binary files /dev/null and b/examples/various/frost_road_not_taken/train_kennard.mp3 differ diff --git a/examples/various/frost_road_not_taken/weaver.mp3 b/examples/various/frost_road_not_taken/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4abe2a01efbe0e82489c8860f2e4a19dd1f91c6f Binary files /dev/null and 
b/examples/various/frost_road_not_taken/weaver.mp3 differ diff --git a/examples/various/frost_road_not_taken/william.mp3 b/examples/various/frost_road_not_taken/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ea5191e8dea3b2402f3815bf0f71ed06f8f65f3a Binary files /dev/null and b/examples/various/frost_road_not_taken/william.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/angie.mp3 b/examples/various/gatsby_and_so_we_beat_on/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..227de907226e0046bf155136ba971f8e8dd301c8 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/angie.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/daniel.mp3 b/examples/various/gatsby_and_so_we_beat_on/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9f4de62c745f0bfa077a568ead9bc926bb8e28b2 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/daniel.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/deniro.mp3 b/examples/various/gatsby_and_so_we_beat_on/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9e5a878145ca9d5d12cef4fe50705533c6411e3d Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/deniro.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/emma.mp3 b/examples/various/gatsby_and_so_we_beat_on/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8349ed7d3701fa564996b5aadfa498707f9eb50b Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/emma.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/freeman.mp3 b/examples/various/gatsby_and_so_we_beat_on/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8f47e55d478c8b50656a078a809a3302a8317f35 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/freeman.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/geralt.mp3 b/examples/various/gatsby_and_so_we_beat_on/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..112e4919470cd7c3b06075a2948738ad86838c48 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/geralt.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/grace_train.mp3 b/examples/various/gatsby_and_so_we_beat_on/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9893e3077fb3c1f63bd4938ca813085b21250df4 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/grace_train.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/halle.mp3 b/examples/various/gatsby_and_so_we_beat_on/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c22f442c94ee5b536ff7ce5544d0e8a57a0ca4e0 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/halle.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/jlaw.mp3 b/examples/various/gatsby_and_so_we_beat_on/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..29391df255ce9df3c0895775ee90c52138061d36 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/jlaw.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/lj.mp3 b/examples/various/gatsby_and_so_we_beat_on/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5cb3cb340806b84a755db95a0043caeff668ee19 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/lj.mp3 
differ diff --git a/examples/various/gatsby_and_so_we_beat_on/myself.mp3 b/examples/various/gatsby_and_so_we_beat_on/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cedc6e18ef0ac4a04745c15a836c5c2b52679d55 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/myself.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/pat.mp3 b/examples/various/gatsby_and_so_we_beat_on/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3a3a4b261e8319c91941e1267d7e0cc7769c4f46 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/pat.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/snakes.mp3 b/examples/various/gatsby_and_so_we_beat_on/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fe052f943e310f01ebc548b7555b349bd69fc254 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/snakes.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/tom.mp3 b/examples/various/gatsby_and_so_we_beat_on/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6892369e5ed6303e33e9da29631ce905908a52b2 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/tom.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/train_atkins.mp3 b/examples/various/gatsby_and_so_we_beat_on/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..974a0e43c6d99b32812589d33ab7e1385f1cc3ae Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/train_atkins.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/train_dotrice.mp3 b/examples/various/gatsby_and_so_we_beat_on/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..788df4ea1d5be98eb9841dec1b957bdf588856fb Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/train_dotrice.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/train_kennard.mp3 b/examples/various/gatsby_and_so_we_beat_on/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..888f83a8355c92237172c5280403bc88b12df5f5 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/train_kennard.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/weaver.mp3 b/examples/various/gatsby_and_so_we_beat_on/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3cd9f5b576d43cd66869a69d38437d44f23b17d9 Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/weaver.mp3 differ diff --git a/examples/various/gatsby_and_so_we_beat_on/william.mp3 b/examples/various/gatsby_and_so_we_beat_on/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..689a5c850793962c80bbdc18fbb105827607921d Binary files /dev/null and b/examples/various/gatsby_and_so_we_beat_on/william.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/angie.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6eb50f11bc47e0c8cd25a79fdcacd50c98c3e7cb Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/angie.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/daniel.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/daniel.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..3429af991cec6d828021f35b0367f5cf4f3744fa Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/daniel.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/deniro.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..eacbe7fc39968d2cc6e5c4fe1cd2bf2a8a089ffd Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/deniro.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/emma.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a2e224ec9839f5bc6c2eab6de2d4f7625d3b500f Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/emma.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/freeman.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3925eeb48425b25d946be11de196a09ed0970f53 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/freeman.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/geralt.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..38852f8552a47260ba35ff1ab1801a9ea25b4ed3 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/geralt.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/grace_train.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d551a05bb1f2401515688cf288cdcc27f11f2174 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/grace_train.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/halle.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1587002679357c17d1ba92b795bfbe1cad872379 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/halle.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/jlaw.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bd83fb68643e2288a6fdffb02c04c133542f353f Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/jlaw.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/lj.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..38e450ed02508f9bc7d9b6006dc932d996a9edc6 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/lj.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/myself.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b549860eefa04cf52c604186f859e76a569ec177 Binary files /dev/null and 
b/examples/various/harrypotter_differences_of_habit_and_language/myself.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/pat.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..42156b4c05df7817e9b9130397172996c910aef4 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/pat.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/snakes.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1e44b7506e303a772a686026beba134ade901ec3 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/snakes.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/tom.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8bc7c2b848a77312d0908154f93f3c388685f0c8 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/tom.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/train_atkins.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..32bf707174c2ec3df49f2cf2965880b6e37b7275 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/train_atkins.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/train_dotrice.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f33d37dbf37512284b3e4a0afdccc4cc8385262c Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/train_dotrice.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/train_kennard.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..11b9d14776f24eecb0687aa3bac9c5348055f3c2 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/train_kennard.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/weaver.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..95ce0aea400fe158f00253bf33602dc2ea0b6060 Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/weaver.mp3 differ diff --git a/examples/various/harrypotter_differences_of_habit_and_language/william.mp3 b/examples/various/harrypotter_differences_of_habit_and_language/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..11e87ac9669fcd3ff1a3570fd7e500e109d3126e Binary files /dev/null and b/examples/various/harrypotter_differences_of_habit_and_language/william.mp3 differ diff --git a/examples/various/i_am_a_language_model/angie.mp3 b/examples/various/i_am_a_language_model/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..572e39d64196de57447feae3a92dd8bc3b2a0b86 Binary files /dev/null and b/examples/various/i_am_a_language_model/angie.mp3 differ diff --git 
a/examples/various/i_am_a_language_model/daniel.mp3 b/examples/various/i_am_a_language_model/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9064455152984b77d23154edbdaf585ceb6c0b3b Binary files /dev/null and b/examples/various/i_am_a_language_model/daniel.mp3 differ diff --git a/examples/various/i_am_a_language_model/deniro.mp3 b/examples/various/i_am_a_language_model/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3ff97bb5eefcf5e297ac21d4325e2f0c2d0868ae Binary files /dev/null and b/examples/various/i_am_a_language_model/deniro.mp3 differ diff --git a/examples/various/i_am_a_language_model/emma.mp3 b/examples/various/i_am_a_language_model/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8108afc19846235a2254cac9cf024d818c87a9aa Binary files /dev/null and b/examples/various/i_am_a_language_model/emma.mp3 differ diff --git a/examples/various/i_am_a_language_model/freeman.mp3 b/examples/various/i_am_a_language_model/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..25773345c06e7249e4be8d0ab62d39f4d4577f9b Binary files /dev/null and b/examples/various/i_am_a_language_model/freeman.mp3 differ diff --git a/examples/various/i_am_a_language_model/geralt.mp3 b/examples/various/i_am_a_language_model/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..632a47cfb31da522fcccc77321aeb15caebd8530 Binary files /dev/null and b/examples/various/i_am_a_language_model/geralt.mp3 differ diff --git a/examples/various/i_am_a_language_model/grace_train.mp3 b/examples/various/i_am_a_language_model/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cf2744cb8c85ff3137dd2c363343879552ff8b2a Binary files /dev/null and b/examples/various/i_am_a_language_model/grace_train.mp3 differ diff --git a/examples/various/i_am_a_language_model/halle.mp3 b/examples/various/i_am_a_language_model/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e519ad6e2932ebbf0b7479de881c3be5db70bf91 Binary files /dev/null and b/examples/various/i_am_a_language_model/halle.mp3 differ diff --git a/examples/various/i_am_a_language_model/jlaw.mp3 b/examples/various/i_am_a_language_model/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c9368b03cbbe7e6d2c35c173739e51dac42bf09c Binary files /dev/null and b/examples/various/i_am_a_language_model/jlaw.mp3 differ diff --git a/examples/various/i_am_a_language_model/lj.mp3 b/examples/various/i_am_a_language_model/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7aaa2665ba104ef6a1385cd826e45130c93553c2 Binary files /dev/null and b/examples/various/i_am_a_language_model/lj.mp3 differ diff --git a/examples/various/i_am_a_language_model/myself.mp3 b/examples/various/i_am_a_language_model/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cda991ced66145c8e9b09949fe7030a94a55c42f Binary files /dev/null and b/examples/various/i_am_a_language_model/myself.mp3 differ diff --git a/examples/various/i_am_a_language_model/pat.mp3 b/examples/various/i_am_a_language_model/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f3657b67ff27c33f4b9adb6266d784626b9a5b08 Binary files /dev/null and b/examples/various/i_am_a_language_model/pat.mp3 differ diff --git a/examples/various/i_am_a_language_model/snakes.mp3 b/examples/various/i_am_a_language_model/snakes.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..53a3c326323a309222597a27d4b05cfc84fcf328 Binary files /dev/null and b/examples/various/i_am_a_language_model/snakes.mp3 differ diff --git a/examples/various/i_am_a_language_model/tom.mp3 b/examples/various/i_am_a_language_model/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..72475c0205061ef214b9185b3ccd15ef2faf7075 Binary files /dev/null and b/examples/various/i_am_a_language_model/tom.mp3 differ diff --git a/examples/various/i_am_a_language_model/train_atkins.mp3 b/examples/various/i_am_a_language_model/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5640532fde51ce0d744e012e6a733a9f4f4ab1e0 Binary files /dev/null and b/examples/various/i_am_a_language_model/train_atkins.mp3 differ diff --git a/examples/various/i_am_a_language_model/train_dotrice.mp3 b/examples/various/i_am_a_language_model/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..699eac3eef3943fd0771f7e2346537d538069524 Binary files /dev/null and b/examples/various/i_am_a_language_model/train_dotrice.mp3 differ diff --git a/examples/various/i_am_a_language_model/train_kennard.mp3 b/examples/various/i_am_a_language_model/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..05e0b5033cc3d645399f2eadc2db9d5765b7b738 Binary files /dev/null and b/examples/various/i_am_a_language_model/train_kennard.mp3 differ diff --git a/examples/various/i_am_a_language_model/weaver.mp3 b/examples/various/i_am_a_language_model/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..29cc6f1db473affb8586aaf87e5962c5dd5cdc14 Binary files /dev/null and b/examples/various/i_am_a_language_model/weaver.mp3 differ diff --git a/examples/various/i_am_a_language_model/william.mp3 b/examples/various/i_am_a_language_model/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7a534143c7d09ec3f3936c7d49478962702fa75b Binary files /dev/null and b/examples/various/i_am_a_language_model/william.mp3 differ diff --git a/examples/various/melodie_kao/angie.mp3 b/examples/various/melodie_kao/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..89fcd9f8bf533f78e035b50e505ef5f4573ff799 Binary files /dev/null and b/examples/various/melodie_kao/angie.mp3 differ diff --git a/examples/various/melodie_kao/daniel.mp3 b/examples/various/melodie_kao/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..86cff5ec960faf5c156a0b344ac794f76c56495e Binary files /dev/null and b/examples/various/melodie_kao/daniel.mp3 differ diff --git a/examples/various/melodie_kao/deniro.mp3 b/examples/various/melodie_kao/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..08bf099269757cd6a58f8bef550f9f1a05fdf117 Binary files /dev/null and b/examples/various/melodie_kao/deniro.mp3 differ diff --git a/examples/various/melodie_kao/emma.mp3 b/examples/various/melodie_kao/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..53fb044974dea4dba39fdf57876df3d68ad1e23a Binary files /dev/null and b/examples/various/melodie_kao/emma.mp3 differ diff --git a/examples/various/melodie_kao/freeman.mp3 b/examples/various/melodie_kao/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2217e73679529691d9b54dda8b96b757e32e449b Binary files /dev/null and b/examples/various/melodie_kao/freeman.mp3 differ diff --git a/examples/various/melodie_kao/geralt.mp3 
b/examples/various/melodie_kao/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..545aa5442c731d188ed4aa29c33db0bd5b413a3c Binary files /dev/null and b/examples/various/melodie_kao/geralt.mp3 differ diff --git a/examples/various/melodie_kao/grace_train.mp3 b/examples/various/melodie_kao/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..812137a327ecdc7791fe3bdc964656ad1ad0400e Binary files /dev/null and b/examples/various/melodie_kao/grace_train.mp3 differ diff --git a/examples/various/melodie_kao/halle.mp3 b/examples/various/melodie_kao/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..edfc5a088d45d8517d5fcd11d9d79cb7df00ed05 Binary files /dev/null and b/examples/various/melodie_kao/halle.mp3 differ diff --git a/examples/various/melodie_kao/jlaw.mp3 b/examples/various/melodie_kao/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0c7022800b4e6cee497220c249cc435ce6a71704 Binary files /dev/null and b/examples/various/melodie_kao/jlaw.mp3 differ diff --git a/examples/various/melodie_kao/lj.mp3 b/examples/various/melodie_kao/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..475eadc6004acabfbe3baad711fa501a82732bfd Binary files /dev/null and b/examples/various/melodie_kao/lj.mp3 differ diff --git a/examples/various/melodie_kao/myself.mp3 b/examples/various/melodie_kao/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7fdebfc8067288f655bae9fb6bdd5228408442f9 Binary files /dev/null and b/examples/various/melodie_kao/myself.mp3 differ diff --git a/examples/various/melodie_kao/pat.mp3 b/examples/various/melodie_kao/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fd9f7818b464592053077e69a4dcb1bb92f12b0a Binary files /dev/null and b/examples/various/melodie_kao/pat.mp3 differ diff --git a/examples/various/melodie_kao/snakes.mp3 b/examples/various/melodie_kao/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..69e65717e83ec280cca211d5da7eb2a58b47431c Binary files /dev/null and b/examples/various/melodie_kao/snakes.mp3 differ diff --git a/examples/various/melodie_kao/tom.mp3 b/examples/various/melodie_kao/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..454c272d916dcbad615ee13b597046d5f01342bc Binary files /dev/null and b/examples/various/melodie_kao/tom.mp3 differ diff --git a/examples/various/melodie_kao/train_atkins.mp3 b/examples/various/melodie_kao/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a45127420a333567f448f881f88dc5c09374108a Binary files /dev/null and b/examples/various/melodie_kao/train_atkins.mp3 differ diff --git a/examples/various/melodie_kao/train_dotrice.mp3 b/examples/various/melodie_kao/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e7a79841141e6f0f227fb3eefcf065b5b9292be6 Binary files /dev/null and b/examples/various/melodie_kao/train_dotrice.mp3 differ diff --git a/examples/various/melodie_kao/train_kennard.mp3 b/examples/various/melodie_kao/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..17d76cde75f550a5fd90f465cd5f48040d41c7fd Binary files /dev/null and b/examples/various/melodie_kao/train_kennard.mp3 differ diff --git a/examples/various/melodie_kao/weaver.mp3 b/examples/various/melodie_kao/weaver.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..b821249e486c7736347bb5a485f765a6d2c06a59 Binary files /dev/null and b/examples/various/melodie_kao/weaver.mp3 differ diff --git a/examples/various/melodie_kao/william.mp3 b/examples/various/melodie_kao/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d80d00af0bc2885b769a4dd89deb5450504e940d Binary files /dev/null and b/examples/various/melodie_kao/william.mp3 differ diff --git a/examples/various/nyt_covid/angie.mp3 b/examples/various/nyt_covid/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..59780356133e8867c3f9c09d0ef4b03756860b94 Binary files /dev/null and b/examples/various/nyt_covid/angie.mp3 differ diff --git a/examples/various/nyt_covid/daniel.mp3 b/examples/various/nyt_covid/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fadda4f42311701cbb6ac0fc07b397f01b55017a Binary files /dev/null and b/examples/various/nyt_covid/daniel.mp3 differ diff --git a/examples/various/nyt_covid/deniro.mp3 b/examples/various/nyt_covid/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..db58f734c26346322541ffc5bd634d47a345d142 Binary files /dev/null and b/examples/various/nyt_covid/deniro.mp3 differ diff --git a/examples/various/nyt_covid/emma.mp3 b/examples/various/nyt_covid/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1623cc1356fa18f98055c8df1394889ec4593a89 Binary files /dev/null and b/examples/various/nyt_covid/emma.mp3 differ diff --git a/examples/various/nyt_covid/freeman.mp3 b/examples/various/nyt_covid/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..76b1e9051986dc29ee2ed7b91e718323f5890d99 Binary files /dev/null and b/examples/various/nyt_covid/freeman.mp3 differ diff --git a/examples/various/nyt_covid/geralt.mp3 b/examples/various/nyt_covid/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..306e06d8622285cf3db2f85ab2db051b87823d46 Binary files /dev/null and b/examples/various/nyt_covid/geralt.mp3 differ diff --git a/examples/various/nyt_covid/grace_train.mp3 b/examples/various/nyt_covid/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bd666d8916020f2b9dd2848ced39cbf0422c63be Binary files /dev/null and b/examples/various/nyt_covid/grace_train.mp3 differ diff --git a/examples/various/nyt_covid/halle.mp3 b/examples/various/nyt_covid/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1b7abe49805abf3dd3363e4fd7f2c31dbc9b448f Binary files /dev/null and b/examples/various/nyt_covid/halle.mp3 differ diff --git a/examples/various/nyt_covid/jlaw.mp3 b/examples/various/nyt_covid/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bc31e3a38651d05bc13a88d66366a786b44c4f88 Binary files /dev/null and b/examples/various/nyt_covid/jlaw.mp3 differ diff --git a/examples/various/nyt_covid/lj.mp3 b/examples/various/nyt_covid/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..763580ea471c5d293978aae200d2ed4ac930b964 Binary files /dev/null and b/examples/various/nyt_covid/lj.mp3 differ diff --git a/examples/various/nyt_covid/myself.mp3 b/examples/various/nyt_covid/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..11d6758e1edd7c28837f760da195b786ff994539 Binary files /dev/null and b/examples/various/nyt_covid/myself.mp3 differ diff --git a/examples/various/nyt_covid/pat.mp3 b/examples/various/nyt_covid/pat.mp3 new file mode 
100644 index 0000000000000000000000000000000000000000..f3cd7240515cd0e7d81644974e5349e6cd529c15 Binary files /dev/null and b/examples/various/nyt_covid/pat.mp3 differ diff --git a/examples/various/nyt_covid/snakes.mp3 b/examples/various/nyt_covid/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d5804ac2b41042885b90c579b7076c298ec45f74 Binary files /dev/null and b/examples/various/nyt_covid/snakes.mp3 differ diff --git a/examples/various/nyt_covid/tom.mp3 b/examples/various/nyt_covid/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6272a11f7e37bcd919b1f8ece34bab84aedb0ec0 Binary files /dev/null and b/examples/various/nyt_covid/tom.mp3 differ diff --git a/examples/various/nyt_covid/train_atkins.mp3 b/examples/various/nyt_covid/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f105eb07663e304d153b4406eeadd1bb060e1872 Binary files /dev/null and b/examples/various/nyt_covid/train_atkins.mp3 differ diff --git a/examples/various/nyt_covid/train_dotrice.mp3 b/examples/various/nyt_covid/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..472fd08aa27cbee10771e2165e556a3a2c60193b Binary files /dev/null and b/examples/various/nyt_covid/train_dotrice.mp3 differ diff --git a/examples/various/nyt_covid/train_kennard.mp3 b/examples/various/nyt_covid/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d8ef3bfd01cb6fdc682bf5d03efa554f92c1228f Binary files /dev/null and b/examples/various/nyt_covid/train_kennard.mp3 differ diff --git a/examples/various/nyt_covid/weaver.mp3 b/examples/various/nyt_covid/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9928ac9a78e9014877e5822fafd409f7014485af Binary files /dev/null and b/examples/various/nyt_covid/weaver.mp3 differ diff --git a/examples/various/nyt_covid/william.mp3 b/examples/various/nyt_covid/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7c16ef19ac822ffd4e8496cd3fef3f9e0d44ca1a Binary files /dev/null and b/examples/various/nyt_covid/william.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/angie.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d34f342525ca5d39949f17a35f3ec00332ebe105 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/angie.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/daniel.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..73eae78745113a137684258912dd3b9c4e990387 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/daniel.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/deniro.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e4cdf81c9106f662f16a0714f99d04e68421180c Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/deniro.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/emma.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5b86d18b8902499570a9c23cb6b0a3163784cfe5 Binary files /dev/null and 
b/examples/various/real_courage_is_when_you_know_your_licked/emma.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/freeman.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f7e7eb529cd065550b311c71b7a12f19e3152f78 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/freeman.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/geralt.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4f3246053b8ecc70c6909be58ec885e220b80130 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/geralt.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/grace_train.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..dbb8b926f1862a9248e4e5721c2920a22ce78488 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/grace_train.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/halle.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..144ad673917f065516651663bcf6f11744e1deba Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/halle.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/jlaw.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..34544a4db7e3b474143ca71e9e94a09ec4e34d8e Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/jlaw.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/lj.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6bd286d88d8b5316e5cdabc1e4b8469bf767d104 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/lj.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/myself.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ca52c14346b3b561fdb66114bf5322836d0d915f Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/myself.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/pat.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..092ec34200ab94a3bb7f997af6c534df06e78576 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/pat.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/snakes.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3408367b38f59d0071e618d6c6f61109c1bcbbe0 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/snakes.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/tom.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/tom.mp3 
new file mode 100644 index 0000000000000000000000000000000000000000..3c70a0416504035e210171f376485f0e767e9bcd Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/tom.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/train_atkins.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..12fa5f932de2f80bf1cb30ba87e5f3ac01d620d7 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/train_atkins.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/train_dotrice.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bdbd680b7160f5f40704714993c95bad20a2440b Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/train_dotrice.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/train_kennard.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..20e2b687a84496b830fab064a96c4ebb76f43aab Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/train_kennard.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/weaver.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1e15bbde84c74e67fea1fae9a49337ad969fb176 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/weaver.mp3 differ diff --git a/examples/various/real_courage_is_when_you_know_your_licked/william.mp3 b/examples/various/real_courage_is_when_you_know_your_licked/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..47c65bf62b3b83fd1279456de07d46c66e312867 Binary files /dev/null and b/examples/various/real_courage_is_when_you_know_your_licked/william.mp3 differ diff --git a/examples/various/rolling_stone_review/angie.mp3 b/examples/various/rolling_stone_review/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0c7ece7901159abf3989888c670dd291d8ec59eb Binary files /dev/null and b/examples/various/rolling_stone_review/angie.mp3 differ diff --git a/examples/various/rolling_stone_review/daniel.mp3 b/examples/various/rolling_stone_review/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..73c79d2af3a4ea6db3ec7084c2d0332a96b9b3ff Binary files /dev/null and b/examples/various/rolling_stone_review/daniel.mp3 differ diff --git a/examples/various/rolling_stone_review/deniro.mp3 b/examples/various/rolling_stone_review/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5c1a8960a3d0bfcc62dcc88b43ef671f480dba95 Binary files /dev/null and b/examples/various/rolling_stone_review/deniro.mp3 differ diff --git a/examples/various/rolling_stone_review/emma.mp3 b/examples/various/rolling_stone_review/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9276730e44176e3c43074744402da0efb2df9bf9 Binary files /dev/null and b/examples/various/rolling_stone_review/emma.mp3 differ diff --git a/examples/various/rolling_stone_review/freeman.mp3 b/examples/various/rolling_stone_review/freeman.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..8d342a73c29a37d67ebf731f83d8fe0c078653f5 Binary files /dev/null and b/examples/various/rolling_stone_review/freeman.mp3 differ diff --git a/examples/various/rolling_stone_review/geralt.mp3 b/examples/various/rolling_stone_review/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b2f7153be012949e6470bd75d692fe4396128e40 Binary files /dev/null and b/examples/various/rolling_stone_review/geralt.mp3 differ diff --git a/examples/various/rolling_stone_review/grace_train.mp3 b/examples/various/rolling_stone_review/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..653ce4a8a6ec14d3a9c548bf045c7039d5b79833 Binary files /dev/null and b/examples/various/rolling_stone_review/grace_train.mp3 differ diff --git a/examples/various/rolling_stone_review/halle.mp3 b/examples/various/rolling_stone_review/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c647110c9bc436b2830a11455d5064782afb6068 Binary files /dev/null and b/examples/various/rolling_stone_review/halle.mp3 differ diff --git a/examples/various/rolling_stone_review/jlaw.mp3 b/examples/various/rolling_stone_review/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7bdb6bfedfafd89fc9fcd855687267b53a8f1fe5 Binary files /dev/null and b/examples/various/rolling_stone_review/jlaw.mp3 differ diff --git a/examples/various/rolling_stone_review/lj.mp3 b/examples/various/rolling_stone_review/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1d90b0e5bfc58301d23919fdce12da276c13572b Binary files /dev/null and b/examples/various/rolling_stone_review/lj.mp3 differ diff --git a/examples/various/rolling_stone_review/myself.mp3 b/examples/various/rolling_stone_review/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7ebc12d7b027e63d74eb670c6d983dc6c62d4593 Binary files /dev/null and b/examples/various/rolling_stone_review/myself.mp3 differ diff --git a/examples/various/rolling_stone_review/pat.mp3 b/examples/various/rolling_stone_review/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bb49db878ecb1fa7ebafc7ea01be398ef38b839f Binary files /dev/null and b/examples/various/rolling_stone_review/pat.mp3 differ diff --git a/examples/various/rolling_stone_review/snakes.mp3 b/examples/various/rolling_stone_review/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8a190f478c2c2403bc8f8189b33f7578c913185b Binary files /dev/null and b/examples/various/rolling_stone_review/snakes.mp3 differ diff --git a/examples/various/rolling_stone_review/tom.mp3 b/examples/various/rolling_stone_review/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0b13318ce9508234ffcdc9d2dcb24381c626d07d Binary files /dev/null and b/examples/various/rolling_stone_review/tom.mp3 differ diff --git a/examples/various/rolling_stone_review/train_atkins.mp3 b/examples/various/rolling_stone_review/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4d8698e478dd3a6727194c74ff4318ba116ca729 Binary files /dev/null and b/examples/various/rolling_stone_review/train_atkins.mp3 differ diff --git a/examples/various/rolling_stone_review/train_dotrice.mp3 b/examples/various/rolling_stone_review/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6b1dca927fd75943461fd40a91a38d0d3aeec4e0 Binary files /dev/null and 
b/examples/various/rolling_stone_review/train_dotrice.mp3 differ diff --git a/examples/various/rolling_stone_review/train_kennard.mp3 b/examples/various/rolling_stone_review/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..74c5760d2c8e4de07cc3e211661e7a7604d6fcbf Binary files /dev/null and b/examples/various/rolling_stone_review/train_kennard.mp3 differ diff --git a/examples/various/rolling_stone_review/weaver.mp3 b/examples/various/rolling_stone_review/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..11041eeb33570c0b1b28798ed16f0a3f162a3523 Binary files /dev/null and b/examples/various/rolling_stone_review/weaver.mp3 differ diff --git a/examples/various/rolling_stone_review/william.mp3 b/examples/various/rolling_stone_review/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ad42631a992aeb1b8ddef12d9c984728b89563f4 Binary files /dev/null and b/examples/various/rolling_stone_review/william.mp3 differ diff --git a/examples/various/spacecraft_interview/angie.mp3 b/examples/various/spacecraft_interview/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d8e6b1fef15fc11ff943dec33446cc43a826fd08 Binary files /dev/null and b/examples/various/spacecraft_interview/angie.mp3 differ diff --git a/examples/various/spacecraft_interview/daniel.mp3 b/examples/various/spacecraft_interview/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..30688d5147fbb3404b0a5f5d0971ecb952550408 Binary files /dev/null and b/examples/various/spacecraft_interview/daniel.mp3 differ diff --git a/examples/various/spacecraft_interview/deniro.mp3 b/examples/various/spacecraft_interview/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d634bf0731fb3a58788010a2946676095259b4ee Binary files /dev/null and b/examples/various/spacecraft_interview/deniro.mp3 differ diff --git a/examples/various/spacecraft_interview/emma.mp3 b/examples/various/spacecraft_interview/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a1c1078277ca87e83ffff335022b52f0eed3063a Binary files /dev/null and b/examples/various/spacecraft_interview/emma.mp3 differ diff --git a/examples/various/spacecraft_interview/freeman.mp3 b/examples/various/spacecraft_interview/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bd4dca57895189766f29ed1d8c9ed259f2a04d42 Binary files /dev/null and b/examples/various/spacecraft_interview/freeman.mp3 differ diff --git a/examples/various/spacecraft_interview/geralt.mp3 b/examples/various/spacecraft_interview/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..84175b5a14e81c2597a5b33a110c562cb282f767 Binary files /dev/null and b/examples/various/spacecraft_interview/geralt.mp3 differ diff --git a/examples/various/spacecraft_interview/grace_train.mp3 b/examples/various/spacecraft_interview/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..53f9364b612290a83efc8ac15b135e717a404ea0 Binary files /dev/null and b/examples/various/spacecraft_interview/grace_train.mp3 differ diff --git a/examples/various/spacecraft_interview/halle.mp3 b/examples/various/spacecraft_interview/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e0ff6d54b295b73f76aabf18dd64252847223f45 Binary files /dev/null and b/examples/various/spacecraft_interview/halle.mp3 differ diff --git a/examples/various/spacecraft_interview/jlaw.mp3 
b/examples/various/spacecraft_interview/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..83e332739c9e15a8d22760419b1d8e7fc4517c80 Binary files /dev/null and b/examples/various/spacecraft_interview/jlaw.mp3 differ diff --git a/examples/various/spacecraft_interview/lj.mp3 b/examples/various/spacecraft_interview/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8d1ea9400253aa353fb097b5fac4301b21053bec Binary files /dev/null and b/examples/various/spacecraft_interview/lj.mp3 differ diff --git a/examples/various/spacecraft_interview/myself.mp3 b/examples/various/spacecraft_interview/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d295b68877fb96007a49fa0f5e51c68472a63ac9 Binary files /dev/null and b/examples/various/spacecraft_interview/myself.mp3 differ diff --git a/examples/various/spacecraft_interview/pat.mp3 b/examples/various/spacecraft_interview/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..be5098de77fbb3ca9a7b174b91f99a322bf58a9e Binary files /dev/null and b/examples/various/spacecraft_interview/pat.mp3 differ diff --git a/examples/various/spacecraft_interview/snakes.mp3 b/examples/various/spacecraft_interview/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2427a4515aa94dce6d023a843ab095f7feb648a0 Binary files /dev/null and b/examples/various/spacecraft_interview/snakes.mp3 differ diff --git a/examples/various/spacecraft_interview/tom.mp3 b/examples/various/spacecraft_interview/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..bbde1474ec880306f2fc829e059397812cb15d15 Binary files /dev/null and b/examples/various/spacecraft_interview/tom.mp3 differ diff --git a/examples/various/spacecraft_interview/train_atkins.mp3 b/examples/various/spacecraft_interview/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3e316f198eb9da0c436a11dc6a4b0be50cc5d826 Binary files /dev/null and b/examples/various/spacecraft_interview/train_atkins.mp3 differ diff --git a/examples/various/spacecraft_interview/train_dotrice.mp3 b/examples/various/spacecraft_interview/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4547c5dccfc7ba60d861701d66d3d60715f88834 Binary files /dev/null and b/examples/various/spacecraft_interview/train_dotrice.mp3 differ diff --git a/examples/various/spacecraft_interview/train_kennard.mp3 b/examples/various/spacecraft_interview/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0acf839c819bd18139d444e83f901be40f26b0eb Binary files /dev/null and b/examples/various/spacecraft_interview/train_kennard.mp3 differ diff --git a/examples/various/spacecraft_interview/weaver.mp3 b/examples/various/spacecraft_interview/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6937b9160029b96ed2b0c8934504c49dda560ce3 Binary files /dev/null and b/examples/various/spacecraft_interview/weaver.mp3 differ diff --git a/examples/various/spacecraft_interview/william.mp3 b/examples/various/spacecraft_interview/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cdd7e07fef67c32c65576d354a09ca8dafe67e77 Binary files /dev/null and b/examples/various/spacecraft_interview/william.mp3 differ diff --git a/examples/various/tacotron2_sample1/angie.mp3 b/examples/various/tacotron2_sample1/angie.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..69faecff642a932e9a298bc64b13eb78a654b46e Binary files /dev/null and b/examples/various/tacotron2_sample1/angie.mp3 differ diff --git a/examples/various/tacotron2_sample1/daniel.mp3 b/examples/various/tacotron2_sample1/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5b202685eb8233e74ccc5c0e01e2b89e4775fd1a Binary files /dev/null and b/examples/various/tacotron2_sample1/daniel.mp3 differ diff --git a/examples/various/tacotron2_sample1/deniro.mp3 b/examples/various/tacotron2_sample1/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8da37ffeaff8864ab61a45508c4e40199c5e7735 Binary files /dev/null and b/examples/various/tacotron2_sample1/deniro.mp3 differ diff --git a/examples/various/tacotron2_sample1/emma.mp3 b/examples/various/tacotron2_sample1/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e881857453cc94a61e1a51df6494449b61d427f8 Binary files /dev/null and b/examples/various/tacotron2_sample1/emma.mp3 differ diff --git a/examples/various/tacotron2_sample1/freeman.mp3 b/examples/various/tacotron2_sample1/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0d3653ea72ac16549b68c56d3981ce0126157861 Binary files /dev/null and b/examples/various/tacotron2_sample1/freeman.mp3 differ diff --git a/examples/various/tacotron2_sample1/geralt.mp3 b/examples/various/tacotron2_sample1/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9160da81674759d938d905c7c9b4007f253c4626 Binary files /dev/null and b/examples/various/tacotron2_sample1/geralt.mp3 differ diff --git a/examples/various/tacotron2_sample1/grace_train.mp3 b/examples/various/tacotron2_sample1/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4891ce33180886d0dec8ef1fcb8124147d10bbb5 Binary files /dev/null and b/examples/various/tacotron2_sample1/grace_train.mp3 differ diff --git a/examples/various/tacotron2_sample1/halle.mp3 b/examples/various/tacotron2_sample1/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..dcbbcf3af69d8969334ec284b4eca0c656270b14 Binary files /dev/null and b/examples/various/tacotron2_sample1/halle.mp3 differ diff --git a/examples/various/tacotron2_sample1/jlaw.mp3 b/examples/various/tacotron2_sample1/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c9dbbc3da71deb65432e7cd4913ea2ae1c6567e9 Binary files /dev/null and b/examples/various/tacotron2_sample1/jlaw.mp3 differ diff --git a/examples/various/tacotron2_sample1/lj.mp3 b/examples/various/tacotron2_sample1/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7dc0f49a0344118ae7207a3f715d0d4c49f48319 Binary files /dev/null and b/examples/various/tacotron2_sample1/lj.mp3 differ diff --git a/examples/various/tacotron2_sample1/myself.mp3 b/examples/various/tacotron2_sample1/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..291394e66df7af2bffb3725d51e9e92aa02d43d0 Binary files /dev/null and b/examples/various/tacotron2_sample1/myself.mp3 differ diff --git a/examples/various/tacotron2_sample1/pat.mp3 b/examples/various/tacotron2_sample1/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ee5ec06b8e3de26a0c6f36af1fd134bf951dcffe Binary files /dev/null and b/examples/various/tacotron2_sample1/pat.mp3 differ diff --git a/examples/various/tacotron2_sample1/snakes.mp3 b/examples/various/tacotron2_sample1/snakes.mp3 new file 
mode 100644 index 0000000000000000000000000000000000000000..7ebbb849a60e9f8a2bd11f4f9bec1567a41af907 Binary files /dev/null and b/examples/various/tacotron2_sample1/snakes.mp3 differ diff --git a/examples/various/tacotron2_sample1/tom.mp3 b/examples/various/tacotron2_sample1/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6bc369d7324cb429bf86f1dbb5f2f522cc0e7454 Binary files /dev/null and b/examples/various/tacotron2_sample1/tom.mp3 differ diff --git a/examples/various/tacotron2_sample1/train_atkins.mp3 b/examples/various/tacotron2_sample1/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9cee843c3466ea5a8a81d48d36171c6fc0dac298 Binary files /dev/null and b/examples/various/tacotron2_sample1/train_atkins.mp3 differ diff --git a/examples/various/tacotron2_sample1/train_dotrice.mp3 b/examples/various/tacotron2_sample1/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5159064bc38270ffde94e7baaad1b6c501cc7bb1 Binary files /dev/null and b/examples/various/tacotron2_sample1/train_dotrice.mp3 differ diff --git a/examples/various/tacotron2_sample1/train_kennard.mp3 b/examples/various/tacotron2_sample1/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..dc355544c2deeb9a6c1c7fd20ba4f378e6368d7a Binary files /dev/null and b/examples/various/tacotron2_sample1/train_kennard.mp3 differ diff --git a/examples/various/tacotron2_sample1/weaver.mp3 b/examples/various/tacotron2_sample1/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e115a74e2bb407cf4cb3bedf7d3578683b90264e Binary files /dev/null and b/examples/various/tacotron2_sample1/weaver.mp3 differ diff --git a/examples/various/tacotron2_sample1/william.mp3 b/examples/various/tacotron2_sample1/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b42a8eeb316378484006e3848f568b4daa8997fc Binary files /dev/null and b/examples/various/tacotron2_sample1/william.mp3 differ diff --git a/examples/various/tacotron2_sample2/angie.mp3 b/examples/various/tacotron2_sample2/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b173019b50e65371912e540f3dd0bc6e29df804a Binary files /dev/null and b/examples/various/tacotron2_sample2/angie.mp3 differ diff --git a/examples/various/tacotron2_sample2/daniel.mp3 b/examples/various/tacotron2_sample2/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..59a5a025111ef4c041596ea5269d0d62ecfb5c81 Binary files /dev/null and b/examples/various/tacotron2_sample2/daniel.mp3 differ diff --git a/examples/various/tacotron2_sample2/deniro.mp3 b/examples/various/tacotron2_sample2/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b03a248a9ba67c78be344d77799f43e31f186de4 Binary files /dev/null and b/examples/various/tacotron2_sample2/deniro.mp3 differ diff --git a/examples/various/tacotron2_sample2/emma.mp3 b/examples/various/tacotron2_sample2/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..37053d96cb668beff00e467710d626b7f01ae2ec Binary files /dev/null and b/examples/various/tacotron2_sample2/emma.mp3 differ diff --git a/examples/various/tacotron2_sample2/freeman.mp3 b/examples/various/tacotron2_sample2/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..47a3fff1158e736d1db81ee21dfc8c3d782d72fa Binary files /dev/null and b/examples/various/tacotron2_sample2/freeman.mp3 differ diff --git 
a/examples/various/tacotron2_sample2/geralt.mp3 b/examples/various/tacotron2_sample2/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5fa80e50823bc39acb16526dc9c4a2a037a2feb8 Binary files /dev/null and b/examples/various/tacotron2_sample2/geralt.mp3 differ diff --git a/examples/various/tacotron2_sample2/grace_train.mp3 b/examples/various/tacotron2_sample2/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..60cbade876d7f7aea25edcad881047fd850341e1 Binary files /dev/null and b/examples/various/tacotron2_sample2/grace_train.mp3 differ diff --git a/examples/various/tacotron2_sample2/halle.mp3 b/examples/various/tacotron2_sample2/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2bbf8a9c0360331f68a4e2ea1f75f8e25121ae45 Binary files /dev/null and b/examples/various/tacotron2_sample2/halle.mp3 differ diff --git a/examples/various/tacotron2_sample2/jlaw.mp3 b/examples/various/tacotron2_sample2/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3239dab81148abdd7934e1591dcb8bc5923e91aa Binary files /dev/null and b/examples/various/tacotron2_sample2/jlaw.mp3 differ diff --git a/examples/various/tacotron2_sample2/lj.mp3 b/examples/various/tacotron2_sample2/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..18cfbd7162356d1613d8db327d8449140fca818c Binary files /dev/null and b/examples/various/tacotron2_sample2/lj.mp3 differ diff --git a/examples/various/tacotron2_sample2/myself.mp3 b/examples/various/tacotron2_sample2/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fc00f74dbdf53dfe00c29c369a2e8776d99e4fe9 Binary files /dev/null and b/examples/various/tacotron2_sample2/myself.mp3 differ diff --git a/examples/various/tacotron2_sample2/pat.mp3 b/examples/various/tacotron2_sample2/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fc6bea6ab41d9a9c90a580f2f0b3edba1dd8dd07 Binary files /dev/null and b/examples/various/tacotron2_sample2/pat.mp3 differ diff --git a/examples/various/tacotron2_sample2/snakes.mp3 b/examples/various/tacotron2_sample2/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7bf2b5a95daccaa1bedfb8dd9080fa72a38f6403 Binary files /dev/null and b/examples/various/tacotron2_sample2/snakes.mp3 differ diff --git a/examples/various/tacotron2_sample2/tom.mp3 b/examples/various/tacotron2_sample2/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2548ee97fd5251af73a49d70f514e12a3f0eebc3 Binary files /dev/null and b/examples/various/tacotron2_sample2/tom.mp3 differ diff --git a/examples/various/tacotron2_sample2/train_atkins.mp3 b/examples/various/tacotron2_sample2/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3f3158078d689f7f8ab0b28d8568ac937ec1d7b6 Binary files /dev/null and b/examples/various/tacotron2_sample2/train_atkins.mp3 differ diff --git a/examples/various/tacotron2_sample2/train_dotrice.mp3 b/examples/various/tacotron2_sample2/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b729e735e973c18bc5ac900811324b3a03132658 Binary files /dev/null and b/examples/various/tacotron2_sample2/train_dotrice.mp3 differ diff --git a/examples/various/tacotron2_sample2/train_kennard.mp3 b/examples/various/tacotron2_sample2/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d35f66864d3226d0add67a48def5c790725c3ec4 Binary files /dev/null and 
b/examples/various/tacotron2_sample2/train_kennard.mp3 differ diff --git a/examples/various/tacotron2_sample2/weaver.mp3 b/examples/various/tacotron2_sample2/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..42502d6b234f296c2af82882d512d31e4b5b7ae2 Binary files /dev/null and b/examples/various/tacotron2_sample2/weaver.mp3 differ diff --git a/examples/various/tacotron2_sample2/william.mp3 b/examples/various/tacotron2_sample2/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7f2941ac6c3abbc5e9710fb8bbe4cd1056b0caa0 Binary files /dev/null and b/examples/various/tacotron2_sample2/william.mp3 differ diff --git a/examples/various/tacotron2_sample3/angie.mp3 b/examples/various/tacotron2_sample3/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e273bf313628b0eed6d5fe445977bd4cecd75c2a Binary files /dev/null and b/examples/various/tacotron2_sample3/angie.mp3 differ diff --git a/examples/various/tacotron2_sample3/daniel.mp3 b/examples/various/tacotron2_sample3/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..060a5e0d43ec4c6541fdeb1f2e4c9977892e61cc Binary files /dev/null and b/examples/various/tacotron2_sample3/daniel.mp3 differ diff --git a/examples/various/tacotron2_sample3/deniro.mp3 b/examples/various/tacotron2_sample3/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..815293256cd172f72f1ae4ef86a14291e488fbe6 Binary files /dev/null and b/examples/various/tacotron2_sample3/deniro.mp3 differ diff --git a/examples/various/tacotron2_sample3/emma.mp3 b/examples/various/tacotron2_sample3/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9ace3f4eb2c99485aacd162d37c04a69797fc0c7 Binary files /dev/null and b/examples/various/tacotron2_sample3/emma.mp3 differ diff --git a/examples/various/tacotron2_sample3/freeman.mp3 b/examples/various/tacotron2_sample3/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..55d24d2ecb5b7f52e2613eeac14ba2e9f9c923e1 Binary files /dev/null and b/examples/various/tacotron2_sample3/freeman.mp3 differ diff --git a/examples/various/tacotron2_sample3/geralt.mp3 b/examples/various/tacotron2_sample3/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4faab91df6bf1af0754ff044c0031f1cac8ddb2e Binary files /dev/null and b/examples/various/tacotron2_sample3/geralt.mp3 differ diff --git a/examples/various/tacotron2_sample3/grace_train.mp3 b/examples/various/tacotron2_sample3/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5787e67ab38132fc957617b7deb2880a2ce5a2d4 Binary files /dev/null and b/examples/various/tacotron2_sample3/grace_train.mp3 differ diff --git a/examples/various/tacotron2_sample3/halle.mp3 b/examples/various/tacotron2_sample3/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..79a3b219ec7f9d336ea0f31e7cbbc03cb86f9d10 Binary files /dev/null and b/examples/various/tacotron2_sample3/halle.mp3 differ diff --git a/examples/various/tacotron2_sample3/jlaw.mp3 b/examples/various/tacotron2_sample3/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..12a4e1ea79700ec2d70147df461b612f81f4bca4 Binary files /dev/null and b/examples/various/tacotron2_sample3/jlaw.mp3 differ diff --git a/examples/various/tacotron2_sample3/lj.mp3 b/examples/various/tacotron2_sample3/lj.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..73d70d2877e5bc717c16564322fa05d6c75427e3 Binary files /dev/null and b/examples/various/tacotron2_sample3/lj.mp3 differ diff --git a/examples/various/tacotron2_sample3/myself.mp3 b/examples/various/tacotron2_sample3/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..977896830ac658db2d99b537583e8ecb395cd5d5 Binary files /dev/null and b/examples/various/tacotron2_sample3/myself.mp3 differ diff --git a/examples/various/tacotron2_sample3/pat.mp3 b/examples/various/tacotron2_sample3/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..ea16bef5e4846e0f5c4b5ae8e62fec862da8a451 Binary files /dev/null and b/examples/various/tacotron2_sample3/pat.mp3 differ diff --git a/examples/various/tacotron2_sample3/snakes.mp3 b/examples/various/tacotron2_sample3/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..65af7783fb5f68383373de09252870e47745b1ab Binary files /dev/null and b/examples/various/tacotron2_sample3/snakes.mp3 differ diff --git a/examples/various/tacotron2_sample3/tom.mp3 b/examples/various/tacotron2_sample3/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a387e10f712cc52ca5cd4664bb88271ca27be6ee Binary files /dev/null and b/examples/various/tacotron2_sample3/tom.mp3 differ diff --git a/examples/various/tacotron2_sample3/train_atkins.mp3 b/examples/various/tacotron2_sample3/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..da7fe3521ecf81480fa6ad3e6a709206c88513d7 Binary files /dev/null and b/examples/various/tacotron2_sample3/train_atkins.mp3 differ diff --git a/examples/various/tacotron2_sample3/train_dotrice.mp3 b/examples/various/tacotron2_sample3/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..97de6cc0b38fc4056c34a338fbee57c38de74383 Binary files /dev/null and b/examples/various/tacotron2_sample3/train_dotrice.mp3 differ diff --git a/examples/various/tacotron2_sample3/train_kennard.mp3 b/examples/various/tacotron2_sample3/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..55803b2326dc1a2afc90c28014dd84e4915c54e5 Binary files /dev/null and b/examples/various/tacotron2_sample3/train_kennard.mp3 differ diff --git a/examples/various/tacotron2_sample3/weaver.mp3 b/examples/various/tacotron2_sample3/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c96541f63c41f9bd591c24ac207079610be23aeb Binary files /dev/null and b/examples/various/tacotron2_sample3/weaver.mp3 differ diff --git a/examples/various/tacotron2_sample3/william.mp3 b/examples/various/tacotron2_sample3/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..91713896b01bf9b5edde6c37cab519ac61edbf7b Binary files /dev/null and b/examples/various/tacotron2_sample3/william.mp3 differ diff --git a/examples/various/tacotron2_sample4/angie.mp3 b/examples/various/tacotron2_sample4/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d0a3b837f860f3925e31172f2749061fb37db75a Binary files /dev/null and b/examples/various/tacotron2_sample4/angie.mp3 differ diff --git a/examples/various/tacotron2_sample4/daniel.mp3 b/examples/various/tacotron2_sample4/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2e84c55bb0b43613f814cf09d5fcad881e3a5b02 Binary files /dev/null and b/examples/various/tacotron2_sample4/daniel.mp3 differ diff --git a/examples/various/tacotron2_sample4/deniro.mp3 
b/examples/various/tacotron2_sample4/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6bd872544a098ec0bb723dd97a0c5dc0c320b4de Binary files /dev/null and b/examples/various/tacotron2_sample4/deniro.mp3 differ diff --git a/examples/various/tacotron2_sample4/emma.mp3 b/examples/various/tacotron2_sample4/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..37d349fac51669988c1cc0fc89da0958fe293b57 Binary files /dev/null and b/examples/various/tacotron2_sample4/emma.mp3 differ diff --git a/examples/various/tacotron2_sample4/freeman.mp3 b/examples/various/tacotron2_sample4/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..70d94c95de46fa5e334dd91c45fc3cf85c5700a0 Binary files /dev/null and b/examples/various/tacotron2_sample4/freeman.mp3 differ diff --git a/examples/various/tacotron2_sample4/geralt.mp3 b/examples/various/tacotron2_sample4/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..48a311110c0ca43d767881018ebe53901bf7d7f5 Binary files /dev/null and b/examples/various/tacotron2_sample4/geralt.mp3 differ diff --git a/examples/various/tacotron2_sample4/grace_train.mp3 b/examples/various/tacotron2_sample4/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5d4aa154fcf5b3df047972ae83a80a1e3b9c0c8d Binary files /dev/null and b/examples/various/tacotron2_sample4/grace_train.mp3 differ diff --git a/examples/various/tacotron2_sample4/halle.mp3 b/examples/various/tacotron2_sample4/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..8983eb27485c95d0358430919baf82a845fd910a Binary files /dev/null and b/examples/various/tacotron2_sample4/halle.mp3 differ diff --git a/examples/various/tacotron2_sample4/jlaw.mp3 b/examples/various/tacotron2_sample4/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b7876d8bbaec2ab2692203d908d08ff960c3f366 Binary files /dev/null and b/examples/various/tacotron2_sample4/jlaw.mp3 differ diff --git a/examples/various/tacotron2_sample4/lj.mp3 b/examples/various/tacotron2_sample4/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d8632aedb1843a17079b3cc70e8c35371c4ebe2e Binary files /dev/null and b/examples/various/tacotron2_sample4/lj.mp3 differ diff --git a/examples/various/tacotron2_sample4/myself.mp3 b/examples/various/tacotron2_sample4/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c6655e4a36b70fe5a24cb59b36b27f05072f74eb Binary files /dev/null and b/examples/various/tacotron2_sample4/myself.mp3 differ diff --git a/examples/various/tacotron2_sample4/pat.mp3 b/examples/various/tacotron2_sample4/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3bc96d12b863d0f0a34b4ed977dde068142bb3a2 Binary files /dev/null and b/examples/various/tacotron2_sample4/pat.mp3 differ diff --git a/examples/various/tacotron2_sample4/snakes.mp3 b/examples/various/tacotron2_sample4/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..768ea835bfcb566e940b3f30bd07726434e9a0b9 Binary files /dev/null and b/examples/various/tacotron2_sample4/snakes.mp3 differ diff --git a/examples/various/tacotron2_sample4/tom.mp3 b/examples/various/tacotron2_sample4/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1fbf11359826df5667546b7d9645aeaeeb803518 Binary files /dev/null and b/examples/various/tacotron2_sample4/tom.mp3 differ diff --git 
a/examples/various/tacotron2_sample4/train_atkins.mp3 b/examples/various/tacotron2_sample4/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..850973b23764dadcd993f8f6b771f92cc88e4b15 Binary files /dev/null and b/examples/various/tacotron2_sample4/train_atkins.mp3 differ diff --git a/examples/various/tacotron2_sample4/train_dotrice.mp3 b/examples/various/tacotron2_sample4/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9da2c9373c762dc44c3460406fe269824bf6925b Binary files /dev/null and b/examples/various/tacotron2_sample4/train_dotrice.mp3 differ diff --git a/examples/various/tacotron2_sample4/train_kennard.mp3 b/examples/various/tacotron2_sample4/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..cfbedd4270b969e39917fd66b7c7ad1f34d3aa56 Binary files /dev/null and b/examples/various/tacotron2_sample4/train_kennard.mp3 differ diff --git a/examples/various/tacotron2_sample4/weaver.mp3 b/examples/various/tacotron2_sample4/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..894a4f859aee88347876095a455623a3f0830ba1 Binary files /dev/null and b/examples/various/tacotron2_sample4/weaver.mp3 differ diff --git a/examples/various/tacotron2_sample4/william.mp3 b/examples/various/tacotron2_sample4/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b49fdb928c2ce1811a94211f4bca81e29f9bc1ac Binary files /dev/null and b/examples/various/tacotron2_sample4/william.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/angie.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..82d554cb0e1c4b9d80eac30a824d943e2a65677e Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/angie.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/daniel.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0bd5e634330135cb8fba0571e936526e6cab989f Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/daniel.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/deniro.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..953151fcb9519d58f958e59b18c69e7b809e509b Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/deniro.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/emma.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..13c43dbf67614d5969efb1bb719f1e6dc2fa6611 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/emma.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/freeman.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..c4cef5f7005ac0d5d768a2be3c4b86432b21827a Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/freeman.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/geralt.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/geralt.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..7e7bf5da24be3c660b43c4cd981016e4e3e66591 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/geralt.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/grace_train.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..5d039296c2366ee3cfbd639624c6688a5f562fd9 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/grace_train.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/halle.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1ea810c1ca67457b2acaeb961b1eb7c30a0f76ee Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/halle.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/jlaw.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e23272e57a50470691e4da6326038f4896d491d5 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/jlaw.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/lj.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..70a6a71128b580af5b0f9c7d3ee1adb562b77fde Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/lj.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/myself.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..33415a4e0436614b066a7dae7a0c57f2b55c3471 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/myself.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/pat.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1e073bdb903fd582a15ef1f07507847ec5b67c3f Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/pat.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/snakes.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..1e9a9d5bf37fd17ce41135a670695eedfaa6f7a6 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/snakes.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/tom.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f208bdb0de1f413c83951a204ee92205c4f1d34c Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/tom.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/train_atkins.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..672bb75031c74a52d9405a84da968ab67388b6a1 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/train_atkins.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/train_dotrice.mp3 
b/examples/various/watts_this_is_the_real_secret_of_life/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e91fba901f4f2d164f3ba067ca0abf04c8a059f4 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/train_dotrice.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/train_kennard.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..69a3b4a363256f4e582cfe370fffde9a61853c83 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/train_kennard.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/weaver.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..41ba690f01368a087a3579b72cae99b9435ec934 Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/weaver.mp3 differ diff --git a/examples/various/watts_this_is_the_real_secret_of_life/william.mp3 b/examples/various/watts_this_is_the_real_secret_of_life/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6717b9d13682f626c59a371a74fabd74a9fc903d Binary files /dev/null and b/examples/various/watts_this_is_the_real_secret_of_life/william.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/angie.mp3 b/examples/various/wilde_nowadays_people_know_the_price/angie.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a654e45d8713ed942b8ad5d7a2685bcf6e2bce40 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/angie.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/daniel.mp3 b/examples/various/wilde_nowadays_people_know_the_price/daniel.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..84e9b5e5b0870fa5dbcf6a6aa7b220c9e63b5452 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/daniel.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/deniro.mp3 b/examples/various/wilde_nowadays_people_know_the_price/deniro.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..2203b70ccf60c579e6e2978fa4a3078c85583952 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/deniro.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/emma.mp3 b/examples/various/wilde_nowadays_people_know_the_price/emma.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..0d317d1c2b8d47f15b5e471b3b06dbe811bf43a1 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/emma.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/freeman.mp3 b/examples/various/wilde_nowadays_people_know_the_price/freeman.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..95b08b184f9d13651ba8b6f1cdee8a30d7ad02c9 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/freeman.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/geralt.mp3 b/examples/various/wilde_nowadays_people_know_the_price/geralt.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..7125761bc77c4eb6dcf329420e5943805fdda710 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/geralt.mp3 differ diff --git 
a/examples/various/wilde_nowadays_people_know_the_price/grace_train.mp3 b/examples/various/wilde_nowadays_people_know_the_price/grace_train.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..722d7efbffc619e99fb0888772527250b117ba21 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/grace_train.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/halle.mp3 b/examples/various/wilde_nowadays_people_know_the_price/halle.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..f280f68c31985ef1f1bc4178d262b3e6414384ed Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/halle.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/jlaw.mp3 b/examples/various/wilde_nowadays_people_know_the_price/jlaw.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..897e55984e07aa64a2bfbd3241460f573ea73e48 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/jlaw.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/lj.mp3 b/examples/various/wilde_nowadays_people_know_the_price/lj.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..94d60bf5aad7516909ece9ca691a2dc7bbbfa072 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/lj.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/myself.mp3 b/examples/various/wilde_nowadays_people_know_the_price/myself.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..26c4e7f21e86bd3536b5526d583a175c8bc60108 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/myself.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/pat.mp3 b/examples/various/wilde_nowadays_people_know_the_price/pat.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a40338ca62ed37bbb1f80336dca4c919970ae2df Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/pat.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/snakes.mp3 b/examples/various/wilde_nowadays_people_know_the_price/snakes.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..d0e1a6c0a22cdf77ebd89ac71250db23198cfdc7 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/snakes.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/tom.mp3 b/examples/various/wilde_nowadays_people_know_the_price/tom.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..3a4e274a31fb6b736a1b1cb14397b7960c016aaa Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/tom.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/train_atkins.mp3 b/examples/various/wilde_nowadays_people_know_the_price/train_atkins.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..a7ae9ee50ebac567c284806d43a0b54b5444b456 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/train_atkins.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/train_dotrice.mp3 b/examples/various/wilde_nowadays_people_know_the_price/train_dotrice.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..61966f40f5d02c5e98b80ebe079212d577481b1e Binary files /dev/null and 
b/examples/various/wilde_nowadays_people_know_the_price/train_dotrice.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/train_kennard.mp3 b/examples/various/wilde_nowadays_people_know_the_price/train_kennard.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..19f051fdc0e40b0c96be86605e11ee738c50ed7d Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/train_kennard.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/weaver.mp3 b/examples/various/wilde_nowadays_people_know_the_price/weaver.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4d3779fa6ceb33633be18d37f6c5d38bf18cfaba Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/weaver.mp3 differ diff --git a/examples/various/wilde_nowadays_people_know_the_price/william.mp3 b/examples/various/wilde_nowadays_people_know_the_price/william.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..9f7291943b02d63dee4294182da0f565048a9934 Binary files /dev/null and b/examples/various/wilde_nowadays_people_know_the_price/william.mp3 differ diff --git a/is_this_from_tortoise.py b/is_this_from_tortoise.py new file mode 100644 index 0000000000000000000000000000000000000000..550b33e61c13c7ffe9509ae2b07d81903ee7cb38 --- /dev/null +++ b/is_this_from_tortoise.py @@ -0,0 +1,14 @@ +import argparse + +from api import classify_audio_clip +from utils.audio import load_audio + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--clip', type=str, help='Path to an audio clip to classify.', default="results/favorite_riding_hood.mp3") + args = parser.parse_args() + + clip = load_audio(args.clip, 24000) + clip = clip[:, :220000] + prob = classify_audio_clip(clip) + print(f"This classifier thinks there is a {prob*100}% chance that this clip was generated from Tortoise.") \ No newline at end of file diff --git a/models/arch_util.py b/models/arch_util.py new file mode 100644 index 0000000000000000000000000000000000000000..832315c15c7c2a182d1f0d9fa0d971299e05d2f1 --- /dev/null +++ b/models/arch_util.py @@ -0,0 +1,367 @@ +import functools +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def normalization(channels): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + groups = 32 + if channels <= 16: + groups = 8 + elif channels <= 64: + groups = 16 + while channels % groups != 0: + groups = int(groups / 2) + assert groups > 2 + return GroupNorm32(groups, channels) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv, mask=None, rel_pos=None): + """ + Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
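# A minimal, self-contained shape sketch (illustrative only, not part of this diff) of the
# fused-QKV attention implemented below: with n_heads=4 and 16 channels per head, the
# (N, H*3*C, T) input splits into per-head q, k, v and the output recovers (N, H*C, T).
# Scaling q and k by 1/sqrt(sqrt(C)) each is equivalent to dividing the logits by sqrt(C).
import math
import torch

n_heads, ch, length = 4, 16, 10
qkv = torch.randn(2, 3 * n_heads * ch, length)                        # (N, H*3*C, T)
q, k, v = qkv.reshape(2 * n_heads, ch * 3, length).split(ch, dim=1)
scale = 1 / math.sqrt(math.sqrt(ch))
weight = torch.softmax(torch.einsum("bct,bcs->bts", q * scale, k * scale), dim=-1)
out = torch.einsum("bts,bcs->bct", weight, v).reshape(2, -1, length)  # (N, H*C, T) == (2, 64, 10)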
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = torch.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + if rel_pos is not None: + weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(bs * self.n_heads, weight.shape[-2], weight.shape[-1]) + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + if mask is not None: + # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs. + mask = mask.repeat(self.n_heads, 1).unsqueeze(1) + weight = weight * mask + a = torch.einsum("bts,bcs->bct", weight, v) + + return a.reshape(bs, -1, length) + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + do_checkpoint=True, + relative_pos_embeddings=False, + ): + super().__init__() + self.channels = channels + self.do_checkpoint = do_checkpoint + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.norm = normalization(channels) + self.qkv = nn.Conv1d(channels, channels * 3, 1) + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) + if relative_pos_embeddings: + self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64) + else: + self.relative_pos_embeddings = None + + def forward(self, x, mask=None): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv, mask, self.relative_pos_embeddings) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + """ + + def __init__(self, channels, use_conv, out_channels=None, factor=4): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.factor = factor + if use_conv: + ksize = 5 + pad = 2 + self.conv = nn.Conv1d(self.channels, self.out_channels, ksize, padding=pad) + + def forward(self, x): + assert x.shape[1] == self.channels + x = F.interpolate(x, scale_factor=self.factor, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. 
+ """ + + def __init__(self, channels, use_conv, out_channels=None, factor=4, ksize=5, pad=2): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + + stride = factor + if use_conv: + self.op = nn.Conv1d( + self.channels, self.out_channels, ksize, stride=stride, padding=pad + ) + else: + assert self.channels == self.out_channels + self.op = nn.AvgPool1d(kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(nn.Module): + def __init__( + self, + channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + up=False, + down=False, + kernel_size=3, + ): + super().__init__() + self.channels = channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_scale_shift_norm = use_scale_shift_norm + padding = 1 if kernel_size == 3 else 2 + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False) + self.x_upd = Upsample(channels, False) + elif down: + self.h_upd = Downsample(channels, False) + self.x_upd = Downsample(channels, False) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = nn.Conv1d( + channels, self.out_channels, kernel_size, padding=padding + ) + else: + self.skip_connection = nn.Conv1d(channels, self.out_channels, 1) + + def forward(self, x): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AudioMiniEncoder(nn.Module): + def __init__(self, + spec_dim, + embedding_dim, + base_channels=128, + depth=2, + resnet_blocks=2, + attn_blocks=4, + num_attn_heads=4, + dropout=0, + downsample_factor=2, + kernel_size=3): + super().__init__() + self.init = nn.Sequential( + nn.Conv1d(spec_dim, base_channels, 3, padding=1) + ) + ch = base_channels + res = [] + for l in range(depth): + for r in range(resnet_blocks): + res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) + res.append(Downsample(ch, use_conv=True, out_channels=ch*2, factor=downsample_factor)) + ch *= 2 + self.res = nn.Sequential(*res) + self.final = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.Conv1d(ch, embedding_dim, 1) + ) + attn = [] + for a in range(attn_blocks): + attn.append(AttentionBlock(embedding_dim, num_attn_heads,)) + self.attn = nn.Sequential(*attn) + self.dim = embedding_dim + + def forward(self, x): + h = self.init(x) + h = self.res(h) + h = self.final(h) + h = self.attn(h) + return h[:, :, 0] + + +class TorchMelSpectrogram(nn.Module): + def __init__(self, filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, mel_fmin=0, mel_fmax=8000, + sampling_rate=22050, normalize=False, mel_norm_file='data/mel_norms.pth'): + super().__init__() + # These are the default tacotron values for the MEL spectrogram. 
+ self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.n_mel_channels = n_mel_channels + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.sampling_rate = sampling_rate + self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length, hop_length=self.hop_length, + win_length=self.win_length, power=2, normalized=normalize, + sample_rate=self.sampling_rate, f_min=self.mel_fmin, + f_max=self.mel_fmax, n_mels=self.n_mel_channels, + norm="slaney") + self.mel_norm_file = mel_norm_file + if self.mel_norm_file is not None: + self.mel_norms = torch.load(self.mel_norm_file) + else: + self.mel_norms = None + + def forward(self, inp): + if len(inp.shape) == 3: # Automatically squeeze out the channels dimension if it is present (assuming mono-audio) + inp = inp.squeeze(1) + assert len(inp.shape) == 2 + self.mel_stft = self.mel_stft.to(inp.device) + mel = self.mel_stft(inp) + # Perform dynamic range compression + mel = torch.log(torch.clamp(mel, min=1e-5)) + if self.mel_norms is not None: + self.mel_norms = self.mel_norms.to(mel.device) + mel = mel / self.mel_norms.unsqueeze(0).unsqueeze(-1) + return mel + + +class CheckpointedLayer(nn.Module): + """ + Wraps a module. When forward() is called, passes kwargs that require_grad through torch.checkpoint() and bypasses + checkpoint for all other args. + """ + def __init__(self, wrap): + super().__init__() + self.wrap = wrap + + def forward(self, x, *args, **kwargs): + for k, v in kwargs.items(): + assert not (isinstance(v, torch.Tensor) and v.requires_grad) # This would screw up checkpointing. + partial = functools.partial(self.wrap, **kwargs) + return torch.utils.checkpoint.checkpoint(partial, x, *args) + + +class CheckpointedXTransformerEncoder(nn.Module): + """ + Wraps a ContinuousTransformerWrapper and applies CheckpointedLayer to each layer and permutes from channels-mid + to channels-last that XTransformer expects. 
+ """ + def __init__(self, needs_permute=True, exit_permute=True, checkpoint=True, **xtransformer_kwargs): + super().__init__() + self.transformer = ContinuousTransformerWrapper(**xtransformer_kwargs) + self.needs_permute = needs_permute + self.exit_permute = exit_permute + + if not checkpoint: + return + for i in range(len(self.transformer.attn_layers.layers)): + n, b, r = self.transformer.attn_layers.layers[i] + self.transformer.attn_layers.layers[i] = nn.ModuleList([n, CheckpointedLayer(b), r]) + + def forward(self, x, **kwargs): + if self.needs_permute: + x = x.permute(0,2,1) + h = self.transformer(x, **kwargs) + if self.exit_permute: + h = h.permute(0,2,1) + return h \ No newline at end of file diff --git a/models/autoregressive.py b/models/autoregressive.py new file mode 100644 index 0000000000000000000000000000000000000000..6a91748d01ce35672554a8f39a0ca82fb562846b --- /dev/null +++ b/models/autoregressive.py @@ -0,0 +1,577 @@ +import functools + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList +from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions +from transformers.utils.model_parallel_utils import get_device_map, assert_device_map +from models.arch_util import AttentionBlock +from utils.typical_sampling import TypicalLogitsWarper + + +def null_position_embeddings(range, dim): + return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device) + + +class ResBlock(nn.Module): + """ + Basic residual convolutional block that uses GroupNorm. + """ + def __init__(self, chan): + super().__init__() + self.net = nn.Sequential( + nn.Conv1d(chan, chan, kernel_size=3, padding=1), + nn.GroupNorm(chan//8, chan), + nn.ReLU(), + nn.Conv1d(chan, chan, kernel_size=3, padding=1), + nn.GroupNorm(chan//8, chan) + ) + + def forward(self, x): + return F.relu(self.net(x) + x) + + +class GPT2InferenceModel(GPT2PreTrainedModel): + def __init__(self, config, gpt, text_pos_emb, embeddings, norm, linear): + super().__init__(config) + self.transformer = gpt + self.text_pos_embedding = text_pos_emb + self.embeddings = embeddings + self.lm_head = nn.Sequential(norm, linear) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.cached_mel_emb = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.model_parallel = True + + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def store_mel_emb(self, mel_emb): + self.cached_mel_emb = mel_emb + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids 
= kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + assert self.cached_mel_emb is not None + assert inputs_embeds is None # Not supported by this inference model. + assert labels is None # Training not supported by this inference model. + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Create embedding + mel_len = self.cached_mel_emb.shape[1] + if input_ids.shape[1] != 1: + text_inputs = input_ids[:, mel_len:] + text_emb = self.embeddings(text_inputs) + text_emb = text_emb + self.text_pos_embedding(text_emb) + if self.cached_mel_emb.shape[0] != text_emb.shape[0]: + mel_emb = self.cached_mel_emb.repeat_interleave(text_emb.shape[0]//self.cached_mel_emb.shape[0], 0) + else: + mel_emb = self.cached_mel_emb + emb = torch.cat([mel_emb, text_emb], dim=1) + else: + emb = self.embeddings(input_ids) + emb = emb + self.text_pos_embedding.get_fixed_embedding(attention_mask.shape[1]-mel_len, attention_mask.device) + + transformer_outputs = self.transformer( + inputs_embeds=emb, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + transformer_outputs[1:] + + return CausalLMOutputWithCrossAttentions( + loss=None, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past, beam_idx): + """ + This function is used to re-order the :obj:`past_key_values` cache if + :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is + called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step. 
+ """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +class ConditioningEncoder(nn.Module): + def __init__(self, + spec_dim, + embedding_dim, + attn_blocks=6, + num_attn_heads=4, + do_checkpointing=False, + mean=False): + super().__init__() + attn = [] + self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) + for a in range(attn_blocks): + attn.append(AttentionBlock(embedding_dim, num_attn_heads)) + self.attn = nn.Sequential(*attn) + self.dim = embedding_dim + self.do_checkpointing = do_checkpointing + self.mean = mean + + def forward(self, x): + h = self.init(x) + h = self.attn(h) + if self.mean: + return h.mean(dim=2) + else: + return h[:, :, 0] + + +class LearnedPositionEmbeddings(nn.Module): + def __init__(self, seq_len, model_dim, init=.02): + super().__init__() + self.emb = nn.Embedding(seq_len, model_dim) + # Initializing this way is standard for GPT-2 + self.emb.weight.data.normal_(mean=0.0, std=init) + + def forward(self, x): + sl = x.shape[1] + return self.emb(torch.arange(0, sl, device=x.device)) + + def get_fixed_embedding(self, ind, dev): + return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) + + +def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing): + """ + GPT-2 implemented by the HuggingFace library. + """ + from transformers import GPT2Config, GPT2Model + gpt_config = GPT2Config(vocab_size=256, # Unused. + n_positions=max_mel_seq_len+max_text_seq_len, + n_ctx=max_mel_seq_len+max_text_seq_len, + n_embd=model_dim, + n_layer=layers, + n_head=heads, + gradient_checkpointing=checkpointing, + use_cache=not checkpointing) + gpt = GPT2Model(gpt_config) + # Override the built in positional embeddings + del gpt.wpe + gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) + # Built-in token embeddings are unused. + del gpt.wte + return gpt, LearnedPositionEmbeddings(max_mel_seq_len, model_dim), LearnedPositionEmbeddings(max_text_seq_len, model_dim),\ + None, None + + +class MelEncoder(nn.Module): + def __init__(self, channels, mel_channels=80, resblocks_per_reduction=2): + super().__init__() + self.channels = channels + self.encoder = nn.Sequential(nn.Conv1d(mel_channels, channels//4, kernel_size=3, padding=1), + nn.Sequential(*[ResBlock(channels//4) for _ in range(resblocks_per_reduction)]), + nn.Conv1d(channels//4, channels//2, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(channels//16, channels//2), + nn.ReLU(), + nn.Sequential(*[ResBlock(channels//2) for _ in range(resblocks_per_reduction)]), + nn.Conv1d(channels//2, channels, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(channels//8, channels), + nn.ReLU(), + nn.Sequential(*[ResBlock(channels) for _ in range(resblocks_per_reduction)]), + ) + self.reduction = 4 + + + def forward(self, x): + for e in self.encoder: + x = e(x) + return x.permute(0,2,1) + + +class UnifiedVoice(nn.Module): + def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_mel_tokens=250, max_conditioning_inputs=1, + mel_length_compression=1024, number_text_tokens=256, + start_text_token=None, number_mel_codes=8194, start_mel_token=8192, + stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True, + checkpointing=True, average_conditioning_embeddings=False, + types=1): + """ + Args: + layers: Number of layers in transformer stack. + model_dim: Operating dimensions of the transformer + heads: Number of transformer heads. 
model_dim must be divisible by heads; model_dim//64 is recommended. + max_text_tokens: Maximum number of text tokens that will be encountered by model. + max_mel_tokens: Maximum number of MEL tokens that will be encountered by model. + max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s). + mel_length_compression: The ratio between the number of raw audio samples and the number of MEL codes. Used to compute MEL code padding given wav input length. + number_text_tokens: + start_text_token: + stop_text_token: + number_mel_codes: + start_mel_token: + stop_mel_token: + train_solo_embeddings: + use_mel_codes_as_input: + checkpointing: + average_conditioning_embeddings: Whether or not conditioning embeddings should be averaged, instead of fed piecewise into the model. + """ + super().__init__() + + self.number_text_tokens = number_text_tokens + self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token + self.stop_text_token = 0 + self.number_mel_codes = number_mel_codes + self.start_mel_token = start_mel_token + self.stop_mel_token = stop_mel_token + self.layers = layers + self.heads = heads + self.max_mel_tokens = max_mel_tokens + self.max_text_tokens = max_text_tokens + self.model_dim = model_dim + self.max_conditioning_inputs = max_conditioning_inputs + self.mel_length_compression = mel_length_compression + self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads) + self.average_conditioning_embeddings = average_conditioning_embeddings + self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim) + if use_mel_codes_as_input: + self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim) + else: + self.mel_embedding = MelEncoder(model_dim, resblocks_per_reduction=1) + self.gpt, self.mel_pos_embedding, self.text_pos_embedding, self.mel_layer_pos_embedding, self.text_layer_pos_embedding = \ + build_hf_gpt_transformer(layers, model_dim, heads, self.max_mel_tokens+2+self.max_conditioning_inputs, self.max_text_tokens+2, checkpointing) + if train_solo_embeddings: + self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True) + self.text_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True) + else: + self.mel_solo_embedding = 0 + self.text_solo_embedding = 0 + + self.final_norm = nn.LayerNorm(model_dim) + self.text_head = nn.Linear(model_dim, self.number_text_tokens*types+1) + self.mel_head = nn.Linear(model_dim, self.number_mel_codes) + + # Initialize the embeddings per the GPT-2 scheme + embeddings = [self.text_embedding] + if use_mel_codes_as_input: + embeddings.append(self.mel_embedding) + for module in embeddings: + module.weight.data.normal_(mean=0.0, std=.02) + + def build_aligned_inputs_and_targets(self, input, start_token, stop_token): + inp = F.pad(input, (1,0), value=start_token) + tar = F.pad(input, (0,1), value=stop_token) + return inp, tar + + def set_mel_padding(self, mel_input_tokens, wav_lengths): + """ + Given mel tokens that are derived from a padded audio clip and the actual lengths of each batch element in + that audio clip, reformats the tokens with STOP_MEL_TOKEN in place of the zero padding. This is required + preformatting to create a working TTS model. + """ + # Set padding areas within MEL (currently the padding is coded with the MEL code produced for zero-valued audio).
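# Illustrative numeric sketch (not part of this diff) of the padding logic implemented just
# below: wav lengths are converted to mel-token lengths by integer division with
# mel_length_compression, and everything past each clip's end, apart from one extra token the
# model is expected to predict, is overwritten with stop_mel_token. All values here are made up.
import torch

mel_length_compression, stop_mel_token = 1024, 8193
mel_codes = torch.zeros(2, 10, dtype=torch.long)            # fake (batch, mel_tokens)
wav_lengths = torch.tensor([4096, 10240])                   # ~4 and ~10 codes worth of audio
mel_lens = torch.div(wav_lengths, mel_length_compression, rounding_mode='trunc')
for b in range(len(mel_lens)):
    actual_end = int(mel_lens[b]) + 1                       # keep one code past the true end
    if actual_end < mel_codes.shape[-1]:
        mel_codes[b, actual_end:] = stop_mel_token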
+ mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode='trunc') + for b in range(len(mel_lengths)): + actual_end = mel_lengths[b] + 1 # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token. + if actual_end < mel_input_tokens.shape[-1]: + mel_input_tokens[b, actual_end:] = self.stop_mel_token + return mel_input_tokens + + def get_logits(self, speech_conditioning_inputs, first_inputs, first_head, second_inputs=None, second_head=None, get_attns=False, return_latent=False): + if second_inputs is not None: + emb = torch.cat([speech_conditioning_inputs, first_inputs, second_inputs], dim=1) + else: + emb = torch.cat([speech_conditioning_inputs, first_inputs], dim=1) + + gpt_out = self.gpt(inputs_embeds=emb, return_dict=True, output_attentions=get_attns) + if get_attns: + return gpt_out.attentions + + enc = gpt_out.last_hidden_state[:, 1:] # The first logit is tied to the speech_conditioning_input + enc = self.final_norm(enc) + + if return_latent: + return enc[:, speech_conditioning_inputs.shape[1]:speech_conditioning_inputs.shape[1]+first_inputs.shape[1]], enc[:, -second_inputs.shape[1]:] + + first_logits = enc[:, :first_inputs.shape[1]] + first_logits = first_head(first_logits) + first_logits = first_logits.permute(0,2,1) + if second_inputs is not None: + second_logits = enc[:, -second_inputs.shape[1]:] + second_logits = second_head(second_logits) + second_logits = second_logits.permute(0,2,1) + return first_logits, second_logits + else: + return first_logits + + def forward(self, speech_conditioning_input, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False, + return_latent=False, clip_inputs=True): + """ + Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode + (actuated by `text_first`). + + speech_conditioning_input: MEL float tensor, (b,80,s) + text_inputs: long tensor, (b,t) + text_lengths: long tensor, (b,) + mel_inputs: long tensor, (b,m) + wav_lengths: long tensor, (b,) + raw_mels: MEL float tensor (b,80,s) + + If return_attentions is specified, only logits are returned. + If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned. + If clip_inputs is True, the inputs will be clipped to the smallest input size across each input modality. + """ + # Types are expressed by expanding the text embedding space. + if types is not None: + text_inputs = text_inputs * (1+types).unsqueeze(-1) + + if clip_inputs: + # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by + # chopping the inputs by the maximum actual length. 
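# Illustrative sketch (not part of this diff) of the clipping done below: a heavily padded
# micro-batch is trimmed to its longest real sequence before any embedding work happens.
# The tensors and lengths are stand-ins.
import torch

text_inputs = torch.randint(0, 256, (2, 120))       # padded out to the global maximum
text_lengths = torch.tensor([32, 57])
text_inputs = text_inputs[:, :text_lengths.max()]   # -> (2, 57); padding beyond 57 is dropped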
+ max_text_len = text_lengths.max() + text_inputs = text_inputs[:, :max_text_len] + max_mel_len = wav_lengths.max() // self.mel_length_compression + mel_codes = mel_codes[:, :max_mel_len] + if raw_mels is not None: + raw_mels = raw_mels[:, :, :max_mel_len*4] + mel_codes = self.set_mel_padding(mel_codes, wav_lengths) + text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token) + mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token) + + speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds = torch.stack(conds, dim=1) + if self.average_conditioning_embeddings: + conds = conds.mean(dim=1).unsqueeze(1) + + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token) + if raw_mels is not None: + mel_inp = F.pad(raw_mels, (0, 8)) + else: + mel_inp = mel_codes + mel_emb = self.mel_embedding(mel_inp) + mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + + if text_first: + text_logits, mel_logits = self.get_logits(conds, text_emb, self.text_head, mel_emb, self.mel_head, get_attns=return_attentions, return_latent=return_latent) + if return_latent: + return mel_logits[:, :-2] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass. + else: + mel_logits, text_logits = self.get_logits(conds, mel_emb, self.mel_head, text_emb, self.text_head, get_attns=return_attentions, return_latent=return_latent) + if return_latent: + return text_logits[:, :-2] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass. + + if return_attentions: + return mel_logits + loss_text = F.cross_entropy(text_logits, text_targets.long()) + loss_mel = F.cross_entropy(mel_logits, mel_targets.long()) + return loss_text.mean(), loss_mel.mean(), mel_logits + + def text_forward(self, speech_conditioning_input, text_inputs, text_lengths): + """ + Performs autoregressive modeling on only text. Still requires a speech_conditioning_input due to the way the + model inputs are formatted. Just provide any audio clip (arguably, zeros could be provided). + """ + assert self.max_text_tokens >= text_inputs.shape[1], f'{text_inputs.shape[1]}' + + # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by + # chopping the inputs by the maximum actual length. 
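# Illustrative sketch (not part of this diff) of build_aligned_inputs_and_targets, which is
# applied a few lines below: the model input gets a start token prepended and the target a
# stop token appended, so position i of the input predicts position i of the target.
# Token values here are made up.
import torch
import torch.nn.functional as F

tokens = torch.tensor([[5, 6, 7]])
start_token, stop_token = 255, 0
inp = F.pad(tokens, (1, 0), value=start_token)    # [[255, 5, 6, 7]]
tar = F.pad(tokens, (0, 1), value=stop_token)     # [[5, 6, 7, 0]]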
+ max_text_len = text_lengths.max() + text_inputs = F.pad(text_inputs[:, :max_text_len], (0,1), value=self.stop_text_token) + + speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds = torch.stack(conds, dim=1) + if self.average_conditioning_embeddings: + conds = conds.mean(dim=1).unsqueeze(1) + + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + self.text_solo_embedding + text_logits = self.get_logits(conds, text_emb, self.text_head) + loss_text = F.cross_entropy(text_logits, text_targets.long()) + return loss_text.mean() + + def speech_forward(self, speech_conditioning_input, mel_codes, wav_lengths, raw_mels=None): + """ + Performs autoregressive modeling on only speech data. + """ + assert self.max_mel_tokens >= mel_codes.shape[1], f'{mel_codes.shape[1]}' + + # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by + # chopping the inputs by the maximum actual length. + max_mel_len = wav_lengths.max() // self.mel_length_compression + mel_codes = F.pad(mel_codes[:, :max_mel_len], (0,1), value=self.stop_mel_token) + mel_codes = self.set_mel_padding(mel_codes, wav_lengths) + if raw_mels is not None: + raw_mels = raw_mels[:, :, :max_mel_len*4] + + speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds = torch.stack(conds, dim=1) + if self.average_conditioning_embeddings: + conds = conds.mean(dim=1).unsqueeze(1) + + mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token) + if raw_mels is not None: + mel_inp = F.pad(raw_mels, (0, 4)) + else: + mel_inp = mel_codes + mel_emb = self.mel_embedding(mel_inp) + mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + self.mel_solo_embedding + mel_logits = self.get_logits(conds, mel_emb, self.mel_head) + loss_mel = F.cross_entropy(mel_logits, mel_targets.long()) + return loss_mel.mean() + + def inference_speech(self, speech_conditioning_input, text_inputs, input_tokens=None, num_return_sequences=1, + max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs): + seq_length = self.max_mel_tokens + self.max_text_tokens + 2 + if not hasattr(self, 'inference_model'): + # TODO: Decouple gpt_config from this inference model. 
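# Hypothetical usage sketch (not part of this diff; the sampling settings and tensor shapes
# are illustrative): the GPT2Config built just below wraps the already-trained transformer so
# that HuggingFace's generate() can sample mel codes autoregressively. Extra keyword arguments
# passed to inference_speech are forwarded to generate().
import torch

model = UnifiedVoice(model_dim=256, heads=4)                # untrained toy instance
cond_mel = torch.randn(1, 80, 800)                          # conditioning spectrogram, (b, 80, s)
text_tokens = torch.randint(0, 255, (1, 40))                # tokenized text
codes = model.inference_speech(cond_mel, text_tokens, do_sample=True, top_p=0.8,
                               temperature=0.8, max_generate_length=200)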
+ gpt_config = GPT2Config(vocab_size=self.max_mel_tokens, + n_positions=seq_length, + n_ctx=seq_length, + n_embd=self.model_dim, + n_layer=self.layers, + n_head=self.heads, + gradient_checkpointing=False, + use_cache=True) + self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head) + self.gpt.wte = self.mel_embedding + + text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token) + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + + speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds = torch.stack(conds, dim=1) + if self.average_conditioning_embeddings: + conds = conds.mean(dim=1).unsqueeze(1) + + emb = torch.cat([conds, text_emb], dim=1) + self.inference_model.store_mel_emb(emb) + + fake_inputs = torch.full((emb.shape[0], conds.shape[1] + emb.shape[1],), fill_value=1, dtype=torch.long, + device=text_inputs.device) + fake_inputs[:, -1] = self.start_mel_token + trunc_index = fake_inputs.shape[1] + if input_tokens is None: + inputs = fake_inputs + else: + assert num_return_sequences % input_tokens.shape[0] == 0, "The number of return sequences must be divisible by the number of input sequences" + fake_inputs = fake_inputs.repeat(num_return_sequences, 1) + input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) + inputs = torch.cat([fake_inputs, input_tokens], dim=1) + + logits_processor = LogitsProcessorList([TypicalLogitsWarper(mass=typical_mass)]) if typical_sampling else LogitsProcessorList() + max_length = trunc_index + self.max_mel_tokens - 1 if max_generate_length is None else trunc_index + max_generate_length + gen = self.inference_model.generate(inputs, bos_token_id=self.start_mel_token, pad_token_id=self.stop_mel_token, eos_token_id=self.stop_mel_token, + max_length=max_length, logits_processor=logits_processor, + num_return_sequences=num_return_sequences, **hf_generate_kwargs) + return gen[:, trunc_index:] + + +if __name__ == '__main__': + gpt = UnifiedVoice(model_dim=256, heads=4, train_solo_embeddings=True, use_mel_codes_as_input=True, max_conditioning_inputs=4) + l = gpt(torch.randn(2, 3, 80, 800), + torch.randint(high=120, size=(2,120)), + torch.tensor([32, 120]), + torch.randint(high=8192, size=(2,250)), + torch.tensor([250*256,195*256])) + gpt.text_forward(torch.randn(2,80,800), torch.randint(high=50, size=(2,80)), torch.tensor([32, 80])) diff --git a/models/classifier.py b/models/classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..c8997738a2b689cb4bd744323339e5e8b46035ae --- /dev/null +++ b/models/classifier.py @@ -0,0 +1,158 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint + +from models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock + + +class ResBlock(nn.Module): + def __init__( + self, + channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + up=False, + down=False, + kernel_size=3, + do_checkpoint=True, + ): + super().__init__() + self.channels = channels + self.dropout = 
dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_scale_shift_norm = use_scale_shift_norm + self.do_checkpoint = do_checkpoint + padding = 1 if kernel_size == 3 else 2 + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = nn.Conv1d( + dims, channels, self.out_channels, kernel_size, padding=padding + ) + else: + self.skip_connection = nn.Conv1d(dims, channels, self.out_channels, 1) + + def forward(self, x): + if self.do_checkpoint: + return checkpoint( + self._forward, x + ) + else: + return self._forward(x) + + def _forward(self, x): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AudioMiniEncoder(nn.Module): + def __init__(self, + spec_dim, + embedding_dim, + base_channels=128, + depth=2, + resnet_blocks=2, + attn_blocks=4, + num_attn_heads=4, + dropout=0, + downsample_factor=2, + kernel_size=3): + super().__init__() + self.init = nn.Sequential( + nn.Conv1d(spec_dim, base_channels, 3, padding=1) + ) + ch = base_channels + res = [] + self.layers = depth + for l in range(depth): + for r in range(resnet_blocks): + res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)) + res.append(Downsample(ch, use_conv=True, out_channels=ch*2, factor=downsample_factor)) + ch *= 2 + self.res = nn.Sequential(*res) + self.final = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.Conv1d(ch, embedding_dim, 1) + ) + attn = [] + for a in range(attn_blocks): + attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) + self.attn = nn.Sequential(*attn) + self.dim = embedding_dim + + def forward(self, x): + h = self.init(x) + h = self.res(h) + h = self.final(h) + for blk in self.attn: + h = checkpoint(blk, h) + return h[:, :, 0] + + +class AudioMiniEncoderWithClassifierHead(nn.Module): + def __init__(self, classes, distribute_zero_label=True, **kwargs): + super().__init__() + self.enc = AudioMiniEncoder(**kwargs) + self.head = nn.Linear(self.enc.dim, classes) + self.num_classes = classes + self.distribute_zero_label = distribute_zero_label + + def forward(self, x, labels=None): + h = self.enc(x) + logits = self.head(h) + if labels is None: + return logits + else: + if self.distribute_zero_label: + oh_labels = nn.functional.one_hot(labels, num_classes=self.num_classes) + zeros_indices = (labels == 0).unsqueeze(-1) + # Distribute 20% of the probability mass on all classes when zero is specified, to compensate for dataset noise. 
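# Illustrative numeric sketch (not part of this diff) of the soft-label construction below:
# with three classes, a label of 0 becomes [0.8, 0.1, 0.1] while non-zero labels stay one-hot.
import torch
import torch.nn as nn

num_classes = 3
labels = torch.tensor([0, 2])
oh = nn.functional.one_hot(labels, num_classes=num_classes).float()
extra = torch.full_like(oh, fill_value=0.2 / (num_classes - 1))
extra[:, 0] = -0.2
extra = extra * (labels == 0).unsqueeze(-1)
soft = oh + extra        # -> [[0.8, 0.1, 0.1], [0.0, 0.0, 1.0]]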
+ zero_extra_mass = torch.full_like(oh_labels, dtype=torch.float, fill_value=.2/(self.num_classes-1)) + zero_extra_mass[:, 0] = -.2 + zero_extra_mass = zero_extra_mass * zeros_indices + oh_labels = oh_labels + zero_extra_mass + else: + oh_labels = labels + loss = nn.functional.cross_entropy(logits, oh_labels) + return loss diff --git a/models/clvp.py b/models/clvp.py new file mode 100644 index 0000000000000000000000000000000000000000..1eec06a4004a7d02e2b327f20646324d5e21e53e --- /dev/null +++ b/models/clvp.py @@ -0,0 +1,155 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import einsum + +from models.arch_util import CheckpointedXTransformerEncoder +from models.transformer import Transformer +from models.xtransformers import Encoder + + +def exists(val): + return val is not None + + +def masked_mean(t, mask, dim = 1): + t = t.masked_fill(~mask[:, :, None], 0.) + return t.sum(dim = 1) / mask.sum(dim = 1)[..., None] + +class CLVP(nn.Module): + """ + CLIP model retrofitted for performing contrastive evaluation between tokenized audio data and the corresponding + transcribed text. + + Originally from https://github.com/lucidrains/DALLE-pytorch/blob/main/dalle_pytorch/dalle_pytorch.py + """ + + def __init__( + self, + *, + dim_text=512, + dim_speech=512, + dim_latent=512, + num_text_tokens=256, + text_enc_depth=6, + text_seq_len=120, + text_heads=8, + num_speech_tokens=8192, + speech_enc_depth=6, + speech_heads=8, + speech_seq_len=250, + text_mask_percentage=0, + voice_mask_percentage=0, + wav_token_compression=1024, + use_xformers=False, + ): + super().__init__() + self.text_emb = nn.Embedding(num_text_tokens, dim_text) + self.to_text_latent = nn.Linear(dim_text, dim_latent, bias=False) + + self.speech_emb = nn.Embedding(num_speech_tokens, dim_speech) + self.to_speech_latent = nn.Linear(dim_speech, dim_latent, bias=False) + + if use_xformers: + self.text_transformer = CheckpointedXTransformerEncoder( + needs_permute=False, + exit_permute=False, + max_seq_len=-1, + attn_layers=Encoder( + dim=dim_text, + depth=text_enc_depth, + heads=text_heads, + ff_dropout=.1, + ff_mult=2, + attn_dropout=.1, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + )) + self.speech_transformer = CheckpointedXTransformerEncoder( + needs_permute=False, + exit_permute=False, + max_seq_len=-1, + attn_layers=Encoder( + dim=dim_speech, + depth=speech_enc_depth, + heads=speech_heads, + ff_dropout=.1, + ff_mult=2, + attn_dropout=.1, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + )) + else: + self.text_transformer = Transformer(causal=False, seq_len=text_seq_len, dim=dim_text, depth=text_enc_depth, + heads=text_heads) + self.speech_transformer = Transformer(causal=False, seq_len=speech_seq_len, dim=dim_speech, + depth=speech_enc_depth, heads=speech_heads) + + self.temperature = nn.Parameter(torch.tensor(1.)) + self.text_mask_percentage = text_mask_percentage + self.voice_mask_percentage = voice_mask_percentage + self.wav_token_compression = wav_token_compression + self.xformers = use_xformers + if not use_xformers: + self.text_pos_emb = nn.Embedding(text_seq_len, dim_text) + self.speech_pos_emb = nn.Embedding(num_speech_tokens, dim_speech) + + def forward( + self, + text, + speech_tokens, + return_loss=False + ): + b, device = text.shape[0], text.device + if self.training: + text_mask = torch.rand_like(text.float()) > self.text_mask_percentage + voice_mask = torch.rand_like(speech_tokens.float()) > self.voice_mask_percentage + else: + text_mask = 
torch.ones_like(text.float()).bool() + voice_mask = torch.ones_like(speech_tokens.float()).bool() + + text_emb = self.text_emb(text) + speech_emb = self.speech_emb(speech_tokens) + + if not self.xformers: + text_emb += self.text_pos_emb(torch.arange(text.shape[1], device=device)) + speech_emb += self.speech_pos_emb(torch.arange(speech_emb.shape[1], device=device)) + + enc_text = self.text_transformer(text_emb, mask=text_mask) + enc_speech = self.speech_transformer(speech_emb, mask=voice_mask) + + text_latents = masked_mean(enc_text, text_mask, dim=1) + speech_latents = masked_mean(enc_speech, voice_mask, dim=1) + + text_latents = self.to_text_latent(text_latents) + speech_latents = self.to_speech_latent(speech_latents) + + text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents)) + + temp = self.temperature.exp() + + if not return_loss: + sim = einsum('n d, n d -> n', text_latents, speech_latents) * temp + return sim + + sim = einsum('i d, j d -> i j', text_latents, speech_latents) * temp + labels = torch.arange(b, device=device) + loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2 + return loss + + +if __name__ == '__main__': + clip = CLVP(text_mask_percentage=.2, voice_mask_percentage=.2) + clip(torch.randint(0,256,(2,120)), + torch.tensor([50,100]), + torch.randint(0,8192,(2,250)), + torch.tensor([101,102]), + return_loss=True) + nonloss = clip(torch.randint(0,256,(2,120)), + torch.tensor([50,100]), + torch.randint(0,8192,(2,250)), + torch.tensor([101,102]), + return_loss=False) + print(nonloss.shape) \ No newline at end of file diff --git a/models/cvvp.py b/models/cvvp.py new file mode 100644 index 0000000000000000000000000000000000000000..0c9fd3500b38c126667b16bffd56f32ff89271a9 --- /dev/null +++ b/models/cvvp.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import einsum +from torch.utils.checkpoint import checkpoint + +from models.arch_util import AttentionBlock +from models.xtransformers import ContinuousTransformerWrapper, Encoder + + +def exists(val): + return val is not None + + +def masked_mean(t, mask): + t = t.masked_fill(~mask, 0.) 
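+ # Per-channel mean over the time dimension, normalized by the number of unmasked positions.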
+ return t.sum(dim = 1) / mask.sum(dim = 1) + + +class CollapsingTransformer(nn.Module): + def __init__(self, model_dim, output_dims, heads, dropout, depth, mask_percentage=0, **encoder_kwargs): + super().__init__() + self.transformer = ContinuousTransformerWrapper( + max_seq_len=-1, + use_pos_emb=False, + attn_layers=Encoder( + dim=model_dim, + depth=depth, + heads=heads, + ff_dropout=dropout, + ff_mult=1, + attn_dropout=dropout, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + **encoder_kwargs, + )) + self.pre_combiner = nn.Sequential(nn.Conv1d(model_dim, output_dims, 1), + AttentionBlock(output_dims, num_heads=heads, do_checkpoint=False), + nn.Conv1d(output_dims, output_dims, 1)) + self.mask_percentage = mask_percentage + + def forward(self, x, **transformer_kwargs): + h = self.transformer(x, **transformer_kwargs) + h = h.permute(0,2,1) + h = checkpoint(self.pre_combiner, h).permute(0,2,1) + if self.training: + mask = torch.rand_like(h.float()) > self.mask_percentage + else: + mask = torch.ones_like(h.float()).bool() + return masked_mean(h, mask) + + +class ConvFormatEmbedding(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self.emb = nn.Embedding(*args, **kwargs) + + def forward(self, x): + y = self.emb(x) + return y.permute(0,2,1) + + +class CVVP(nn.Module): + def __init__( + self, + model_dim=512, + transformer_heads=8, + dropout=.1, + conditioning_enc_depth=8, + cond_mask_percentage=0, + mel_channels=80, + mel_codes=None, + speech_enc_depth=8, + speech_mask_percentage=0, + latent_multiplier=1, + ): + super().__init__() + latent_dim = latent_multiplier*model_dim + self.temperature = nn.Parameter(torch.tensor(1.)) + + self.cond_emb = nn.Sequential(nn.Conv1d(mel_channels, model_dim//2, kernel_size=5, stride=2, padding=2), + nn.Conv1d(model_dim//2, model_dim, kernel_size=3, stride=2, padding=1)) + self.conditioning_transformer = CollapsingTransformer(model_dim, model_dim, transformer_heads, dropout, conditioning_enc_depth, cond_mask_percentage) + self.to_conditioning_latent = nn.Linear(latent_dim, latent_dim, bias=False) + + if mel_codes is None: + self.speech_emb = nn.Conv1d(mel_channels, model_dim, kernel_size=5, padding=2) + else: + self.speech_emb = ConvFormatEmbedding(mel_codes, model_dim) + self.speech_transformer = CollapsingTransformer(model_dim, latent_dim, transformer_heads, dropout, speech_enc_depth, speech_mask_percentage) + self.to_speech_latent = nn.Linear(latent_dim, latent_dim, bias=False) + + def get_grad_norm_parameter_groups(self): + return { + 'conditioning': list(self.conditioning_transformer.parameters()), + 'speech': list(self.speech_transformer.parameters()), + } + + def forward( + self, + mel_cond, + mel_input, + return_loss=False + ): + cond_emb = self.cond_emb(mel_cond).permute(0,2,1) + enc_cond = self.conditioning_transformer(cond_emb) + cond_latents = self.to_conditioning_latent(enc_cond) + + speech_emb = self.speech_emb(mel_input).permute(0,2,1) + enc_speech = self.speech_transformer(speech_emb) + speech_latents = self.to_speech_latent(enc_speech) + + + cond_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (cond_latents, speech_latents)) + temp = self.temperature.exp() + + if not return_loss: + sim = einsum('n d, n d -> n', cond_latents, speech_latents) * temp + return sim + + sim = einsum('i d, j d -> i j', cond_latents, speech_latents) * temp + labels = torch.arange(cond_latents.shape[0], device=mel_input.device) + loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2 + + 
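# Symmetric CLIP-style contrastive loss: cross-entropy over both axes of the similarity matrix, so each conditioning clip is matched to its own mel input and vice versa. +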
return loss + + +if __name__ == '__main__': + clvp = CVVP() + clvp(torch.randn(2,80,100), + torch.randn(2,80,95), + return_loss=True) \ No newline at end of file diff --git a/models/diffusion_decoder.py b/models/diffusion_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5fdf7ad86e696e70323ddfd60ca5c5f2ef2a8c06 --- /dev/null +++ b/models/diffusion_decoder.py @@ -0,0 +1,331 @@ +import math +import random +from abc import abstractmethod + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import autocast + +from models.arch_util import normalization, AttentionBlock + + +def is_latent(t): + return t.dtype == torch.float + + +def is_sequence(t): + return t.dtype == torch.long + + +def timestep_embedding(timesteps, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half + ).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +class TimestepBlock(nn.Module): + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + def forward(self, x, emb): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + else: + x = layer(x) + return x + + +class ResBlock(TimestepBlock): + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + dims=2, + kernel_size=3, + efficient_config=True, + use_scale_shift_norm=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_scale_shift_norm = use_scale_shift_norm + padding = {1: 0, 3: 1, 5: 2}[kernel_size] + eff_kernel = 1 if efficient_config else 3 + eff_padding = 0 if efficient_config else 1 + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding), + ) + + self.emb_layers = nn.Sequential( + nn.SiLU(), + nn.Linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + else: + self.skip_connection = nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding) + + def forward(self, x, emb): + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = torch.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return 
self.skip_connection(x) + h + + +class DiffusionLayer(TimestepBlock): + def __init__(self, model_channels, dropout, num_heads): + super().__init__() + self.resblk = ResBlock(model_channels, model_channels, dropout, model_channels, dims=1, use_scale_shift_norm=True) + self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True) + + def forward(self, x, time_emb): + y = self.resblk(x, time_emb) + return self.attn(y) + + +class DiffusionTts(nn.Module): + def __init__( + self, + model_channels=512, + num_layers=8, + in_channels=100, + in_latent_channels=512, + in_tokens=8193, + out_channels=200, # mean and variance + dropout=0, + use_fp16=False, + num_heads=16, + # Parameters for regularization. + layer_drop=.1, + unconditioned_percentage=.1, # This implements a mechanism similar to what is used in classifier-free training. + ): + super().__init__() + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.dropout = dropout + self.num_heads = num_heads + self.unconditioned_percentage = unconditioned_percentage + self.enable_fp16 = use_fp16 + self.layer_drop = layer_drop + + self.inp_block = nn.Conv1d(in_channels, model_channels, 3, 1, 1) + self.time_embed = nn.Sequential( + nn.Linear(model_channels, model_channels), + nn.SiLU(), + nn.Linear(model_channels, model_channels), + ) + + # Either code_converter or latent_converter is used, depending on what type of conditioning data is fed. + # This model is meant to be able to be trained on both for efficiency purposes - it is far less computationally + # complex to generate tokens, while generating latents will normally mean propagating through a deep autoregressive + # transformer network. + self.code_embedding = nn.Embedding(in_tokens, model_channels) + self.code_converter = nn.Sequential( + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + ) + self.code_norm = normalization(model_channels) + self.latent_conditioner = nn.Sequential( + nn.Conv1d(in_latent_channels, model_channels, 3, padding=1), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + ) + self.contextual_embedder = nn.Sequential(nn.Conv1d(in_channels,model_channels,3,padding=1,stride=2), + nn.Conv1d(model_channels, model_channels*2,3,padding=1,stride=2), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False)) + self.unconditioned_embedding = nn.Parameter(torch.randn(1,model_channels,1)) + self.conditioning_timestep_integrator = TimestepEmbedSequential( + DiffusionLayer(model_channels, dropout, num_heads), + DiffusionLayer(model_channels, dropout, num_heads), + DiffusionLayer(model_channels, dropout, num_heads), + ) + + self.integrating_conv = 
nn.Conv1d(model_channels*2, model_channels, kernel_size=1) + self.mel_head = nn.Conv1d(model_channels, in_channels, kernel_size=3, padding=1) + + self.layers = nn.ModuleList([DiffusionLayer(model_channels, dropout, num_heads) for _ in range(num_layers)] + + [ResBlock(model_channels, model_channels, dropout, dims=1, use_scale_shift_norm=True) for _ in range(3)]) + + self.out = nn.Sequential( + normalization(model_channels), + nn.SiLU(), + nn.Conv1d(model_channels, out_channels, 3, padding=1), + ) + + def get_grad_norm_parameter_groups(self): + groups = { + 'minicoder': list(self.contextual_embedder.parameters()), + 'layers': list(self.layers.parameters()), + 'code_converters': list(self.code_embedding.parameters()) + list(self.code_converter.parameters()) + list(self.latent_conditioner.parameters()) + list(self.latent_conditioner.parameters()), + 'timestep_integrator': list(self.conditioning_timestep_integrator.parameters()) + list(self.integrating_conv.parameters()), + 'time_embed': list(self.time_embed.parameters()), + } + return groups + + def timestep_independent(self, aligned_conditioning, conditioning_input, expected_seq_len, return_code_pred): + # Shuffle aligned_latent to BxCxS format + if is_latent(aligned_conditioning): + aligned_conditioning = aligned_conditioning.permute(0, 2, 1) + + # Note: this block does not need to repeated on inference, since it is not timestep-dependent or x-dependent. + speech_conditioning_input = conditioning_input.unsqueeze(1) if len( + conditioning_input.shape) == 3 else conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.contextual_embedder(speech_conditioning_input[:, j])) + conds = torch.cat(conds, dim=-1) + cond_emb = conds.mean(dim=-1) + cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1) + if is_latent(aligned_conditioning): + code_emb = self.latent_conditioner(aligned_conditioning) + else: + code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1) + code_emb = self.code_converter(code_emb) + code_emb = self.code_norm(code_emb) * (1 + cond_scale.unsqueeze(-1)) + cond_shift.unsqueeze(-1) + + unconditioned_batches = torch.zeros((code_emb.shape[0], 1, 1), device=code_emb.device) + # Mask out the conditioning branch for whole batch elements, implementing something similar to classifier-free guidance. + if self.training and self.unconditioned_percentage > 0: + unconditioned_batches = torch.rand((code_emb.shape[0], 1, 1), + device=code_emb.device) < self.unconditioned_percentage + code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(aligned_conditioning.shape[0], 1, 1), + code_emb) + expanded_code_emb = F.interpolate(code_emb, size=expected_seq_len, mode='nearest') + + if not return_code_pred: + return expanded_code_emb + else: + mel_pred = self.mel_head(expanded_code_emb) + # Multiply mel_pred by !unconditioned_branches, which drops the gradient on unconditioned branches. This is because we don't want that gradient being used to train parameters through the codes_embedder as it unbalances contributions to that network from the MSE loss. + mel_pred = mel_pred * unconditioned_batches.logical_not() + return expanded_code_emb, mel_pred + + def forward(self, x, timesteps, aligned_conditioning=None, conditioning_input=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. 
+ :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced. + :param conditioning_input: a full-resolution audio clip that is used as a reference to the style you want decoded. + :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent() + :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered. + :return: an [N x C x ...] Tensor of outputs. + """ + assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_input is not None) + assert not (return_code_pred and precomputed_aligned_embeddings is not None) # These two are mutually exclusive. + + unused_params = [] + if conditioning_free: + code_emb = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1]) + unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters())) + unused_params.extend(list(self.latent_conditioner.parameters())) + else: + if precomputed_aligned_embeddings is not None: + code_emb = precomputed_aligned_embeddings + else: + code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, x.shape[-1], True) + if is_latent(aligned_conditioning): + unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters())) + else: + unused_params.extend(list(self.latent_conditioner.parameters())) + + unused_params.append(self.unconditioned_embedding) + + time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + code_emb = self.conditioning_timestep_integrator(code_emb, time_emb) + x = self.inp_block(x) + x = torch.cat([x, code_emb], dim=1) + x = self.integrating_conv(x) + for i, lyr in enumerate(self.layers): + # Do layer drop where applicable. Do not drop first and last layers. + if self.training and self.layer_drop > 0 and i != 0 and i != (len(self.layers)-1) and random.random() < self.layer_drop: + unused_params.extend(list(lyr.parameters())) + else: + # First and last blocks will have autocast disabled for improved precision. + with autocast(x.device.type, enabled=self.enable_fp16 and i != 0): + x = lyr(x, time_emb) + + x = x.float() + out = self.out(x) + + # Involve probabilistic or possibly unused parameters in loss so we don't get DDP errors. 
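+ # Each skipped parameter contributes p.mean() * 0 below, which keeps it in the autograd graph (so DDP sees a gradient for it) without changing the output.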
+ extraneous_addition = 0 + for p in unused_params: + extraneous_addition = extraneous_addition + p.mean() + out = out + extraneous_addition * 0 + + if return_code_pred: + return out, mel_pred + return out + + +if __name__ == '__main__': + clip = torch.randn(2, 100, 400) + aligned_latent = torch.randn(2,388,512) + aligned_sequence = torch.randint(0,8192,(2,100)) + cond = torch.randn(2, 100, 400) + ts = torch.LongTensor([600, 600]) + model = DiffusionTts(512, layer_drop=.3, unconditioned_percentage=.5) + # Test with latent aligned conditioning + #o = model(clip, ts, aligned_latent, cond) + # Test with sequence aligned conditioning + o = model(clip, ts, aligned_sequence, cond) + diff --git a/models/transformer.py b/models/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..aa59b462a3f9c2680f28ceb1b87480258f0293f0 --- /dev/null +++ b/models/transformer.py @@ -0,0 +1,219 @@ +from functools import partial + +import torch +import torch.nn.functional as F +from einops import rearrange +from rotary_embedding_torch import RotaryEmbedding, broadcat +from torch import nn + + +# helpers + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def cast_tuple(val, depth = 1): + if isinstance(val, list): + val = tuple(val) + return val if isinstance(val, tuple) else (val,) * depth + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def stable_softmax(t, dim = -1, alpha = 32 ** 2): + t = t / alpha + t = t - torch.amax(t, dim = dim, keepdim = True).detach() + return (t * alpha).softmax(dim = dim) + + +def route_args(router, args, depth): + routed_args = [(dict(), dict()) for _ in range(depth)] + matched_keys = [key for key in args.keys() if key in router] + + for key in matched_keys: + val = args[key] + for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])): + new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes) + routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args}) + return routed_args + + +# classes +class SequentialSequence(nn.Module): + def __init__(self, layers, args_route = {}, layer_dropout = 0.): + super().__init__() + assert all(len(route) == len(layers) for route in args_route.values()), 'each argument route map must have the same depth as the number of sequential layers' + self.layers = layers + self.args_route = args_route + self.layer_dropout = layer_dropout + + def forward(self, x, **kwargs): + args = route_args(self.args_route, kwargs, len(self.layers)) + layers_and_args = list(zip(self.layers, args)) + + for (f, g), (f_args, g_args) in layers_and_args: + x = x + f(x, **f_args) + x = x + g(x, **g_args) + return x + + +class DivideMax(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + maxes = x.amax(dim = self.dim, keepdim = True).detach() + return x / maxes + + +# https://arxiv.org/abs/2103.17239 +class LayerScale(nn.Module): + def __init__(self, dim, depth, fn): + super().__init__() + if depth <= 18: + init_eps = 0.1 + elif depth > 18 and depth <= 24: + init_eps = 1e-5 + else: + init_eps = 1e-6 + + scale = torch.zeros(1, 1, dim).fill_(init_eps) + self.scale = nn.Parameter(scale) + self.fn = fn + def forward(self, x, **kwargs): + return self.fn(x, **kwargs) * self.scale + +# layer norm + + +class PreNorm(nn.Module): + def __init__(self, dim, fn, sandwich = False): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.norm_out = nn.LayerNorm(dim) if 
sandwich else nn.Identity() + self.fn = fn + + def forward(self, x, **kwargs): + x = self.norm(x) + x = self.fn(x, **kwargs) + return self.norm_out(x) + +# feed forward + + +class GEGLU(nn.Module): + def forward(self, x): + x, gates = x.chunk(2, dim = -1) + return x * F.gelu(gates) + + +class FeedForward(nn.Module): + def __init__(self, dim, dropout = 0., mult = 4.): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, dim * mult * 2), + GEGLU(), + nn.Dropout(dropout), + nn.Linear(dim * mult, dim) + ) + + def forward(self, x): + return self.net(x) + +# Attention + + +class Attention(nn.Module): + def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0.): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.seq_len = seq_len + self.scale = dim_head ** -0.5 + + self.causal = causal + + self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) + self.to_out = nn.Sequential( + nn.Linear(inner_dim, dim), + nn.Dropout(dropout) + ) + + def forward(self, x, mask = None): + b, n, _, h, device = *x.shape, self.heads, x.device + softmax = torch.softmax + + qkv = self.to_qkv(x).chunk(3, dim = -1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv) + + q = q * self.scale + + dots = torch.einsum('b h i d, b h j d -> b h i j', q, k) + mask_value = max_neg_value(dots) + + if exists(mask): + mask = rearrange(mask, 'b j -> b () () j') + dots.masked_fill_(~mask, mask_value) + del mask + + if self.causal: + i, j = dots.shape[-2:] + mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool() + dots.masked_fill_(mask, mask_value) + + attn = softmax(dots, dim=-1) + + out = torch.einsum('b h i j, b h j d -> b h i d', attn, v) + out = rearrange(out, 'b h n d -> b n (h d)') + out = self.to_out(out) + return out + + +# main transformer class +class Transformer(nn.Module): + def __init__( + self, + *, + dim, + depth, + seq_len, + causal = True, + heads = 8, + dim_head = 64, + ff_mult = 4, + attn_dropout = 0., + ff_dropout = 0., + sparse_attn = False, + sandwich_norm = False, + ): + super().__init__() + layers = nn.ModuleList([]) + sparse_layer = cast_tuple(sparse_attn, depth) + + for ind, sparse_attn in zip(range(depth), sparse_layer): + attn = Attention(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout) + + ff = FeedForward(dim, mult = ff_mult, dropout = ff_dropout) + + layers.append(nn.ModuleList([ + LayerScale(dim, ind + 1, PreNorm(dim, attn, sandwich = sandwich_norm)), + LayerScale(dim, ind + 1, PreNorm(dim, ff, sandwich = sandwich_norm)) + ])) + + execute_type = SequentialSequence + route_attn = ((True, False),) * depth + attn_route_map = {'mask': route_attn} + + self.layers = execute_type(layers, args_route = attn_route_map) + + def forward(self, x, **kwargs): + return self.layers(x, **kwargs) \ No newline at end of file diff --git a/models/vocoder.py b/models/vocoder.py new file mode 100644 index 0000000000000000000000000000000000000000..d38fb56699c035b3d4a86ace67c567d3f1d51fa9 --- /dev/null +++ b/models/vocoder.py @@ -0,0 +1,325 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +MAX_WAV_VALUE = 32768.0 + +class KernelPredictor(torch.nn.Module): + ''' Kernel predictor for the location-variable convolutions''' + + def __init__( + self, + cond_channels, + conv_in_channels, + conv_out_channels, + conv_layers, + conv_kernel_size=3, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + kpnet_nonlinear_activation="LeakyReLU", + 
kpnet_nonlinear_activation_params={"negative_slope": 0.1}, + ): + ''' + Args: + cond_channels (int): number of channel for the conditioning sequence, + conv_in_channels (int): number of channel for the input sequence, + conv_out_channels (int): number of channel for the output sequence, + conv_layers (int): number of layers + ''' + super().__init__() + + self.conv_in_channels = conv_in_channels + self.conv_out_channels = conv_out_channels + self.conv_kernel_size = conv_kernel_size + self.conv_layers = conv_layers + + kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w + kpnet_bias_channels = conv_out_channels * conv_layers # l_b + + self.input_conv = nn.Sequential( + nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)), + getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + + self.residual_convs = nn.ModuleList() + padding = (kpnet_conv_size - 1) // 2 + for _ in range(3): + self.residual_convs.append( + nn.Sequential( + nn.Dropout(kpnet_dropout), + nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, + bias=True)), + getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, + bias=True)), + getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + ) + self.kernel_conv = nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_kernel_channels, kpnet_conv_size, padding=padding, bias=True)) + self.bias_conv = nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_bias_channels, kpnet_conv_size, padding=padding, bias=True)) + + def forward(self, c): + ''' + Args: + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + ''' + batch, _, cond_length = c.shape + c = self.input_conv(c) + for residual_conv in self.residual_convs: + residual_conv.to(c.device) + c = c + residual_conv(c) + k = self.kernel_conv(c) + b = self.bias_conv(c) + kernels = k.contiguous().view( + batch, + self.conv_layers, + self.conv_in_channels, + self.conv_out_channels, + self.conv_kernel_size, + cond_length, + ) + bias = b.contiguous().view( + batch, + self.conv_layers, + self.conv_out_channels, + cond_length, + ) + + return kernels, bias + + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.input_conv[0]) + nn.utils.remove_weight_norm(self.kernel_conv) + nn.utils.remove_weight_norm(self.bias_conv) + for block in self.residual_convs: + nn.utils.remove_weight_norm(block[1]) + nn.utils.remove_weight_norm(block[3]) + + +class LVCBlock(torch.nn.Module): + '''the location-variable convolutions''' + + def __init__( + self, + in_channels, + cond_channels, + stride, + dilations=[1, 3, 9, 27], + lReLU_slope=0.2, + conv_kernel_size=3, + cond_hop_length=256, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + ): + super().__init__() + + self.cond_hop_length = cond_hop_length + self.conv_layers = len(dilations) + self.conv_kernel_size = conv_kernel_size + + self.kernel_predictor = KernelPredictor( + cond_channels=cond_channels, + conv_in_channels=in_channels, + conv_out_channels=2 * in_channels, + conv_layers=len(dilations), + conv_kernel_size=conv_kernel_size, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=kpnet_dropout, + kpnet_nonlinear_activation_params={"negative_slope": 
lReLU_slope} + ) + + self.convt_pre = nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.ConvTranspose1d(in_channels, in_channels, 2 * stride, stride=stride, + padding=stride // 2 + stride % 2, output_padding=stride % 2)), + ) + + self.conv_blocks = nn.ModuleList() + for dilation in dilations: + self.conv_blocks.append( + nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.Conv1d(in_channels, in_channels, conv_kernel_size, + padding=dilation * (conv_kernel_size - 1) // 2, dilation=dilation)), + nn.LeakyReLU(lReLU_slope), + ) + ) + + def forward(self, x, c): + ''' forward propagation of the location-variable convolutions. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length) + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + + Returns: + Tensor: the output sequence (batch, in_channels, in_length) + ''' + _, in_channels, _ = x.shape # (B, c_g, L') + + x = self.convt_pre(x) # (B, c_g, stride * L') + kernels, bias = self.kernel_predictor(c) + + for i, conv in enumerate(self.conv_blocks): + output = conv(x) # (B, c_g, stride * L') + + k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) + b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) + + output = self.location_variable_convolution(output, k, b, + hop_size=self.cond_hop_length) # (B, 2 * c_g, stride * L'): LVC + x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( + output[:, in_channels:, :]) # (B, c_g, stride * L'): GAU + + return x + + def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): + ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. + Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length). + kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) + bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) + dilation (int): the dilation of convolution. + hop_size (int): the hop_size of the conditioning sequence. + Returns: + (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
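+ The padded input is unfolded into one window per conditioning frame (stride hop_size), each window is convolved with the kernel predicted for that frame via einsum, and the per-frame bias is added before reshaping back to (batch, out_channels, in_length).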
+ ''' + batch, _, in_length = x.shape + batch, _, out_channels, kernel_size, kernel_length = kernel.shape + assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" + + padding = dilation * int((kernel_size - 1) / 2) + x = F.pad(x, (padding, padding), 'constant', 0) # (batch, in_channels, in_length + 2*padding) + x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) + + if hop_size < dilation: + x = F.pad(x, (0, dilation), 'constant', 0) + x = x.unfold(3, dilation, + dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) + x = x[:, :, :, :, :hop_size] + x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) + x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) + + o = torch.einsum('bildsk,biokl->bolsd', x, kernel) + o = o.to(memory_format=torch.channels_last_3d) + bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) + o = o + bias + o = o.contiguous().view(batch, out_channels, -1) + + return o + + def remove_weight_norm(self): + self.kernel_predictor.remove_weight_norm() + nn.utils.remove_weight_norm(self.convt_pre[1]) + for block in self.conv_blocks: + nn.utils.remove_weight_norm(block[1]) + + +class UnivNetGenerator(nn.Module): + """UnivNet Generator""" + + def __init__(self, noise_dim=64, channel_size=32, dilations=[1,3,9,27], strides=[8,8,4], lReLU_slope=.2, kpnet_conv_size=3, + # Below are MEL configurations options that this generator requires. + hop_length=256, n_mel_channels=100): + super(UnivNetGenerator, self).__init__() + self.mel_channel = n_mel_channels + self.noise_dim = noise_dim + self.hop_length = hop_length + channel_size = channel_size + kpnet_conv_size = kpnet_conv_size + + self.res_stack = nn.ModuleList() + hop_length = 1 + for stride in strides: + hop_length = stride * hop_length + self.res_stack.append( + LVCBlock( + channel_size, + n_mel_channels, + stride=stride, + dilations=dilations, + lReLU_slope=lReLU_slope, + cond_hop_length=hop_length, + kpnet_conv_size=kpnet_conv_size + ) + ) + + self.conv_pre = \ + nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode='reflect')) + + self.conv_post = nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode='reflect')), + nn.Tanh(), + ) + + def forward(self, c, z): + ''' + Args: + c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) + z (Tensor): the noise sequence (batch, noise_dim, in_length) + + ''' + z = self.conv_pre(z) # (B, c_g, L) + + for res_block in self.res_stack: + res_block.to(z.device) + z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) + + z = self.conv_post(z) # (B, 1, L * 256) + + return z + + def eval(self, inference=False): + super(UnivNetGenerator, self).eval() + # don't remove weight norm while validation in training loop + if inference: + self.remove_weight_norm() + + def remove_weight_norm(self): + print('Removing weight norm...') + + nn.utils.remove_weight_norm(self.conv_pre) + + for layer in self.conv_post: + if len(layer.state_dict()) != 0: + nn.utils.remove_weight_norm(layer) + + for res_block in self.res_stack: + res_block.remove_weight_norm() + + def inference(self, c, z=None): + # pad input mel with zeros to cut artifact + # see https://github.com/seungwonpark/melgan/issues/8 + zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) + mel = torch.cat((c, zero), dim=2) + + if z is None: + z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) + + audio = self.forward(mel, z) + audio = audio[:, :, :-(self.hop_length * 10)] + audio = audio.clamp(min=-1, max=1) + return audio + + +if __name__ == '__main__': + model = UnivNetGenerator() + + c = torch.randn(3, 100, 10) + z = torch.randn(3, 64, 10) + print(c.shape) + + y = model(c, z) + print(y.shape) + assert y.shape == torch.Size([3, 1, 2560]) + + pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(pytorch_total_params) diff --git a/models/xtransformers.py b/models/xtransformers.py new file mode 100644 index 0000000000000000000000000000000000000000..70e8e63d3c7069306536331e0ae1421ed6ab89cd --- /dev/null +++ b/models/xtransformers.py @@ -0,0 +1,1253 @@ +import functools +import math +import torch +from torch import nn, einsum +import torch.nn.functional as F +from functools import partial +from inspect import isfunction +from collections import namedtuple + +from einops import rearrange, repeat, reduce +from einops.layers.torch import Rearrange + +from entmax import entmax15 +from torch.utils.checkpoint import checkpoint + +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple('Intermediates', [ + 'pre_softmax_attn', + 'post_softmax_attn' +]) + +LayerIntermediates = namedtuple('Intermediates', [ + 'hiddens', + 'attn_intermediates', + 'past_key_values', +]) + + +# helpers + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def cast_tuple(val, depth): + return val if isinstance(val, tuple) else (val,) * depth + + +class always(): + def __init__(self, val): + self.val = val + + def __call__(self, *args, **kwargs): + return self.val + + +class not_equals(): + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x != self.val + + +class equals(): + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x == self.val + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +def l2norm(t): + return F.normalize(t, p=2, dim=-1) + + +# init helpers + +def init_zero_(layer): + nn.init.constant_(layer.weight, 0.) + if exists(layer.bias): + nn.init.constant_(layer.bias, 0.) 
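+ # Used by the zero_init_output options on the attention and feedforward blocks below, so their output projections start at zero and the residual branches begin as no-ops.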
+ + +# keyword argument helpers + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + + +def group_dict_by_key(cond, d): + return_val = [dict(), dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val,) + + +def string_begins_with(prefix, str): + return str.startswith(prefix) + + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) + kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items()))) + return kwargs_without_prefix, kwargs + + +# activations + +class ReluSquared(nn.Module): + def forward(self, x): + return F.relu(x) ** 2 + + +# positional embeddings + +class AbsolutePositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + self.scale = dim ** -0.5 + self.emb = nn.Embedding(max_seq_len, dim) + + def forward(self, x): + n = torch.arange(x.shape[1], device=x.device) + pos_emb = self.emb(n) + pos_emb = rearrange(pos_emb, 'n d -> () n d') + return pos_emb * self.scale + + +class FixedPositionalEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, x, seq_dim=1, offset=0): + t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset + sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) + return rearrange(emb, 'n d -> () n d') + + +class RelativePositionBias(nn.Module): + def __init__(self, scale, causal=False, num_buckets=32, max_distance=128, heads=8): + super().__init__() + self.scale = scale + self.causal = causal + self.num_buckets = num_buckets + self.max_distance = max_distance + self.relative_attention_bias = nn.Embedding(num_buckets, heads) + + @staticmethod + def _relative_position_bucket(relative_position, causal=True, num_buckets=32, max_distance=128): + ret = 0 + n = -relative_position + if not causal: + num_buckets //= 2 + ret += (n < 0).long() * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, torch.zeros_like(n)) + + max_exact = num_buckets // 2 + is_small = n < max_exact + + val_if_large = max_exact + ( + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).long() + val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) + + ret += torch.where(is_small, n, val_if_large) + return ret + + def forward(self, qk_dots): + i, j, device = *qk_dots.shape[-2:], qk_dots.device + q_pos = torch.arange(i, dtype=torch.long, device=device) + k_pos = torch.arange(j, dtype=torch.long, device=device) + rel_pos = k_pos[None, :] - q_pos[:, None] + rp_bucket = self._relative_position_bucket(rel_pos, causal=self.causal, num_buckets=self.num_buckets, + max_distance=self.max_distance) + values = self.relative_attention_bias(rp_bucket) + bias = rearrange(values, 'i j h -> () h i j') + return qk_dots + (bias * self.scale) + + +class AlibiPositionalBias(nn.Module): + def __init__(self, heads, **kwargs): + super().__init__() + self.heads = heads + slopes = torch.Tensor(self._get_slopes(heads)) + slopes = rearrange(slopes, 'h -> () h () ()') + 
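# Per-head slopes form a fixed geometric series; both buffers are non-persistent, so the cached bias is rebuilt lazily rather than checkpointed. +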
self.register_buffer('slopes', slopes, persistent=False) + self.register_buffer('bias', None, persistent=False) + + @staticmethod + def _get_slopes(heads): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(heads).is_integer(): + return get_slopes_power_of_2(heads) + + closest_power_of_2 = 2 ** math.floor(math.log2(heads)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][ + :heads - closest_power_of_2] + + def forward(self, qk_dots): + h, i, j, device = *qk_dots.shape[-3:], qk_dots.device + + if exists(self.bias) and self.bias.shape[-1] >= j: + return qk_dots + self.bias[..., :j] + + bias = torch.arange(j, device=device) + bias = rearrange(bias, 'j -> () () () j') + bias = bias * self.slopes + + num_heads_unalibied = h - bias.shape[1] + bias = F.pad(bias, (0, 0, 0, 0, 0, num_heads_unalibied)) + + self.register_buffer('bias', bias, persistent=False) + return qk_dots + self.bias + + +class LearnedAlibiPositionalBias(AlibiPositionalBias): + def __init__(self, heads, bidirectional=False): + super().__init__(heads) + los_slopes = torch.log(self.slopes) + self.learned_logslopes = nn.Parameter(los_slopes) + + self.bidirectional = bidirectional + if self.bidirectional: + self.learned_logslopes_future = nn.Parameter(los_slopes) + + def forward(self, qk_dots): + h, i, j, device = *qk_dots.shape[-3:], qk_dots.device + + def get_slopes(param): + return F.pad(param.exp(), (0, 0, 0, 0, 0, h - param.shape[1])) + + if exists(self.bias) and self.bias.shape[-1] >= j: + bias = self.bias[..., :i, :j] + else: + i_arange = torch.arange(i, device=device) + j_arange = torch.arange(j, device=device) + bias = rearrange(j_arange, 'j -> 1 1 1 j') - rearrange(i_arange, 'i -> 1 1 i 1') + self.register_buffer('bias', bias, persistent=False) + + if self.bidirectional: + past_slopes = get_slopes(self.learned_logslopes) + future_slopes = get_slopes(self.learned_logslopes_future) + bias = torch.tril(bias * past_slopes) + torch.triu(bias * future_slopes) + else: + slopes = get_slopes(self.learned_logslopes) + bias = bias * slopes + + return qk_dots + bias + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, max_seq_len, device): + t = torch.arange(max_seq_len, device=device).type_as(self.inv_freq) + freqs = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + return rearrange(emb, 'n d -> () () n d') + + +def rotate_half(x): + x = rearrange(x, '... (j d) -> ... 
j d', j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t, freqs): + seq_len = t.shape[-2] + freqs = freqs[:, :, -seq_len:] + return (t * freqs.cos()) + (rotate_half(t) * freqs.sin()) + + +# norms + +class Scale(nn.Module): + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + scale_fn = lambda t: t * self.value + + if not isinstance(out, tuple): + return scale_fn(out) + + return (scale_fn(out[0]), *out[1:]) + + +class Rezero(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + self.g = nn.Parameter(torch.zeros(1)) + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + rezero_fn = lambda t: t * self.g + + if not isinstance(out, tuple): + return rezero_fn(out) + + return (rezero_fn(out[0]), *out[1:]) + + +class ScaleNorm(nn.Module): + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSNorm(nn.Module): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSScaleShiftNorm(nn.Module): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + self.scale_shift_process = nn.Linear(dim * 2, dim * 2) + + def forward(self, x, norm_scale_shift_inp): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + norm = x / norm.clamp(min=self.eps) * self.g + + ss_emb = self.scale_shift_process(norm_scale_shift_inp) + scale, shift = torch.chunk(ss_emb, 2, dim=1) + h = norm * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + return h + + +# residual and residual gates + +class Residual(nn.Module): + def __init__(self, dim, scale_residual=False): + super().__init__() + self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None + + def forward(self, x, residual): + if exists(self.residual_scale): + residual = residual * self.residual_scale + + return x + residual + + +class GRUGating(nn.Module): + def __init__(self, dim, scale_residual=False): + super().__init__() + self.gru = nn.GRUCell(dim, dim) + self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None + + def forward(self, x, residual): + if exists(self.residual_scale): + residual = residual * self.residual_scale + + gated_output = self.gru( + rearrange(x, 'b n d -> (b n) d'), + rearrange(residual, 'b n d -> (b n) d') + ) + + return gated_output.reshape_as(x) + + +# token shifting + +def shift(t, amount, mask=None): + if amount == 0: + return t + + if exists(mask): + t = t.masked_fill(~mask[..., None], 0.) + + return F.pad(t, (0, 0, amount, -amount), value=0.) 
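+ # shift() pads the sequence dimension at the front and trims the back, so a positive amount lets each position see features from earlier timesteps; ShiftTokens below applies a different shift to each feature segment before calling the wrapped layer.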
+ + +class ShiftTokens(nn.Module): + def __init__(self, shifts, fn): + super().__init__() + self.fn = fn + self.shifts = tuple(shifts) + + def forward(self, x, **kwargs): + mask = kwargs.get('mask', None) + shifts = self.shifts + segments = len(shifts) + feats_per_shift = x.shape[-1] // segments + splitted = x.split(feats_per_shift, dim=-1) + segments_to_shift, rest = splitted[:segments], splitted[segments:] + segments_to_shift = list(map(lambda args: shift(*args, mask=mask), zip(segments_to_shift, shifts))) + x = torch.cat((*segments_to_shift, *rest), dim=-1) + return self.fn(x, **kwargs) + + +# feedforward + +class GLU(nn.Module): + def __init__(self, dim_in, dim_out, activation): + super().__init__() + self.act = activation + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * self.act(gate) + + +class FeedForward(nn.Module): + def __init__( + self, + dim, + dim_out=None, + mult=4, + glu=False, + relu_squared=False, + post_act_ln=False, + dropout=0., + zero_init_output=False + ): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + activation = ReluSquared() if relu_squared else nn.GELU() + + project_in = nn.Sequential( + nn.Linear(dim, inner_dim), + activation + ) if not glu else GLU(dim, inner_dim, activation) + + self.net = nn.Sequential( + project_in, + nn.LayerNorm(inner_dim) if post_act_ln else nn.Identity(), + nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out) + ) + + # init last linear layer to 0 + if zero_init_output: + init_zero_(self.net[-1]) + + def forward(self, x): + return self.net(x) + + +# attention. + +class Attention(nn.Module): + def __init__( + self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + talking_heads=False, + head_scale=False, + collab_heads=False, + collab_compression=.3, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0., + on_attn=False, + gate_values=False, + zero_init_output=False, + max_attend_past=None, + qk_norm=False, + scale_init_value=None, + rel_pos_bias=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + ): + super().__init__() + self.scale = dim_head ** -0.5 + + self.heads = heads + self.causal = causal + self.max_attend_past = max_attend_past + + qk_dim = v_dim = dim_head * heads + + # collaborative heads + self.collab_heads = collab_heads + if self.collab_heads: + qk_dim = int(collab_compression * qk_dim) + self.collab_mixing = nn.Parameter(torch.randn(heads, qk_dim)) + + self.to_q = nn.Linear(dim, qk_dim, bias=False) + self.to_k = nn.Linear(dim, qk_dim, bias=False) + self.to_v = nn.Linear(dim, v_dim, bias=False) + + self.dropout = nn.Dropout(dropout) + + # add GLU gating for aggregated values, from alphafold2 + self.to_v_gate = None + if gate_values: + self.to_v_gate = nn.Linear(dim, v_dim) + nn.init.constant_(self.to_v_gate.weight, 0) + nn.init.constant_(self.to_v_gate.bias, 1) + + # cosine sim attention + self.qk_norm = qk_norm + if qk_norm: + scale_init_value = default(scale_init_value, + -3) # if not provided, initialize as though it were sequence length of 1024 + self.scale = nn.Parameter(torch.ones(1, heads, 1, 1) * scale_init_value) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + + # head scaling + self.head_scale = head_scale + if head_scale: + self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1)) + + # 
explicit topk sparse attention + self.sparse_topk = sparse_topk + + # entmax + self.attn_fn = entmax15 if use_entmax15 else F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear(v_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(v_dim, dim) + + self.rel_pos_bias = rel_pos_bias + if rel_pos_bias: + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance' + self.rel_pos = RelativePositionBias(scale=dim_head ** 0.5, causal=causal, heads=heads, + num_buckets=rel_pos_num_buckets, max_distance=rel_pos_max_distance) + + # init output projection 0 + if zero_init_output: + init_zero_(self.to_out) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + attn_mask=None, + sinusoidal_emb=None, + rotary_pos_emb=None, + prev_attn=None, + mem=None, + layer_past=None, + ): + b, n, _, h, talking_heads, collab_heads, head_scale, scale, device, has_context = *x.shape, self.heads, self.talking_heads, self.collab_heads, self.head_scale, self.scale, x.device, exists( + context) + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = torch.cat((mem, k_input), dim=-2) + v_input = torch.cat((mem, v_input), dim=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + if not collab_heads: + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) + else: + q = einsum('b i d, h d -> b h i d', q, self.collab_mixing) + k = rearrange(k, 'b n d -> b () n d') + v = rearrange(v, 'b n (h d) -> b h n d', h=h) + + if layer_past is not None: + past_key, past_value = layer_past + k = torch.cat([past_key, k], dim=-2) + v = torch.cat([past_value, v], dim=-2) + k_cache = k + v_cache = v + + if exists(rotary_pos_emb) and not has_context: + l = rotary_pos_emb.shape[-1] + (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v)) + ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl)) + q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, qr), (kl, kr), (vl, vr))) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool()) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool()) + q_mask = rearrange(q_mask, 'b i -> b () i ()') + k_mask = rearrange(k_mask, 'b j -> b () () j') + input_mask = q_mask * k_mask + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v)) + k = torch.cat((mem_k, k), dim=-2) + v = torch.cat((mem_v, v), dim=-2) + if exists(input_mask): + input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True) + + if collab_heads: + k = k.expand(-1, h, -1, -1) + + if self.qk_norm: + q, k = map(l2norm, (q, k)) + scale = 1 / (self.scale.exp().clamp(min=1e-2)) + + dots 
= einsum('b h i d, b h j d -> b h i j', q, k) * scale + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots.clone() + + if talking_heads: + dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous() + + if self.rel_pos_bias: + dots = self.rel_pos(dots) + + if exists(input_mask): + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if exists(attn_mask): + assert 2 <= attn_mask.ndim <= 4, 'attention mask must have greater than 2 dimensions but less than or equal to 4' + if attn_mask.ndim == 2: + attn_mask = rearrange(attn_mask, 'i j -> () () i j') + elif attn_mask.ndim == 3: + attn_mask = rearrange(attn_mask, 'h i j -> () h i j') + dots.masked_fill_(~attn_mask, mask_value) + + if exists(self.max_attend_past): + i, j = dots.shape[-2:] + range_q = torch.arange(j - i, j, device=device) + range_k = torch.arange(j, device=device) + dist = rearrange(range_q, 'i -> () () i ()') - rearrange(range_k, 'j -> () () () j') + mask = dist > self.max_attend_past + dots.masked_fill_(mask, mask_value) + del mask + + if self.causal: + i, j = dots.shape[-2:] + r = torch.arange(i, device=device) + mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j') + mask = F.pad(mask, (j - i, 0), value=False) + dots.masked_fill_(mask, mask_value) + del mask + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, dim=-1) + post_softmax_attn = attn.clone() + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous() + + out = einsum('b h i j, b h j d -> b h i d', attn, v) + + if head_scale: + out = out * self.head_scale_params + + out = rearrange(out, 'b h n d -> b n (h d)') + + if exists(self.to_v_gate): + gates = self.to_v_gate(x) + out = out * gates.sigmoid() + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, + post_softmax_attn=post_softmax_attn + ) + + return self.to_out(out), intermediates, k_cache, v_cache + + +class AttentionLayers(nn.Module): + def __init__( + self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rms_scaleshift_norm=False, + use_rmsnorm=False, + use_rezero=False, + alibi_pos_bias=False, + alibi_num_heads=None, + alibi_learned=False, + position_infused_attn=False, + rotary_pos_emb=False, + rotary_emb_dim=None, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + scale_residual=False, + shift_tokens=0, + sandwich_norm=False, + use_qk_norm_attn=False, + qk_norm_attn_seq_len=None, + zero_init_branch_output=False, + **kwargs + ): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) + + dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + self.causal = causal + + rel_pos_bias = 'rel_pos_bias' in attn_kwargs + self.has_pos_emb = position_infused_attn or rel_pos_bias or rotary_pos_emb + self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None + + rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) + 
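The forward pass above wires together several optional behaviours (memory key/values, talking heads, relative position bias, explicit top-k sparsification). As a hedged, standalone sketch of just the sparse top-k step under plain softmax attention, with illustrative shapes and a helper name that is not part of this diff:

import torch

def sparse_topk_attention(q, k, v, sparse_topk=8):
    # q, k, v: (batch, heads, seq_len, dim_head)
    scale = q.shape[-1] ** -0.5
    dots = torch.einsum('b h i d, b h j d -> b h i j', q, k) * scale
    if sparse_topk < dots.shape[-1]:
        # keep only the k largest logits per query; mask the rest before softmax
        kth_largest = dots.topk(sparse_topk, dim=-1).values[..., -1:]
        dots = dots.masked_fill(dots < kth_largest, torch.finfo(dots.dtype).min)
    attn = dots.softmax(dim=-1)
    return torch.einsum('b h i j, b h j d -> b h i d', attn, v)

q = k = v = torch.randn(1, 8, 16, 64)
out = sparse_topk_attention(q, k, v, sparse_topk=4)   # -> (1, 8, 16, 64)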
self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None + + assert not ( + alibi_pos_bias and rel_pos_bias), 'you can only choose Alibi positional bias or T5 relative positional bias, not both' + + if alibi_pos_bias: + alibi_num_heads = default(alibi_num_heads, heads) + assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads' + alibi_pos_klass = LearnedAlibiPositionalBias if alibi_learned or not causal else AlibiPositionalBias + self.rel_pos = alibi_pos_klass(heads=alibi_num_heads, bidirectional=not causal) + else: + self.rel_pos = None + + assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm' + self.pre_norm = pre_norm + self.sandwich_norm = sandwich_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + self.cross_attend = cross_attend + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_class = RMSScaleShiftNorm if use_rms_scaleshift_norm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f',) + default_block + + # qk normalization + + if use_qk_norm_attn: + attn_scale_init_value = -math.log(math.log2(qk_norm_attn_seq_len ** 2 - qk_norm_attn_seq_len)) if exists( + qk_norm_attn_seq_len) else None + attn_kwargs = {**attn_kwargs, 'qk_norm': True, 'scale_init_value': attn_scale_init_value} + + # zero init + + if zero_init_branch_output: + attn_kwargs = {**attn_kwargs, 'zero_init_output': True} + ff_kwargs = {**ff_kwargs, 'zero_init_output': True} + + # calculate layer block order + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len(default_block) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f',) * (par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f',) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals('a'), layer_types))) + + # calculate token shifting + + shift_tokens = cast_tuple(shift_tokens, len(layer_types)) + + # iterate and construct layers + + for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)): + is_last_layer = ind == (len(self.layer_types) - 1) + + if layer_type == 'a': + layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads=heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = 
layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if layer_shift_tokens > 0: + shift_range_upper = layer_shift_tokens + 1 + shift_range_lower = -layer_shift_tokens if not causal else 0 + layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer) + + if exists(branch_fn): + layer = branch_fn(layer) + + residual_fn = GRUGating if gate_residual else Residual + residual = residual_fn(dim, scale_residual=scale_residual) + + layer_uses_qk_norm = use_qk_norm_attn and layer_type in ('a', 'c') + + pre_branch_norm = norm_fn() if pre_norm and not layer_uses_qk_norm else None + post_branch_norm = norm_fn() if sandwich_norm or layer_uses_qk_norm else None + post_main_norm = norm_fn() if not pre_norm and not is_last_layer else None + + norms = nn.ModuleList([ + pre_branch_norm, + post_branch_norm, + post_main_norm + ]) + + self.layers.append(nn.ModuleList([ + norms, + layer, + residual + ])) + + def forward( + self, + x, + context=None, + full_context=None, # for passing a list of hidden states from an encoder + mask=None, + context_mask=None, + attn_mask=None, + mems=None, + return_hiddens=False, + norm_scale_shift_inp=None, + past_key_values=None, + expected_seq_len=None, + ): + + assert not (self.cross_attend ^ (exists(context) or exists( + full_context))), 'context must be passed in if cross_attend is set to True' + assert context is None or full_context is None, 'only one of full_context or context can be provided' + + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + norm_args = {} + if exists(norm_scale_shift_inp): + norm_args['norm_scale_shift_inp'] = norm_scale_shift_inp + + rotary_pos_emb = None + if exists(self.rotary_pos_emb): + if not self.training and self.causal: + assert expected_seq_len is not None, "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + elif expected_seq_len is None: + expected_seq_len = 0 + seq_len = x.shape[1] + if past_key_values is not None: + seq_len += past_key_values[0][0].shape[-2] + max_rotary_emb_length = max(list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len]) + rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device) + + present_key_values = [] + cross_attn_count = 0 + for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)): + if layer_type == 'a': + layer_mem = mems.pop(0) if mems else None + + residual = x + + pre_branch_norm, post_branch_norm, post_main_norm = norm + + if exists(pre_branch_norm): + x = pre_branch_norm(x, **norm_args) + + if layer_type == 'a' or layer_type == 'c': + if past_key_values is not None: + layer_kv = past_key_values.pop(0) + layer_past = tuple(s.to(x.device) for s in layer_kv) + else: + layer_past = None + + if layer_type == 'a': + out, inter, k, v = checkpoint(block, x, None, mask, None, attn_mask, self.pia_pos_emb, rotary_pos_emb, + prev_attn, layer_mem, layer_past) + elif layer_type == 'c': + if exists(full_context): + out, inter, k, v = checkpoint(block, x, full_context[cross_attn_count], mask, context_mask, None, None, + None, prev_attn, None, layer_past) + else: + out, inter, k, v = checkpoint(block, x, context, mask, context_mask, None, None, None, prev_attn, None, layer_past) + elif layer_type == 'f': + out = checkpoint(block, x) + + if layer_type == 'a' or layer_type == 'c' and present_key_values is not None: + 
present_key_values.append((k.detach(), v.detach())) + + if exists(post_branch_norm): + out = post_branch_norm(out, **norm_args) + + x = residual_fn(out, residual) + + if layer_type in ('a', 'c'): + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if exists(post_main_norm): + x = post_main_norm(x, **norm_args) + + if layer_type == 'c': + cross_attn_count += 1 + + if layer_type == 'f': + hiddens.append(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, + attn_intermediates=intermediates, + past_key_values=present_key_values + ) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on encoder' + super().__init__(causal=False, **kwargs) + + +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on decoder' + super().__init__(causal=True, **kwargs) + + +class CrossAttender(AttentionLayers): + def __init__(self, **kwargs): + super().__init__(cross_attend=True, only_cross=True, **kwargs) + + +class ViTransformerWrapper(nn.Module): + def __init__( + self, + *, + image_size, + patch_size, + attn_layers, + num_classes=None, + dropout=0., + emb_dropout=0. + ): + super().__init__() + assert isinstance(attn_layers, Encoder), 'attention layers must be an Encoder' + assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size' + dim = attn_layers.dim + num_patches = (image_size // patch_size) ** 2 + patch_dim = 3 * patch_size ** 2 + + self.patch_size = patch_size + + self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim)) + self.patch_to_embedding = nn.Linear(patch_dim, dim) + self.cls_token = nn.Parameter(torch.randn(1, 1, dim)) + self.dropout = nn.Dropout(emb_dropout) + + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + self.mlp_head = FeedForward(dim, dim_out=num_classes, dropout=dropout) if exists(num_classes) else None + + def forward( + self, + img, + return_embeddings=False + ): + p = self.patch_size + + x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p) + x = self.patch_to_embedding(x) + b, n, _ = x.shape + + cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embedding[:, :(n + 1)] + x = self.dropout(x) + + x = self.attn_layers(x) + x = self.norm(x) + + if not exists(self.mlp_head) or return_embeddings: + return x + + return self.mlp_head(x[:, 0]) + + +class TransformerWrapper(nn.Module): + def __init__( + self, + *, + num_tokens, + max_seq_len, + attn_layers, + emb_dim=None, + max_mem_len=0., + shift_mem_down=0, + emb_dropout=0., + num_memory_tokens=None, + tie_embedding=False, + use_pos_emb=True + ): + super().__init__() + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + self.shift_mem_down = shift_mem_down + + self.token_emb = nn.Embedding(num_tokens, emb_dim) + self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != 
dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.init_() + + self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim)) + + def init_(self): + nn.init.kaiming_normal_(self.token_emb.weight) + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_hiddens=False, + return_attn=False, + mems=None, + use_cache=False, + **kwargs + ): + b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens + x = self.token_emb(x) + x = x + self.pos_emb(x) + x = self.emb_dropout(x) + + x = self.project_emb(x) + + if num_mem > 0: + mem = repeat(self.memory_tokens, 'n d -> b n d', b=b) + x = torch.cat((mem, x), dim=1) + + # auto-handle masking after appending memory tokens + if exists(mask): + mask = F.pad(mask, (num_mem, 0), value=True) + + if self.shift_mem_down and exists(mems): + mems_l, mems_r = mems[:self.shift_mem_down], mems[self.shift_mem_down:] + mems = [*mems_r, *mems_l] + + x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + mem, x = x[:, :num_mem], x[:, num_mem:] + + out = self.to_logits(x) if not return_embeddings else x + + if return_hiddens: + hiddens = intermediates.hiddens + return out, hiddens + + res = [out] + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + res.append(attn_maps) + if use_cache: + res.append(intermediates.past_key_values) + + if len(res) > 1: + return tuple(res) + return res[0] + + +class ContinuousTransformerWrapper(nn.Module): + def __init__( + self, + *, + max_seq_len, + attn_layers, + dim_in=None, + dim_out=None, + emb_dim=None, + emb_dropout=0., + use_pos_emb=True + ): + super().__init__() + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + + self.max_seq_len = max_seq_len + + self.pos_emb = AbsolutePositionalEmbedding(dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_in = nn.Linear(dim_in, dim) if exists(dim_in) else nn.Identity() + + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.project_out = nn.Linear(dim, dim_out) if exists(dim_out) else nn.Identity() + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_attn=False, + mems=None, + use_cache=False, + **kwargs + ): + b, n, _, device = *x.shape, x.device + + x = self.project_in(x) + x = x + self.pos_emb(x) + x = self.emb_dropout(x) + + x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + out = self.project_out(x) if not return_embeddings else x + + res = [out] + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + res.append(attn_maps) + if use_cache: + res.append(intermediates.past_key_values) + + if len(res) > 1: + return tuple(res) + return res[0] + diff --git a/read.py b/read.py new file mode 100644 index 0000000000000000000000000000000000000000..9e4e04c0852e9f2ac302b99943233abecd8ff30e --- /dev/null +++ b/read.py @@ -0,0 +1,87 @@ +import argparse +import os + +import torch 
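To make the wrapper classes above concrete, here is a hedged usage sketch of a small decoder-only model. The dimensions are illustrative, and the import path assumes this file is importable as tortoise.models.xtransformers, which is the import used by arch_util.py later in this diff:

import torch
from tortoise.models.xtransformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens=256,                                   # vocabulary size (illustrative)
    max_seq_len=1024,
    attn_layers=Decoder(dim=512, depth=6, heads=8),   # causal AttentionLayers stack
)
tokens = torch.randint(0, 256, (1, 128))
logits = model(tokens)                                # (1, 128, 256)
logits, cached_kv = model(tokens, use_cache=True)     # also returns per-layer key/value caches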
+import torch.nn.functional as F +import torchaudio + +from api import TextToSpeech, format_conditioning +from utils.audio import load_audio, get_voices +from utils.tokenizer import VoiceBpeTokenizer + + +def split_and_recombine_text(texts, desired_length=200, max_len=300): + # TODO: also split across '!' and '?'. Attempt to keep quotations together. + texts = [s.strip() + "." for s in texts.split('.')] + + i = 0 + while i < len(texts): + ltxt = texts[i] + if len(ltxt) >= desired_length or i == len(texts)-1: + i += 1 + continue + if len(ltxt) + len(texts[i+1]) > max_len: + i += 1 + continue + texts[i] = f'{ltxt} {texts[i+1]}' + texts.pop(i+1) + return texts + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt") + parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) ' + 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat') + parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/') + parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard') + parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None) + parser.add_argument('--voice_diversity_intelligibility_slider', type=float, + help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility', + default=.5) + args = parser.parse_args() + + outpath = args.output_path + voices = get_voices() + selected_voices = args.voice.split(',') + regenerate = args.regenerate + if regenerate is not None: + regenerate = [int(e) for e in regenerate.split(',')] + for selected_voice in selected_voices: + voice_outpath = os.path.join(outpath, selected_voice) + os.makedirs(voice_outpath, exist_ok=True) + + with open(args.textfile, 'r', encoding='utf-8') as f: + text = ''.join([l for l in f.readlines()]) + texts = split_and_recombine_text(text) + tts = TextToSpeech() + + if '&' in selected_voice: + voice_sel = selected_voice.split('&') + else: + voice_sel = [selected_voice] + cond_paths = [] + for vsel in voice_sel: + if vsel not in voices.keys(): + print(f'Error: voice {vsel} not available. Skipping.') + continue + cond_paths.extend(voices[vsel]) + if not cond_paths: + print('Error: no valid voices specified. 
Try again.') + + conds = [] + for cond_path in cond_paths: + c = load_audio(cond_path, 22050) + conds.append(c) + all_parts = [] + for j, text in enumerate(texts): + if regenerate is not None and j not in regenerate: + all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000)) + continue + gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider) + gen = gen.squeeze(0).cpu() + torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000) + all_parts.append(gen) + full_audio = torch.cat(all_parts, dim=-1) + torchaudio.save(os.path.join(voice_outpath, 'combined.wav'), full_audio, 24000) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0d398e915c16b654d9efa055f46a7e983ad1a29 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +tqdm +rotary_embedding_torch +transformers +tokenizers +inflect +progressbar +einops +unidecode +scipy +librosa \ No newline at end of file diff --git a/samples_generator.py b/samples_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b36d3906b9ebc7232633ce49fb7771523d9f7a --- /dev/null +++ b/samples_generator.py @@ -0,0 +1,51 @@ +import os + +# This script builds the sample webpage. + +if __name__ == '__main__': + result = "These words were never spoken.

<h1>Handpicked results</h1>" + for fv in os.listdir('results/favorites'): + url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/favorites/{fv}' + result = result + f'<audio controls><source src="{url}" type="audio/mp3"></audio><br>\n' +
+ result = result + "<h1>Handpicked longform result:</h1>" + url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/favorite_riding_hood.mp3' + result = result + f'<audio controls><source src="{url}" type="audio/mp3"></audio><br>\n' +
+ result = result + "<h1>Compared to Tacotron2 (with the LJSpeech voice):</h1><table><th>Tacotron2+Waveglow</th><th>TorToiSe</th>" + for k in range(2,5,1): + url1 = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/tacotron_comparison/{k}-tacotron2.mp3' + url2 = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/tacotron_comparison/{k}-tortoise.mp3' + result = result + f'<tr><td><audio controls><source src="{url1}" type="audio/mp3"></audio><br>\n</td>' \ + f'<td><audio controls><source src="{url2}" type="audio/mp3"></audio><br>\n</td></tr>' + result = result + "</table>" +
+ result = result + "<h1>Various spoken texts for all voices:</h1>" + voices = ['angie', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'myself', + 'pat', 'snakes', 'tom', 'train_atkins', 'train_dotrice', 'train_kennard', 'weaver', 'william'] + lines = ['<table><th>text</th>' + ''.join([f'<th>{v}</th>' for v in voices])] + line = f'<tr><td>reference clip</td>' + for v in voices: + url = f'https://github.com/neonbjb/tortoise-tts/raw/main/voices/{v}/1.wav' + line = line + f'<td><audio controls><source src="{url}" type="audio/wav"></audio></td>' + line = line + "</tr>" + lines.append(line) + for txt in os.listdir('results/various/'): + if 'desktop' in txt: + continue + line = f'<tr><td>{txt}</td>' + for v in voices: + url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/various/{txt}/{v}.mp3' + line = line + f'<td><audio controls><source src="{url}" type="audio/mp3"></audio></td>' + line = line + "</tr>" + lines.append(line) + result = result + '\n'.join(lines) + "</table>" +
+ result = result + "<h1>Longform result for all voices:</h1>" + for lf in os.listdir('results/riding_hood'): + url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/riding_hood/{lf}' + result = result + f'<audio controls><source src="{url}" type="audio/mp3"></audio><br>
\n' + + result = result + "" + with open('result.html', 'w', encoding='utf-8') as f: + f.write(result) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..019e48d35f1e8747d7ea489f2d4911790255c225 --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +import setuptools + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +setuptools.setup( + name="TorToiSe", + packages=setuptools.find_packages(), + version="2.1.3", + author="James Betker", + author_email="james@adamant.ai", + description="A high quality multi-voice text-to-speech library", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/neonbjb/tortoise-tts", + project_urls={}, + install_requires=[ + 'tqdm', + 'rotary_embedding_torch', + 'inflect', + 'progressbar', + 'einops', + 'unidecode', + 'scipy', + 'librosa', + 'transformers', + 'tokenizers', + ], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + python_requires=">=3.6", +) \ No newline at end of file diff --git a/sweep.py b/sweep.py new file mode 100644 index 0000000000000000000000000000000000000000..bc72fec51ce0fea14479ca65a0bb42ad4889f4e9 --- /dev/null +++ b/sweep.py @@ -0,0 +1,65 @@ +import os +from random import shuffle + +import torchaudio + +from api import TextToSpeech +from utils.audio import load_audio + + +def permutations(args): + res = [] + k = next(iter(args.keys())) + vals = args[k] + del args[k] + if not args: + return [{k: v} for v in vals] + lower = permutations(args) + for v in vals: + for l in lower: + lc = l.copy() + lc[k] = v + res.append(lc) + return res + + +if __name__ == '__main__': + fname = 'Y:\\clips\\books2\\subset512-oco.tsv' + stop_after = 512 + outpath_base = 'D:\\tmp\\tortoise-tts-eval\\sweep-2' + outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real' + + arg_ranges = { + 'top_p': [.8,1], + 'temperature': [.8,.9,1], + 'diffusion_temperature': [.8,1], + 'cond_free_k': [1,2,5,10], + } + cfgs = permutations(arg_ranges) + shuffle(cfgs) + + for cfg in cfgs: + cfg_desc = '_'.join([f'{k}-{v}' for k,v in cfg.items()]) + outpath = os.path.join(outpath_base, f'{cfg_desc}') + os.makedirs(outpath, exist_ok=True) + os.makedirs(outpath_real, exist_ok=True) + with open(fname, 'r', encoding='utf-8') as f: + lines = [l.strip().split('\t') for l in f.readlines()] + + recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8') + tts = TextToSpeech() + for e, line in enumerate(lines): + if e >= stop_after: + break + transcript = line[0] + path = os.path.join(os.path.dirname(fname), line[1]) + cond_audio = load_audio(path, 22050) + torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050) + sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=32, repetition_penalty=2.0, + k=1, diffusion_iterations=32, length_penalty=1.0, **cfg) + down = torchaudio.functional.resample(sample, 24000, 22050) + fout_path = os.path.join(outpath, os.path.basename(line[1])) + torchaudio.save(fout_path, down.squeeze(0), 22050) + recorder.write(f'{transcript}\t{fout_path}\n') + recorder.flush() + recorder.close() \ No newline at end of file diff --git a/tortoise/__init__.py b/tortoise/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tortoise/api.py b/tortoise/api.py new file mode 100644 
index 0000000000000000000000000000000000000000..65c7d6e0ad7075cbf598782a211178302c6e83ff --- /dev/null +++ b/tortoise/api.py @@ -0,0 +1,454 @@ +import os +import random +import uuid +from urllib import request + +import torch +import torch.nn.functional as F +import progressbar +import torchaudio + +from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead +from tortoise.models.cvvp import CVVP +from tortoise.models.diffusion_decoder import DiffusionTts +from tortoise.models.autoregressive import UnifiedVoice +from tqdm import tqdm + +from tortoise.models.arch_util import TorchMelSpectrogram +from tortoise.models.clvp import CLVP +from tortoise.models.random_latent_generator import RandomLatentConverter +from tortoise.models.vocoder import UnivNetGenerator +from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel +from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule +from tortoise.utils.tokenizer import VoiceBpeTokenizer +from tortoise.utils.wav2vec_alignment import Wav2VecAlignment + +pbar = None + + +def download_models(specific_models=None): + """ + Call to download all the models that Tortoise uses. + """ + MODELS = { + 'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/autoregressive.pth', + 'classifier.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/classifier.pth', + 'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/clvp.pth', + 'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/cvvp.pth', + 'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/diffusion_decoder.pth', + 'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/vocoder.pth', + 'rlg_auto.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/rlg_auto.pth', + 'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/hf/.models/rlg_diffuser.pth', + } + os.makedirs('.models', exist_ok=True) + def show_progress(block_num, block_size, total_size): + global pbar + if pbar is None: + pbar = progressbar.ProgressBar(maxval=total_size) + pbar.start() + + downloaded = block_num * block_size + if downloaded < total_size: + pbar.update(downloaded) + else: + pbar.finish() + pbar = None + for model_name, url in MODELS.items(): + if specific_models is not None and model_name not in specific_models: + continue + if os.path.exists(f'.models/{model_name}'): + continue + print(f'Downloading {model_name} from {url}...') + request.urlretrieve(url, f'.models/{model_name}', show_progress) + print('Done.') + + +def pad_or_truncate(t, length): + """ + Utility function for forcing to have the specified sequence length, whether by clipping it or padding it with 0s. + """ + if t.shape[-1] == length: + return t + elif t.shape[-1] < length: + return F.pad(t, (0, length-t.shape[-1])) + else: + return t[..., :length] + + +def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True, cond_free_k=1): + """ + Helper function to load a GaussianDiffusion instance configured for use as a vocoder. 
+ """ + return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), model_mean_type='epsilon', + model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps), + conditioning_free=cond_free, conditioning_free_k=cond_free_k) + + +def format_conditioning(clip, cond_length=132300): + """ + Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models. + """ + gap = clip.shape[-1] - cond_length + if gap < 0: + clip = F.pad(clip, pad=(0, abs(gap))) + elif gap > 0: + rand_start = random.randint(0, gap) + clip = clip[:, rand_start:rand_start + cond_length] + mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0) + return mel_clip.unsqueeze(0).cuda() + + +def fix_autoregressive_output(codes, stop_token, complain=True): + """ + This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was + trained on and what the autoregressive code generator creates (which has no padding or end). + This is highly specific to the DVAE being used, so this particular coding will not necessarily work if used with + a different DVAE. This can be inferred by feeding a audio clip padded with lots of zeros on the end through the DVAE + and copying out the last few codes. + + Failing to do this padding will produce speech with a harsh end that sounds like "BLAH" or similar. + """ + # Strip off the autoregressive stop token and add padding. + stop_token_indices = (codes == stop_token).nonzero() + if len(stop_token_indices) == 0: + if complain: + print("No stop tokens found in one of the generated voice clips. This typically means the spoken audio is " + "too long. In some cases, the output will still be good, though. Listen to it and if it is missing words, " + "try breaking up your input text.") + return codes + else: + codes[stop_token_indices] = 83 + stm = stop_token_indices.min().item() + codes[stm:] = 83 + if stm - 3 < codes.shape[0]: + codes[-3] = 45 + codes[-2] = 45 + codes[-1] = 248 + + return codes + + +def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True): + """ + Uses the specified diffusion model to convert discrete codes into a spectrogram. + """ + with torch.no_grad(): + output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. + output_shape = (latents.shape[0], 100, output_seq_len) + precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False) + + noise = torch.randn(output_shape, device=latents.device) * temperature + mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise, + model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, + progress=verbose) + return denormalize_tacotron_mel(mel)[:,:,:output_seq_len] + + +def classify_audio_clip(clip): + """ + Returns whether or not Tortoises' classifier thinks the given clip came from Tortoise. + :param clip: torch tensor containing audio waveform data (get it from load_audio) + :return: True if the clip was classified as coming from Tortoise and false if it was classified as real. 
+ """ + download_models(['classifier.pth']) + classifier = AudioMiniEncoderWithClassifierHead(2, spec_dim=1, embedding_dim=512, depth=5, downsample_factor=4, + resnet_blocks=2, attn_blocks=4, num_attn_heads=4, base_channels=32, + dropout=0, kernel_size=5, distribute_zero_label=False) + classifier.load_state_dict(torch.load('.models/classifier.pth', map_location=torch.device('cpu'))) + clip = clip.cpu().unsqueeze(0) + results = F.softmax(classifier(clip), dim=-1) + return results[0][0] + + +class TextToSpeech: + """ + Main entry point into Tortoise. + """ + + def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True): + """ + Constructor + :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing + GPU OOM errors. Larger numbers generates slightly faster. + :param models_dir: Where model weights are stored. This should only be specified if you are providing your own + models, otherwise use the defaults. + :param enable_redaction: When true, text enclosed in brackets are automatically redacted from the spoken output + (but are still rendered by the model). This can be used for prompt engineering. + Default is true. + """ + self.autoregressive_batch_size = autoregressive_batch_size + self.enable_redaction = enable_redaction + if self.enable_redaction: + self.aligner = Wav2VecAlignment() + + self.tokenizer = VoiceBpeTokenizer() + download_models() + + if os.path.exists(f'{models_dir}/autoregressive.ptt'): + # Assume this is a traced directory. + self.autoregressive = torch.jit.load(f'{models_dir}/autoregressive.ptt') + self.diffusion = torch.jit.load(f'{models_dir}/diffusion_decoder.ptt') + else: + self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30, + model_dim=1024, + heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False, + train_solo_embeddings=False).cpu().eval() + self.autoregressive.load_state_dict(torch.load(f'{models_dir}/autoregressive.pth')) + + self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200, + in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16, + layer_drop=0, unconditioned_percentage=0).cpu().eval() + self.diffusion.load_state_dict(torch.load(f'{models_dir}/diffusion_decoder.pth')) + + self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12, + text_seq_len=350, text_heads=8, + num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430, + use_xformers=True).cpu().eval() + self.clvp.load_state_dict(torch.load(f'{models_dir}/clvp.pth')) + + self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0, + speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval() + self.cvvp.load_state_dict(torch.load(f'{models_dir}/cvvp.pth')) + + self.vocoder = UnivNetGenerator().cpu() + self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g']) + self.vocoder.eval(inference=True) + + # Random latent generators (RLGs) are loaded lazily. + self.rlg_auto = None + self.rlg_diffusion = None + + def get_conditioning_latents(self, voice_samples, return_mels=False): + """ + Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). 
+ These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic + properties. + :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data. + """ + voice_samples = [v.to('cuda') for v in voice_samples] + + auto_conds = [] + if not isinstance(voice_samples, list): + voice_samples = [voice_samples] + for vs in voice_samples: + auto_conds.append(format_conditioning(vs)) + auto_conds = torch.stack(auto_conds, dim=1) + self.autoregressive = self.autoregressive.cuda() + auto_latent = self.autoregressive.get_conditioning(auto_conds) + self.autoregressive = self.autoregressive.cpu() + + diffusion_conds = [] + for sample in voice_samples: + # The diffuser operates at a sample rate of 24000 (except for the latent inputs) + sample = torchaudio.functional.resample(sample, 22050, 24000) + sample = pad_or_truncate(sample, 102400) + cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False) + diffusion_conds.append(cond_mel) + diffusion_conds = torch.stack(diffusion_conds, dim=1) + + self.diffusion = self.diffusion.cuda() + diffusion_latent = self.diffusion.get_conditioning(diffusion_conds) + self.diffusion = self.diffusion.cpu() + + if return_mels: + return auto_latent, diffusion_latent, auto_conds, diffusion_conds + else: + return auto_latent, diffusion_latent + + def get_random_conditioning_latents(self): + # Lazy-load the RLG models. + if self.rlg_auto is None: + self.rlg_auto = RandomLatentConverter(1024).eval() + self.rlg_auto.load_state_dict(torch.load('.models/rlg_auto.pth', map_location=torch.device('cpu'))) + self.rlg_diffusion = RandomLatentConverter(2048).eval() + self.rlg_diffusion.load_state_dict(torch.load('.models/rlg_diffuser.pth', map_location=torch.device('cpu'))) + with torch.no_grad(): + return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0])) + + def tts_with_preset(self, text, preset='fast', **kwargs): + """ + Calls TTS with one of a set of preset generation parameters. Options: + 'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest). + 'fast': Decent quality speech at a decent inference rate. A good choice for mass inference. + 'standard': Very good quality. This is generally about as good as you are going to get. + 'high_quality': Use if you want the absolute best. This is not really worth the compute, though. + """ + # Use generally found best tuning knobs for generation. + kwargs.update({'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0, + 'top_p': .8, + 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}) + # Presets are defined here. 
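Ahead of the preset table that follows, a hedged end-to-end sketch of how this entry point is used by the scripts elsewhere in this diff (do_tts.py, read.py). The reference clip paths and output filename are placeholders, and the tortoise.api import assumes the package is installed or on the path:

import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

tts = TextToSpeech()   # moves models to CUDA during generation, so a GPU is assumed
# Two or more roughly 10 second clips of the target speaker, loaded at 22.05 kHz.
clips = [load_audio(p, 22050) for p in ('ref1.wav', 'ref2.wav')]   # placeholder paths
speech = tts.tts_with_preset('Hello from Tortoise.', voice_samples=clips, preset='fast')
# The output is a 24 kHz waveform tensor; drop the batch dimension before saving.
torchaudio.save('hello.wav', speech.squeeze(0).cpu(), 24000)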
+ presets = { + 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False}, + 'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80}, + 'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200}, + 'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400}, + } + kwargs.update(presets[preset]) + return self.tts(text, **kwargs) + + def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, + # autoregressive generation parameters follow + num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500, + # CLVP & CVVP parameters + clvp_cvvp_slider=.5, + # diffusion generation parameters follow + diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0, + **hf_generate_kwargs): + """ + Produces an audio clip of the given text being spoken with the given reference voice. + :param text: Text to be spoken. + :param voice_samples: List of 2 or more ~10 second reference clips which should be torch tensors containing 22.05kHz waveform data. + :param conditioning_latents: A tuple of (autoregressive_conditioning_latent, diffusion_conditioning_latent), which + can be provided in lieu of voice_samples. This is ignored unless voice_samples=None. + Conditioning latents can be retrieved via get_conditioning_latents(). + :param k: The number of returned clips. The most likely (as determined by Tortoises' CLVP and CVVP models) clips are returned. + :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true. + ~~AUTOREGRESSIVE KNOBS~~ + :param num_autoregressive_samples: Number of samples taken from the autoregressive model, all of which are filtered using CLVP+CVVP. + As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great". + :param temperature: The softmax temperature of the autoregressive model. + :param length_penalty: A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs. + :param repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence + of long silences or "uhhhhhhs", etc. + :param top_p: P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely" (aka boring) outputs. + :param max_mel_tokens: Restricts the output length. (0,600] integer. Each unit is 1/20 of a second. + :param typical_sampling: Turns typical sampling on or off. This sampling mode is discussed in this paper: https://arxiv.org/abs/2202.00666 + I was interested in the premise, but the results were not as good as I was hoping. This is off by default, but + could use some tuning. + :param typical_mass: The typical_mass parameter from the typical_sampling algorithm. + ~~CLVP-CVVP KNOBS~~ + :param clvp_cvvp_slider: Controls the influence of the CLVP and CVVP models in selecting the best output from the autoregressive model. + [0,1]. Values closer to 1 will cause Tortoise to emit clips that follow the text more. Values closer to + 0 will cause Tortoise to emit clips that more closely follow the reference clip (e.g. the voice sounds more + similar). + ~~DIFFUSION KNOBS~~ + :param diffusion_iterations: Number of diffusion steps to perform. [0,4000]. 
More steps means the network has more chances to iteratively refine + the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, + however. + :param cond_free: Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for + each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output + of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and + dramatically improves realism. + :param cond_free_k: Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf]. + As cond_free_k increases, the output becomes dominated by the conditioning-free signal. + Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k + :param diffusion_temperature: Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 + are the "mean" prediction of the diffusion network and will sound bland and smeared. + ~~OTHER STUFF~~ + :param hf_generate_kwargs: The huggingface Transformers generate API is used for the autoregressive transformer. + Extra keyword args fed to this function get forwarded directly to that API. Documentation + here: https://huggingface.co/docs/transformers/internal/generation_utils + :return: Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length. + Sample rate is 24kHz. + """ + text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda() + text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. + assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.' 
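The two blending rules described in the docstring above can be restated as standalone helpers. This is a minimal sketch with descriptive names that are not taken from the source:

def blend_cond_free(cond_present_output, cond_absent_output, cond_free_k):
    # Conditioning-free guidance: output = cond_present*(cond_free_k+1) - cond_absent*cond_free_k
    return cond_present_output * (cond_free_k + 1) - cond_absent_output * cond_free_k

def rerank_score(clvp_score, cvvp_score, clvp_cvvp_slider=0.5):
    # clvp_cvvp_slider near 1 favours the text-matching CLVP score,
    # clvp_cvvp_slider near 0 favours the voice-matching CVVP score.
    return clvp_score * clvp_cvvp_slider + cvvp_score * (1 - clvp_cvvp_slider)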
+ + auto_conds = None + if voice_samples is not None: + auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True) + elif conditioning_latents is not None: + auto_conditioning, diffusion_conditioning = conditioning_latents + else: + auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents() + auto_conditioning = auto_conditioning.cuda() + diffusion_conditioning = diffusion_conditioning.cuda() + + diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k) + + with torch.no_grad(): + samples = [] + num_batches = num_autoregressive_samples // self.autoregressive_batch_size + stop_mel_token = self.autoregressive.stop_mel_token + calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output" + self.autoregressive = self.autoregressive.cuda() + if verbose: + print("Generating autoregressive samples..") + for b in tqdm(range(num_batches), disable=not verbose): + codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens, + do_sample=True, + top_p=top_p, + temperature=temperature, + num_return_sequences=self.autoregressive_batch_size, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + max_generate_length=max_mel_tokens, + **hf_generate_kwargs) + padding_needed = max_mel_tokens - codes.shape[1] + codes = F.pad(codes, (0, padding_needed), value=stop_mel_token) + samples.append(codes) + self.autoregressive = self.autoregressive.cpu() + + clip_results = [] + self.clvp = self.clvp.cuda() + self.cvvp = self.cvvp.cuda() + if verbose: + print("Computing best candidates using CLVP and CVVP") + for batch in tqdm(samples, disable=not verbose): + for i in range(batch.shape[0]): + batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) + clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False) + if auto_conds is not None: + cvvp_accumulator = 0 + for cl in range(auto_conds.shape[1]): + cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False) + cvvp = cvvp_accumulator / auto_conds.shape[1] + clip_results.append(clvp * clvp_cvvp_slider + cvvp * (1-clvp_cvvp_slider)) + else: + clip_results.append(clvp) + clip_results = torch.cat(clip_results, dim=0) + samples = torch.cat(samples, dim=0) + best_results = samples[torch.topk(clip_results, k=k).indices] + self.clvp = self.clvp.cpu() + self.cvvp = self.cvvp.cpu() + del samples + + # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning + # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these + # results, but will increase memory usage. 
+ self.autoregressive = self.autoregressive.cuda() + best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1), + torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results, + torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device), + return_latent=True, clip_inputs=False) + self.autoregressive = self.autoregressive.cpu() + del auto_conditioning + + if verbose: + print("Transforming autoregressive outputs into audio..") + wav_candidates = [] + self.diffusion = self.diffusion.cuda() + self.vocoder = self.vocoder.cuda() + for b in range(best_results.shape[0]): + codes = best_results[b].unsqueeze(0) + latents = best_latents[b].unsqueeze(0) + + # Find the first occurrence of the "calm" token and trim the codes to that. + ctokens = 0 + for k in range(codes.shape[-1]): + if codes[0, k] == calm_token: + ctokens += 1 + else: + ctokens = 0 + if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech. + latents = latents[:, :k] + break + + mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning, + temperature=diffusion_temperature, verbose=verbose) + wav = self.vocoder.inference(mel) + wav_candidates.append(wav.cpu()) + self.diffusion = self.diffusion.cpu() + self.vocoder = self.vocoder.cpu() + + def potentially_redact(clip, text): + if self.enable_redaction: + return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1) + return clip + wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates] + if len(wav_candidates) > 1: + return wav_candidates + return wav_candidates[0] + diff --git a/tortoise/data/mel_norms.pth b/tortoise/data/mel_norms.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed4d6e4f71fba223d920da25f1bbd0c8619433b5 --- /dev/null +++ b/tortoise/data/mel_norms.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f69422a8a8f344c4fca2f0c6b8d41d2151d6615b7321e48e6bb15ae949b119c +size 1067 diff --git a/tortoise/data/riding_hood.txt b/tortoise/data/riding_hood.txt new file mode 100644 index 0000000000000000000000000000000000000000..2987bef78f92ecb327fc0f754b7ab1211a18542b --- /dev/null +++ b/tortoise/data/riding_hood.txt @@ -0,0 +1,54 @@ +Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her. It suited the girl so extremely well that everybody called her Little Red Riding Hood. +One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter." + +Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. + +As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest. He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." + +"Does she live far off?" said the wolf + +"Oh I say," answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." 
+ +"Well," said the wolf, "and I'll go and see her too. I'll go this way and go you that, and we shall see who will be there first." + +The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers. It was not long before the wolf arrived at the old woman's house. He knocked at the door: tap, tap. + +"Who's there?" + +"Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother." + +The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up." + +The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten. He then shut the door and got into the grandmother's bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. + +"Who's there?" + +Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you." + +The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." + +Little Red Riding Hood pulled the bobbin, and the door opened. + +The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." + +Little Red Riding Hood took off her clothes and got into bed. She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" + +"All the better to hug you with, my dear." + +"Grandmother, what big legs you have!" + +"All the better to run with, my child." + +"Grandmother, what big ears you have!" + +"All the better to hear with, my child." + +"Grandmother, what big eyes you have!" + +"All the better to see with, my child." + +"Grandmother, what big teeth you have got!" + +"All the better to eat you up with." + +And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up. \ No newline at end of file diff --git a/tortoise/data/seal_copypasta.txt b/tortoise/data/seal_copypasta.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce59a386070125650d3c6d8e8a13801d3666aa5f --- /dev/null +++ b/tortoise/data/seal_copypasta.txt @@ -0,0 +1 @@ +What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al kayda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire U S armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the U S A and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. 
You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo. \ No newline at end of file diff --git a/tortoise/data/tokenizer.json b/tortoise/data/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a128f273053e465a15c488e48d8106e0c8b0898e --- /dev/null +++ b/tortoise/data/tokenizer.json @@ -0,0 +1 @@ +{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217
,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} \ No newline at end of file diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..77df67d452b36413c1b2e4082758db256f4d5407 --- /dev/null +++ b/tortoise/do_tts.py @@ -0,0 +1,32 @@ +import argparse +import os + +import torchaudio + +from api import TextToSpeech +from tortoise.utils.audio import load_audio, get_voices, load_voice + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--text', type=str, help='Text to speak.', default="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.") + parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) ' + 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random') + parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast') + parser.add_argument('--voice_diversity_intelligibility_slider', type=float, + help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility', + default=.5) + parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/') + parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. 
Tortoise automatically downloads these to .models, so this' + 'should only be specified if you have custom checkpoints.', default='.models') + args = parser.parse_args() + os.makedirs(args.output_path, exist_ok=True) + + tts = TextToSpeech(models_dir=args.model_dir) + + selected_voices = args.voice.split(',') + for k, voice in enumerate(selected_voices): + voice_samples, conditioning_latents = load_voice(voice) + gen = tts.tts_with_preset(args.text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, + preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider) + torchaudio.save(os.path.join(args.output_path, f'{voice}_{k}.wav'), gen.squeeze(0).cpu(), 24000) + diff --git a/tortoise/get_conditioning_latents.py b/tortoise/get_conditioning_latents.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7e9b7dde64e4867cfdad025d739ca7fbff425f --- /dev/null +++ b/tortoise/get_conditioning_latents.py @@ -0,0 +1,30 @@ +import argparse +import os +import torch + +from api import TextToSpeech +from tortoise.utils.audio import load_audio, get_voices + +""" +Dumps the conditioning latents for the specified voice to disk. These are expressive latents which can be used for +other ML models, or can be augmented manually and fed back into Tortoise to affect vocal qualities. +""" +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat2') + parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/conditioning_latents') + args = parser.parse_args() + os.makedirs(args.output_path, exist_ok=True) + + tts = TextToSpeech() + voices = get_voices() + selected_voices = args.voice.split(',') + for voice in selected_voices: + cond_paths = voices[voice] + conds = [] + for cond_path in cond_paths: + c = load_audio(cond_path, 22050) + conds.append(c) + conditioning_latents = tts.get_conditioning_latents(conds) + torch.save(conditioning_latents, os.path.join(args.output_path, f'{voice}.pth')) + diff --git a/tortoise/is_this_from_tortoise.py b/tortoise/is_this_from_tortoise.py new file mode 100644 index 0000000000000000000000000000000000000000..289844f499fb45694bfb61f395867b81155daf8b --- /dev/null +++ b/tortoise/is_this_from_tortoise.py @@ -0,0 +1,14 @@ +import argparse + +from api import classify_audio_clip +from tortoise.utils.audio import load_audio + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--clip', type=str, help='Path to an audio clip to classify.', default="../examples/favorite_riding_hood.mp3") + args = parser.parse_args() + + clip = load_audio(args.clip, 24000) + clip = clip[:, :220000] + prob = classify_audio_clip(clip) + print(f"This classifier thinks there is a {prob*100}% chance that this clip was generated from Tortoise.") \ No newline at end of file diff --git a/tortoise/models/__init__.py b/tortoise/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tortoise/models/arch_util.py b/tortoise/models/arch_util.py new file mode 100644 index 0000000000000000000000000000000000000000..5d8c36e9d4dfeabb82c46cdbc083bbf12fb8f757 --- /dev/null +++ b/tortoise/models/arch_util.py @@ -0,0 +1,367 @@ +import functools +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from tortoise.models.xtransformers import 
ContinuousTransformerWrapper, RelativePositionBias + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def normalization(channels): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + groups = 32 + if channels <= 16: + groups = 8 + elif channels <= 64: + groups = 16 + while channels % groups != 0: + groups = int(groups / 2) + assert groups > 2 + return GroupNorm32(groups, channels) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv, mask=None, rel_pos=None): + """ + Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = torch.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + if rel_pos is not None: + weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(bs * self.n_heads, weight.shape[-2], weight.shape[-1]) + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + if mask is not None: + # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs. + mask = mask.repeat(self.n_heads, 1).unsqueeze(1) + weight = weight * mask + a = torch.einsum("bts,bcs->bct", weight, v) + + return a.reshape(bs, -1, length) + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
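# Aside: a minimal shape sketch for QKVAttentionLegacy defined above. This is an
# illustrative snippet, not part of the diff; it assumes the diff is applied and
# `tortoise` is on the import path. The packed qkv tensor is [N x (H * 3 * C) x T]
# and attention returns [N x (H * C) x T].
import torch
from tortoise.models.arch_util import QKVAttentionLegacy

attn = QKVAttentionLegacy(n_heads=4)
qkv = torch.randn(2, 4 * 3 * 16, 100)    # N=2, H=4, C=16, T=100
out = attn(qkv)
assert out.shape == (2, 4 * 16, 100)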
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + do_checkpoint=True, + relative_pos_embeddings=False, + ): + super().__init__() + self.channels = channels + self.do_checkpoint = do_checkpoint + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.norm = normalization(channels) + self.qkv = nn.Conv1d(channels, channels * 3, 1) + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) + if relative_pos_embeddings: + self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64) + else: + self.relative_pos_embeddings = None + + def forward(self, x, mask=None): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv, mask, self.relative_pos_embeddings) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + """ + + def __init__(self, channels, use_conv, out_channels=None, factor=4): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.factor = factor + if use_conv: + ksize = 5 + pad = 2 + self.conv = nn.Conv1d(self.channels, self.out_channels, ksize, padding=pad) + + def forward(self, x): + assert x.shape[1] == self.channels + x = F.interpolate(x, scale_factor=self.factor, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. 
+ """ + + def __init__(self, channels, use_conv, out_channels=None, factor=4, ksize=5, pad=2): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + + stride = factor + if use_conv: + self.op = nn.Conv1d( + self.channels, self.out_channels, ksize, stride=stride, padding=pad + ) + else: + assert self.channels == self.out_channels + self.op = nn.AvgPool1d(kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(nn.Module): + def __init__( + self, + channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + up=False, + down=False, + kernel_size=3, + ): + super().__init__() + self.channels = channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_scale_shift_norm = use_scale_shift_norm + padding = 1 if kernel_size == 3 else 2 + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False) + self.x_upd = Upsample(channels, False) + elif down: + self.h_upd = Downsample(channels, False) + self.x_upd = Downsample(channels, False) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = nn.Conv1d( + channels, self.out_channels, kernel_size, padding=padding + ) + else: + self.skip_connection = nn.Conv1d(channels, self.out_channels, 1) + + def forward(self, x): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AudioMiniEncoder(nn.Module): + def __init__(self, + spec_dim, + embedding_dim, + base_channels=128, + depth=2, + resnet_blocks=2, + attn_blocks=4, + num_attn_heads=4, + dropout=0, + downsample_factor=2, + kernel_size=3): + super().__init__() + self.init = nn.Sequential( + nn.Conv1d(spec_dim, base_channels, 3, padding=1) + ) + ch = base_channels + res = [] + for l in range(depth): + for r in range(resnet_blocks): + res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) + res.append(Downsample(ch, use_conv=True, out_channels=ch*2, factor=downsample_factor)) + ch *= 2 + self.res = nn.Sequential(*res) + self.final = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.Conv1d(ch, embedding_dim, 1) + ) + attn = [] + for a in range(attn_blocks): + attn.append(AttentionBlock(embedding_dim, num_attn_heads,)) + self.attn = nn.Sequential(*attn) + self.dim = embedding_dim + + def forward(self, x): + h = self.init(x) + h = self.res(h) + h = self.final(h) + h = self.attn(h) + return h[:, :, 0] + + +class TorchMelSpectrogram(nn.Module): + def __init__(self, filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, mel_fmin=0, mel_fmax=8000, + sampling_rate=22050, normalize=False, mel_norm_file='tortoise/data/mel_norms.pth'): + super().__init__() + # These are the default tacotron values for the MEL spectrogram. 
+ self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.n_mel_channels = n_mel_channels + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.sampling_rate = sampling_rate + self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length, hop_length=self.hop_length, + win_length=self.win_length, power=2, normalized=normalize, + sample_rate=self.sampling_rate, f_min=self.mel_fmin, + f_max=self.mel_fmax, n_mels=self.n_mel_channels, + norm="slaney") + self.mel_norm_file = mel_norm_file + if self.mel_norm_file is not None: + self.mel_norms = torch.load(self.mel_norm_file) + else: + self.mel_norms = None + + def forward(self, inp): + if len(inp.shape) == 3: # Automatically squeeze out the channels dimension if it is present (assuming mono-audio) + inp = inp.squeeze(1) + assert len(inp.shape) == 2 + self.mel_stft = self.mel_stft.to(inp.device) + mel = self.mel_stft(inp) + # Perform dynamic range compression + mel = torch.log(torch.clamp(mel, min=1e-5)) + if self.mel_norms is not None: + self.mel_norms = self.mel_norms.to(mel.device) + mel = mel / self.mel_norms.unsqueeze(0).unsqueeze(-1) + return mel + + +class CheckpointedLayer(nn.Module): + """ + Wraps a module. When forward() is called, passes kwargs that require_grad through torch.checkpoint() and bypasses + checkpoint for all other args. + """ + def __init__(self, wrap): + super().__init__() + self.wrap = wrap + + def forward(self, x, *args, **kwargs): + for k, v in kwargs.items(): + assert not (isinstance(v, torch.Tensor) and v.requires_grad) # This would screw up checkpointing. + partial = functools.partial(self.wrap, **kwargs) + return torch.utils.checkpoint.checkpoint(partial, x, *args) + + +class CheckpointedXTransformerEncoder(nn.Module): + """ + Wraps a ContinuousTransformerWrapper and applies CheckpointedLayer to each layer and permutes from channels-mid + to channels-last that XTransformer expects. 
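# Aside: usage sketch for TorchMelSpectrogram above (illustrative only; assumes the
# diff is applied). Passing mel_norm_file=None skips the per-channel normalization
# that otherwise loads tortoise/data/mel_norms.pth.
import torch
from tortoise.models.arch_util import TorchMelSpectrogram

to_mel = TorchMelSpectrogram(mel_norm_file=None)
wav = torch.randn(1, 22050)              # one second of mono 22.05 kHz audio, (b, s)
mel = to_mel(wav)                        # dynamic-range-compressed mel, (b, 80, ~87 frames)
print(mel.shape)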
+ """ + def __init__(self, needs_permute=True, exit_permute=True, checkpoint=True, **xtransformer_kwargs): + super().__init__() + self.transformer = ContinuousTransformerWrapper(**xtransformer_kwargs) + self.needs_permute = needs_permute + self.exit_permute = exit_permute + + if not checkpoint: + return + for i in range(len(self.transformer.attn_layers.layers)): + n, b, r = self.transformer.attn_layers.layers[i] + self.transformer.attn_layers.layers[i] = nn.ModuleList([n, CheckpointedLayer(b), r]) + + def forward(self, x, **kwargs): + if self.needs_permute: + x = x.permute(0,2,1) + h = self.transformer(x, **kwargs) + if self.exit_permute: + h = h.permute(0,2,1) + return h \ No newline at end of file diff --git a/tortoise/models/autoregressive.py b/tortoise/models/autoregressive.py new file mode 100644 index 0000000000000000000000000000000000000000..757a7a8555b3bbc1ca0cff9c38cf0d8699c0c4b7 --- /dev/null +++ b/tortoise/models/autoregressive.py @@ -0,0 +1,511 @@ +import functools + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList +from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions +from transformers.utils.model_parallel_utils import get_device_map, assert_device_map +from tortoise.models.arch_util import AttentionBlock +from tortoise.utils.typical_sampling import TypicalLogitsWarper + + +def null_position_embeddings(range, dim): + return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device) + + +class ResBlock(nn.Module): + """ + Basic residual convolutional block that uses GroupNorm. + """ + def __init__(self, chan): + super().__init__() + self.net = nn.Sequential( + nn.Conv1d(chan, chan, kernel_size=3, padding=1), + nn.GroupNorm(chan//8, chan), + nn.ReLU(), + nn.Conv1d(chan, chan, kernel_size=3, padding=1), + nn.GroupNorm(chan//8, chan) + ) + + def forward(self, x): + return F.relu(self.net(x) + x) + + +class GPT2InferenceModel(GPT2PreTrainedModel): + def __init__(self, config, gpt, text_pos_emb, embeddings, norm, linear): + super().__init__(config) + self.transformer = gpt + self.text_pos_embedding = text_pos_emb + self.embeddings = embeddings + self.lm_head = nn.Sequential(norm, linear) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.cached_mel_emb = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.model_parallel = True + + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def store_mel_emb(self, mel_emb): + self.cached_mel_emb = mel_emb + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = 
kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + assert self.cached_mel_emb is not None + assert inputs_embeds is None # Not supported by this inference model. + assert labels is None # Training not supported by this inference model. + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Create embedding + mel_len = self.cached_mel_emb.shape[1] + if input_ids.shape[1] != 1: + text_inputs = input_ids[:, mel_len:] + text_emb = self.embeddings(text_inputs) + text_emb = text_emb + self.text_pos_embedding(text_emb) + if self.cached_mel_emb.shape[0] != text_emb.shape[0]: + mel_emb = self.cached_mel_emb.repeat_interleave(text_emb.shape[0]//self.cached_mel_emb.shape[0], 0) + else: + mel_emb = self.cached_mel_emb + emb = torch.cat([mel_emb, text_emb], dim=1) + else: + emb = self.embeddings(input_ids) + emb = emb + self.text_pos_embedding.get_fixed_embedding(attention_mask.shape[1]-mel_len, attention_mask.device) + + transformer_outputs = self.transformer( + inputs_embeds=emb, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + transformer_outputs[1:] + + return CausalLMOutputWithCrossAttentions( + loss=None, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past, beam_idx): + """ + This function is used to re-order the :obj:`past_key_values` cache if + :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is + called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step. 
+ """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +class ConditioningEncoder(nn.Module): + def __init__(self, + spec_dim, + embedding_dim, + attn_blocks=6, + num_attn_heads=4, + do_checkpointing=False, + mean=False): + super().__init__() + attn = [] + self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) + for a in range(attn_blocks): + attn.append(AttentionBlock(embedding_dim, num_attn_heads)) + self.attn = nn.Sequential(*attn) + self.dim = embedding_dim + self.do_checkpointing = do_checkpointing + self.mean = mean + + def forward(self, x): + h = self.init(x) + h = self.attn(h) + if self.mean: + return h.mean(dim=2) + else: + return h[:, :, 0] + + +class LearnedPositionEmbeddings(nn.Module): + def __init__(self, seq_len, model_dim, init=.02): + super().__init__() + self.emb = nn.Embedding(seq_len, model_dim) + # Initializing this way is standard for GPT-2 + self.emb.weight.data.normal_(mean=0.0, std=init) + + def forward(self, x): + sl = x.shape[1] + return self.emb(torch.arange(0, sl, device=x.device)) + + def get_fixed_embedding(self, ind, dev): + return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) + + +def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing): + """ + GPT-2 implemented by the HuggingFace library. + """ + from transformers import GPT2Config, GPT2Model + gpt_config = GPT2Config(vocab_size=256, # Unused. + n_positions=max_mel_seq_len+max_text_seq_len, + n_ctx=max_mel_seq_len+max_text_seq_len, + n_embd=model_dim, + n_layer=layers, + n_head=heads, + gradient_checkpointing=checkpointing, + use_cache=not checkpointing) + gpt = GPT2Model(gpt_config) + # Override the built in positional embeddings + del gpt.wpe + gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) + # Built-in token embeddings are unused. + del gpt.wte + return gpt, LearnedPositionEmbeddings(max_mel_seq_len, model_dim), LearnedPositionEmbeddings(max_text_seq_len, model_dim),\ + None, None + + +class MelEncoder(nn.Module): + def __init__(self, channels, mel_channels=80, resblocks_per_reduction=2): + super().__init__() + self.channels = channels + self.encoder = nn.Sequential(nn.Conv1d(mel_channels, channels//4, kernel_size=3, padding=1), + nn.Sequential(*[ResBlock(channels//4) for _ in range(resblocks_per_reduction)]), + nn.Conv1d(channels//4, channels//2, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(channels//16, channels//2), + nn.ReLU(), + nn.Sequential(*[ResBlock(channels//2) for _ in range(resblocks_per_reduction)]), + nn.Conv1d(channels//2, channels, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(channels//8, channels), + nn.ReLU(), + nn.Sequential(*[ResBlock(channels) for _ in range(resblocks_per_reduction)]), + ) + self.reduction = 4 + + + def forward(self, x): + for e in self.encoder: + x = e(x) + return x.permute(0,2,1) + + +class UnifiedVoice(nn.Module): + def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_mel_tokens=250, max_conditioning_inputs=1, + mel_length_compression=1024, number_text_tokens=256, + start_text_token=None, number_mel_codes=8194, start_mel_token=8192, + stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True, + checkpointing=True, types=1): + """ + Args: + layers: Number of layers in transformer stack. + model_dim: Operating dimensions of the transformer + heads: Number of transformer heads. Must be divisible by model_dim. 
Recommend model_dim//64 + max_text_tokens: Maximum number of text tokens that will be encountered by model. + max_mel_tokens: Maximum number of MEL tokens that will be encountered by model. + max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s). + mel_length_compression: The factor between and . Used to compute MEL code padding given wav input length. + number_text_tokens: + start_text_token: + stop_text_token: + number_mel_codes: + start_mel_token: + stop_mel_token: + train_solo_embeddings: + use_mel_codes_as_input: + checkpointing: + """ + super().__init__() + + self.number_text_tokens = number_text_tokens + self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token + self.stop_text_token = 0 + self.number_mel_codes = number_mel_codes + self.start_mel_token = start_mel_token + self.stop_mel_token = stop_mel_token + self.layers = layers + self.heads = heads + self.max_mel_tokens = max_mel_tokens + self.max_text_tokens = max_text_tokens + self.model_dim = model_dim + self.max_conditioning_inputs = max_conditioning_inputs + self.mel_length_compression = mel_length_compression + self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads) + self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim) + if use_mel_codes_as_input: + self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim) + else: + self.mel_embedding = MelEncoder(model_dim, resblocks_per_reduction=1) + self.gpt, self.mel_pos_embedding, self.text_pos_embedding, self.mel_layer_pos_embedding, self.text_layer_pos_embedding = \ + build_hf_gpt_transformer(layers, model_dim, heads, self.max_mel_tokens+2+self.max_conditioning_inputs, self.max_text_tokens+2, checkpointing) + if train_solo_embeddings: + self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True) + self.text_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True) + else: + self.mel_solo_embedding = 0 + self.text_solo_embedding = 0 + + self.final_norm = nn.LayerNorm(model_dim) + self.text_head = nn.Linear(model_dim, self.number_text_tokens*types+1) + self.mel_head = nn.Linear(model_dim, self.number_mel_codes) + + # Initialize the embeddings per the GPT-2 scheme + embeddings = [self.text_embedding] + if use_mel_codes_as_input: + embeddings.append(self.mel_embedding) + for module in embeddings: + module.weight.data.normal_(mean=0.0, std=.02) + + def build_aligned_inputs_and_targets(self, input, start_token, stop_token): + inp = F.pad(input, (1,0), value=start_token) + tar = F.pad(input, (0,1), value=stop_token) + return inp, tar + + def set_mel_padding(self, mel_input_tokens, wav_lengths): + """ + Given mel tokens that are derived from a padded audio clip and the actual lengths of each batch element in + that audio clip, reformats the tokens with STOP_MEL_TOKEN in place of the zero padding. This is required + preformatting to create a working TTS model. + """ + # Set padding areas within MEL (currently it is coded with the MEL code for ). + mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode='trunc') + for b in range(len(mel_lengths)): + actual_end = mel_lengths[b] + 1 # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token. 
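# Aside: pure-tensor illustration of build_aligned_inputs_and_targets above.
# Inputs gain a leading start token and targets gain a trailing stop token, so
# target position i is what the model should emit after reading input i. The token
# values here are placeholders; by default the model uses start_text_token=256,
# stop_text_token=0, start_mel_token=8192 and stop_mel_token=8193.
import torch
import torch.nn.functional as F

tokens = torch.tensor([[5, 6, 7]])
inp = F.pad(tokens, (1, 0), value=256)   # -> [[256, 5, 6, 7]]
tar = F.pad(tokens, (0, 1), value=0)     # -> [[5, 6, 7, 0]]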
+ if actual_end < mel_input_tokens.shape[-1]: + mel_input_tokens[b, actual_end:] = self.stop_mel_token + return mel_input_tokens + + def get_logits(self, speech_conditioning_inputs, first_inputs, first_head, second_inputs=None, second_head=None, get_attns=False, return_latent=False): + if second_inputs is not None: + emb = torch.cat([speech_conditioning_inputs, first_inputs, second_inputs], dim=1) + else: + emb = torch.cat([speech_conditioning_inputs, first_inputs], dim=1) + + gpt_out = self.gpt(inputs_embeds=emb, return_dict=True, output_attentions=get_attns) + if get_attns: + return gpt_out.attentions + + enc = gpt_out.last_hidden_state[:, 1:] # The first logit is tied to the speech_conditioning_input + enc = self.final_norm(enc) + + if return_latent: + return enc[:, speech_conditioning_inputs.shape[1]:speech_conditioning_inputs.shape[1]+first_inputs.shape[1]], enc[:, -second_inputs.shape[1]:] + + first_logits = enc[:, :first_inputs.shape[1]] + first_logits = first_head(first_logits) + first_logits = first_logits.permute(0,2,1) + if second_inputs is not None: + second_logits = enc[:, -second_inputs.shape[1]:] + second_logits = second_head(second_logits) + second_logits = second_logits.permute(0,2,1) + return first_logits, second_logits + else: + return first_logits + + def get_conditioning(self, speech_conditioning_input): + speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len( + speech_conditioning_input.shape) == 3 else speech_conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds = torch.stack(conds, dim=1) + conds = conds.mean(dim=1) + return conds + + def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False, + return_latent=False, clip_inputs=True): + """ + Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode + (actuated by `text_first`). + + speech_conditioning_input: MEL float tensor, (b,1024) + text_inputs: long tensor, (b,t) + text_lengths: long tensor, (b,) + mel_inputs: long tensor, (b,m) + wav_lengths: long tensor, (b,) + raw_mels: MEL float tensor (b,80,s) + + If return_attentions is specified, only logits are returned. + If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned. + If clip_inputs is True, the inputs will be clipped to the smallest input size across each input modality. + """ + # Types are expressed by expanding the text embedding space. + if types is not None: + text_inputs = text_inputs * (1+types).unsqueeze(-1) + + if clip_inputs: + # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by + # chopping the inputs by the maximum actual length. 
+ max_text_len = text_lengths.max() + text_inputs = text_inputs[:, :max_text_len] + max_mel_len = wav_lengths.max() // self.mel_length_compression + mel_codes = mel_codes[:, :max_mel_len] + if raw_mels is not None: + raw_mels = raw_mels[:, :, :max_mel_len*4] + mel_codes = self.set_mel_padding(mel_codes, wav_lengths) + text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token) + mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token) + + conds = speech_conditioning_latent.unsqueeze(1) + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token) + if raw_mels is not None: + mel_inp = F.pad(raw_mels, (0, 8)) + else: + mel_inp = mel_codes + mel_emb = self.mel_embedding(mel_inp) + mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + + if text_first: + text_logits, mel_logits = self.get_logits(conds, text_emb, self.text_head, mel_emb, self.mel_head, get_attns=return_attentions, return_latent=return_latent) + if return_latent: + return mel_logits[:, :-2] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass. + else: + mel_logits, text_logits = self.get_logits(conds, mel_emb, self.mel_head, text_emb, self.text_head, get_attns=return_attentions, return_latent=return_latent) + if return_latent: + return text_logits[:, :-2] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass. + + if return_attentions: + return mel_logits + loss_text = F.cross_entropy(text_logits, text_targets.long()) + loss_mel = F.cross_entropy(mel_logits, mel_targets.long()) + return loss_text.mean(), loss_mel.mean(), mel_logits + + def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1, + max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs): + seq_length = self.max_mel_tokens + self.max_text_tokens + 2 + if not hasattr(self, 'inference_model'): + # TODO: Decouple gpt_config from this inference model. 
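# Aside on the loss layout in forward() above: logits are permuted to
# (batch, num_classes, sequence) so that F.cross_entropy can consume per-position
# integer targets of shape (batch, sequence). Minimal standalone check:
import torch
import torch.nn.functional as F

logits = torch.randn(2, 257, 10)           # (b, vocab, seq), i.e. after permute(0, 2, 1)
targets = torch.randint(0, 257, (2, 10))   # (b, seq)
loss = F.cross_entropy(logits, targets.long())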
+ gpt_config = GPT2Config(vocab_size=self.max_mel_tokens, + n_positions=seq_length, + n_ctx=seq_length, + n_embd=self.model_dim, + n_layer=self.layers, + n_head=self.heads, + gradient_checkpointing=False, + use_cache=True) + self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head) + self.gpt.wte = self.mel_embedding + + text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token) + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + + conds = speech_conditioning_latent.unsqueeze(1) + emb = torch.cat([conds, text_emb], dim=1) + self.inference_model.store_mel_emb(emb) + + fake_inputs = torch.full((emb.shape[0], conds.shape[1] + emb.shape[1],), fill_value=1, dtype=torch.long, + device=text_inputs.device) + fake_inputs[:, -1] = self.start_mel_token + trunc_index = fake_inputs.shape[1] + if input_tokens is None: + inputs = fake_inputs + else: + assert num_return_sequences % input_tokens.shape[0] == 0, "The number of return sequences must be divisible by the number of input sequences" + fake_inputs = fake_inputs.repeat(num_return_sequences, 1) + input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) + inputs = torch.cat([fake_inputs, input_tokens], dim=1) + + logits_processor = LogitsProcessorList([TypicalLogitsWarper(mass=typical_mass)]) if typical_sampling else LogitsProcessorList() + max_length = trunc_index + self.max_mel_tokens - 1 if max_generate_length is None else trunc_index + max_generate_length + gen = self.inference_model.generate(inputs, bos_token_id=self.start_mel_token, pad_token_id=self.stop_mel_token, eos_token_id=self.stop_mel_token, + max_length=max_length, logits_processor=logits_processor, + num_return_sequences=num_return_sequences, **hf_generate_kwargs) + return gen[:, trunc_index:] + + +if __name__ == '__main__': + gpt = UnifiedVoice(model_dim=256, heads=4, train_solo_embeddings=True, use_mel_codes_as_input=True, max_conditioning_inputs=4) + l = gpt(torch.randn(2, 3, 80, 800), + torch.randint(high=120, size=(2,120)), + torch.tensor([32, 120]), + torch.randint(high=8192, size=(2,250)), + torch.tensor([250*256,195*256])) + gpt.text_forward(torch.randn(2,80,800), torch.randint(high=50, size=(2,80)), torch.tensor([32, 80])) diff --git a/tortoise/models/classifier.py b/tortoise/models/classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..ce574eabb38b36b832ed27e61c836caf4d626185 --- /dev/null +++ b/tortoise/models/classifier.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + +from tortoise.models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock + + +class ResBlock(nn.Module): + def __init__( + self, + channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + up=False, + down=False, + kernel_size=3, + do_checkpoint=True, + ): + super().__init__() + self.channels = channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_scale_shift_norm = use_scale_shift_norm + self.do_checkpoint = do_checkpoint + padding = 1 if kernel_size == 3 else 2 + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), + ) 
+ + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = nn.Conv1d( + dims, channels, self.out_channels, kernel_size, padding=padding + ) + else: + self.skip_connection = nn.Conv1d(dims, channels, self.out_channels, 1) + + def forward(self, x): + if self.do_checkpoint: + return checkpoint( + self._forward, x + ) + else: + return self._forward(x) + + def _forward(self, x): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AudioMiniEncoder(nn.Module): + def __init__(self, + spec_dim, + embedding_dim, + base_channels=128, + depth=2, + resnet_blocks=2, + attn_blocks=4, + num_attn_heads=4, + dropout=0, + downsample_factor=2, + kernel_size=3): + super().__init__() + self.init = nn.Sequential( + nn.Conv1d(spec_dim, base_channels, 3, padding=1) + ) + ch = base_channels + res = [] + self.layers = depth + for l in range(depth): + for r in range(resnet_blocks): + res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)) + res.append(Downsample(ch, use_conv=True, out_channels=ch*2, factor=downsample_factor)) + ch *= 2 + self.res = nn.Sequential(*res) + self.final = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.Conv1d(ch, embedding_dim, 1) + ) + attn = [] + for a in range(attn_blocks): + attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) + self.attn = nn.Sequential(*attn) + self.dim = embedding_dim + + def forward(self, x): + h = self.init(x) + h = self.res(h) + h = self.final(h) + for blk in self.attn: + h = checkpoint(blk, h) + return h[:, :, 0] + + +class AudioMiniEncoderWithClassifierHead(nn.Module): + def __init__(self, classes, distribute_zero_label=True, **kwargs): + super().__init__() + self.enc = AudioMiniEncoder(**kwargs) + self.head = nn.Linear(self.enc.dim, classes) + self.num_classes = classes + self.distribute_zero_label = distribute_zero_label + + def forward(self, x, labels=None): + h = self.enc(x) + logits = self.head(h) + if labels is None: + return logits + else: + if self.distribute_zero_label: + oh_labels = nn.functional.one_hot(labels, num_classes=self.num_classes) + zeros_indices = (labels == 0).unsqueeze(-1) + # Distribute 20% of the probability mass on all classes when zero is specified, to compensate for dataset noise. 
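# Aside: standalone illustration of the zero-label handling described above. A
# one-hot target for class 0 gives up 0.2 probability mass, spread evenly across
# the remaining classes, before the soft targets reach the cross-entropy loss.
import torch
import torch.nn as nn

num_classes = 5
labels = torch.tensor([0, 2])
oh_labels = nn.functional.one_hot(labels, num_classes=num_classes).float()
zeros_indices = (labels == 0).unsqueeze(-1)
zero_extra_mass = torch.full_like(oh_labels, fill_value=.2 / (num_classes - 1))
zero_extra_mass[:, 0] = -.2
oh_labels = oh_labels + zero_extra_mass * zeros_indices
# row 0 -> [0.80, 0.05, 0.05, 0.05, 0.05]; row 1 stays one-hot on class 2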
+ zero_extra_mass = torch.full_like(oh_labels, dtype=torch.float, fill_value=.2/(self.num_classes-1)) + zero_extra_mass[:, 0] = -.2 + zero_extra_mass = zero_extra_mass * zeros_indices + oh_labels = oh_labels + zero_extra_mass + else: + oh_labels = labels + loss = nn.functional.cross_entropy(logits, oh_labels) + return loss diff --git a/tortoise/models/clvp.py b/tortoise/models/clvp.py new file mode 100644 index 0000000000000000000000000000000000000000..00f5011a053f28b53a363bcd696e6267c8924c3b --- /dev/null +++ b/tortoise/models/clvp.py @@ -0,0 +1,155 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import einsum + +from tortoise.models.arch_util import CheckpointedXTransformerEncoder +from tortoise.models.transformer import Transformer +from tortoise.models.xtransformers import Encoder + + +def exists(val): + return val is not None + + +def masked_mean(t, mask, dim = 1): + t = t.masked_fill(~mask[:, :, None], 0.) + return t.sum(dim = 1) / mask.sum(dim = 1)[..., None] + +class CLVP(nn.Module): + """ + CLIP model retrofitted for performing contrastive evaluation between tokenized audio data and the corresponding + transcribed text. + + Originally from https://github.com/lucidrains/DALLE-pytorch/blob/main/dalle_pytorch/dalle_pytorch.py + """ + + def __init__( + self, + *, + dim_text=512, + dim_speech=512, + dim_latent=512, + num_text_tokens=256, + text_enc_depth=6, + text_seq_len=120, + text_heads=8, + num_speech_tokens=8192, + speech_enc_depth=6, + speech_heads=8, + speech_seq_len=250, + text_mask_percentage=0, + voice_mask_percentage=0, + wav_token_compression=1024, + use_xformers=False, + ): + super().__init__() + self.text_emb = nn.Embedding(num_text_tokens, dim_text) + self.to_text_latent = nn.Linear(dim_text, dim_latent, bias=False) + + self.speech_emb = nn.Embedding(num_speech_tokens, dim_speech) + self.to_speech_latent = nn.Linear(dim_speech, dim_latent, bias=False) + + if use_xformers: + self.text_transformer = CheckpointedXTransformerEncoder( + needs_permute=False, + exit_permute=False, + max_seq_len=-1, + attn_layers=Encoder( + dim=dim_text, + depth=text_enc_depth, + heads=text_heads, + ff_dropout=.1, + ff_mult=2, + attn_dropout=.1, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + )) + self.speech_transformer = CheckpointedXTransformerEncoder( + needs_permute=False, + exit_permute=False, + max_seq_len=-1, + attn_layers=Encoder( + dim=dim_speech, + depth=speech_enc_depth, + heads=speech_heads, + ff_dropout=.1, + ff_mult=2, + attn_dropout=.1, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + )) + else: + self.text_transformer = Transformer(causal=False, seq_len=text_seq_len, dim=dim_text, depth=text_enc_depth, + heads=text_heads) + self.speech_transformer = Transformer(causal=False, seq_len=speech_seq_len, dim=dim_speech, + depth=speech_enc_depth, heads=speech_heads) + + self.temperature = nn.Parameter(torch.tensor(1.)) + self.text_mask_percentage = text_mask_percentage + self.voice_mask_percentage = voice_mask_percentage + self.wav_token_compression = wav_token_compression + self.xformers = use_xformers + if not use_xformers: + self.text_pos_emb = nn.Embedding(text_seq_len, dim_text) + self.speech_pos_emb = nn.Embedding(num_speech_tokens, dim_speech) + + def forward( + self, + text, + speech_tokens, + return_loss=False + ): + b, device = text.shape[0], text.device + if self.training: + text_mask = torch.rand_like(text.float()) > self.text_mask_percentage + voice_mask = torch.rand_like(speech_tokens.float()) > 
self.voice_mask_percentage + else: + text_mask = torch.ones_like(text.float()).bool() + voice_mask = torch.ones_like(speech_tokens.float()).bool() + + text_emb = self.text_emb(text) + speech_emb = self.speech_emb(speech_tokens) + + if not self.xformers: + text_emb += self.text_pos_emb(torch.arange(text.shape[1], device=device)) + speech_emb += self.speech_pos_emb(torch.arange(speech_emb.shape[1], device=device)) + + enc_text = self.text_transformer(text_emb, mask=text_mask) + enc_speech = self.speech_transformer(speech_emb, mask=voice_mask) + + text_latents = masked_mean(enc_text, text_mask, dim=1) + speech_latents = masked_mean(enc_speech, voice_mask, dim=1) + + text_latents = self.to_text_latent(text_latents) + speech_latents = self.to_speech_latent(speech_latents) + + text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents)) + + temp = self.temperature.exp() + + if not return_loss: + sim = einsum('n d, n d -> n', text_latents, speech_latents) * temp + return sim + + sim = einsum('i d, j d -> i j', text_latents, speech_latents) * temp + labels = torch.arange(b, device=device) + loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2 + return loss + + +if __name__ == '__main__': + clip = CLVP(text_mask_percentage=.2, voice_mask_percentage=.2) + clip(torch.randint(0,256,(2,120)), + torch.tensor([50,100]), + torch.randint(0,8192,(2,250)), + torch.tensor([101,102]), + return_loss=True) + nonloss = clip(torch.randint(0,256,(2,120)), + torch.tensor([50,100]), + torch.randint(0,8192,(2,250)), + torch.tensor([101,102]), + return_loss=False) + print(nonloss.shape) \ No newline at end of file diff --git a/tortoise/models/cvvp.py b/tortoise/models/cvvp.py new file mode 100644 index 0000000000000000000000000000000000000000..d094649f3fb3386ec7c78da3d9ead34eebea4968 --- /dev/null +++ b/tortoise/models/cvvp.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import einsum +from torch.utils.checkpoint import checkpoint + +from tortoise.models.arch_util import AttentionBlock +from tortoise.models.xtransformers import ContinuousTransformerWrapper, Encoder + + +def exists(val): + return val is not None + + +def masked_mean(t, mask): + t = t.masked_fill(~mask, 0.) 
+ return t.sum(dim = 1) / mask.sum(dim = 1) + + +class CollapsingTransformer(nn.Module): + def __init__(self, model_dim, output_dims, heads, dropout, depth, mask_percentage=0, **encoder_kwargs): + super().__init__() + self.transformer = ContinuousTransformerWrapper( + max_seq_len=-1, + use_pos_emb=False, + attn_layers=Encoder( + dim=model_dim, + depth=depth, + heads=heads, + ff_dropout=dropout, + ff_mult=1, + attn_dropout=dropout, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + **encoder_kwargs, + )) + self.pre_combiner = nn.Sequential(nn.Conv1d(model_dim, output_dims, 1), + AttentionBlock(output_dims, num_heads=heads, do_checkpoint=False), + nn.Conv1d(output_dims, output_dims, 1)) + self.mask_percentage = mask_percentage + + def forward(self, x, **transformer_kwargs): + h = self.transformer(x, **transformer_kwargs) + h = h.permute(0,2,1) + h = checkpoint(self.pre_combiner, h).permute(0,2,1) + if self.training: + mask = torch.rand_like(h.float()) > self.mask_percentage + else: + mask = torch.ones_like(h.float()).bool() + return masked_mean(h, mask) + + +class ConvFormatEmbedding(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self.emb = nn.Embedding(*args, **kwargs) + + def forward(self, x): + y = self.emb(x) + return y.permute(0,2,1) + + +class CVVP(nn.Module): + def __init__( + self, + model_dim=512, + transformer_heads=8, + dropout=.1, + conditioning_enc_depth=8, + cond_mask_percentage=0, + mel_channels=80, + mel_codes=None, + speech_enc_depth=8, + speech_mask_percentage=0, + latent_multiplier=1, + ): + super().__init__() + latent_dim = latent_multiplier*model_dim + self.temperature = nn.Parameter(torch.tensor(1.)) + + self.cond_emb = nn.Sequential(nn.Conv1d(mel_channels, model_dim//2, kernel_size=5, stride=2, padding=2), + nn.Conv1d(model_dim//2, model_dim, kernel_size=3, stride=2, padding=1)) + self.conditioning_transformer = CollapsingTransformer(model_dim, model_dim, transformer_heads, dropout, conditioning_enc_depth, cond_mask_percentage) + self.to_conditioning_latent = nn.Linear(latent_dim, latent_dim, bias=False) + + if mel_codes is None: + self.speech_emb = nn.Conv1d(mel_channels, model_dim, kernel_size=5, padding=2) + else: + self.speech_emb = ConvFormatEmbedding(mel_codes, model_dim) + self.speech_transformer = CollapsingTransformer(model_dim, latent_dim, transformer_heads, dropout, speech_enc_depth, speech_mask_percentage) + self.to_speech_latent = nn.Linear(latent_dim, latent_dim, bias=False) + + def get_grad_norm_parameter_groups(self): + return { + 'conditioning': list(self.conditioning_transformer.parameters()), + 'speech': list(self.speech_transformer.parameters()), + } + + def forward( + self, + mel_cond, + mel_input, + return_loss=False + ): + cond_emb = self.cond_emb(mel_cond).permute(0,2,1) + enc_cond = self.conditioning_transformer(cond_emb) + cond_latents = self.to_conditioning_latent(enc_cond) + + speech_emb = self.speech_emb(mel_input).permute(0,2,1) + enc_speech = self.speech_transformer(speech_emb) + speech_latents = self.to_speech_latent(enc_speech) + + + cond_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (cond_latents, speech_latents)) + temp = self.temperature.exp() + + if not return_loss: + sim = einsum('n d, n d -> n', cond_latents, speech_latents) * temp + return sim + + sim = einsum('i d, j d -> i j', cond_latents, speech_latents) * temp + labels = torch.arange(cond_latents.shape[0], device=mel_input.device) + loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2 + + 
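# Aside: minimal standalone sketch of the CLIP-style contrastive objective shared
# by CLVP (above) and CVVP (this file). A temperature-scaled similarity matrix
# between the two sets of unit-normalized latents is scored against its diagonal
# in both directions; shapes below are placeholders.
import torch
import torch.nn.functional as F
from torch import einsum

b, d = 4, 16
cond_latents = F.normalize(torch.randn(b, d), p=2, dim=-1)
speech_latents = F.normalize(torch.randn(b, d), p=2, dim=-1)
temp = torch.tensor(1.).exp()
sim = einsum('i d, j d -> i j', cond_latents, speech_latents) * temp
labels = torch.arange(b)
loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2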
return loss + + +if __name__ == '__main__': + clvp = CVVP() + clvp(torch.randn(2,80,100), + torch.randn(2,80,95), + return_loss=True) \ No newline at end of file diff --git a/tortoise/models/diffusion_decoder.py b/tortoise/models/diffusion_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f67d21a3903db8f44b704b38d2e9c804dc22d9a9 --- /dev/null +++ b/tortoise/models/diffusion_decoder.py @@ -0,0 +1,333 @@ +import math +import random +from abc import abstractmethod + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import autocast + +from tortoise.models.arch_util import normalization, AttentionBlock + + +def is_latent(t): + return t.dtype == torch.float + + +def is_sequence(t): + return t.dtype == torch.long + + +def timestep_embedding(timesteps, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half + ).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +class TimestepBlock(nn.Module): + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + def forward(self, x, emb): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + else: + x = layer(x) + return x + + +class ResBlock(TimestepBlock): + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + dims=2, + kernel_size=3, + efficient_config=True, + use_scale_shift_norm=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_scale_shift_norm = use_scale_shift_norm + padding = {1: 0, 3: 1, 5: 2}[kernel_size] + eff_kernel = 1 if efficient_config else 3 + eff_padding = 0 if efficient_config else 1 + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding), + ) + + self.emb_layers = nn.Sequential( + nn.SiLU(), + nn.Linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + else: + self.skip_connection = nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding) + + def forward(self, x, emb): + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = torch.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + 
emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class DiffusionLayer(TimestepBlock): + def __init__(self, model_channels, dropout, num_heads): + super().__init__() + self.resblk = ResBlock(model_channels, model_channels, dropout, model_channels, dims=1, use_scale_shift_norm=True) + self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True) + + def forward(self, x, time_emb): + y = self.resblk(x, time_emb) + return self.attn(y) + + +class DiffusionTts(nn.Module): + def __init__( + self, + model_channels=512, + num_layers=8, + in_channels=100, + in_latent_channels=512, + in_tokens=8193, + out_channels=200, # mean and variance + dropout=0, + use_fp16=False, + num_heads=16, + # Parameters for regularization. + layer_drop=.1, + unconditioned_percentage=.1, # This implements a mechanism similar to what is used in classifier-free training. + ): + super().__init__() + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.dropout = dropout + self.num_heads = num_heads + self.unconditioned_percentage = unconditioned_percentage + self.enable_fp16 = use_fp16 + self.layer_drop = layer_drop + + self.inp_block = nn.Conv1d(in_channels, model_channels, 3, 1, 1) + self.time_embed = nn.Sequential( + nn.Linear(model_channels, model_channels), + nn.SiLU(), + nn.Linear(model_channels, model_channels), + ) + + # Either code_converter or latent_converter is used, depending on what type of conditioning data is fed. + # This model is meant to be able to be trained on both for efficiency purposes - it is far less computationally + # complex to generate tokens, while generating latents will normally mean propagating through a deep autoregressive + # transformer network. 
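# Aside: quick check of timestep_embedding defined earlier in this file
# (illustrative only; assumes the diff is applied). N scalar timesteps map to an
# (N, dim) sinusoidal embedding, which the time_embed MLP above then projects.
import torch
from tortoise.models.diffusion_decoder import timestep_embedding

t = torch.tensor([0, 10, 999])
emb = timestep_embedding(t, dim=512)
assert emb.shape == (3, 512)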
+ self.code_embedding = nn.Embedding(in_tokens, model_channels) + self.code_converter = nn.Sequential( + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + ) + self.code_norm = normalization(model_channels) + self.latent_conditioner = nn.Sequential( + nn.Conv1d(in_latent_channels, model_channels, 3, padding=1), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + ) + self.contextual_embedder = nn.Sequential(nn.Conv1d(in_channels,model_channels,3,padding=1,stride=2), + nn.Conv1d(model_channels, model_channels*2,3,padding=1,stride=2), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels*2, num_heads, relative_pos_embeddings=True, do_checkpoint=False)) + self.unconditioned_embedding = nn.Parameter(torch.randn(1,model_channels,1)) + self.conditioning_timestep_integrator = TimestepEmbedSequential( + DiffusionLayer(model_channels, dropout, num_heads), + DiffusionLayer(model_channels, dropout, num_heads), + DiffusionLayer(model_channels, dropout, num_heads), + ) + + self.integrating_conv = nn.Conv1d(model_channels*2, model_channels, kernel_size=1) + self.mel_head = nn.Conv1d(model_channels, in_channels, kernel_size=3, padding=1) + + self.layers = nn.ModuleList([DiffusionLayer(model_channels, dropout, num_heads) for _ in range(num_layers)] + + [ResBlock(model_channels, model_channels, dropout, dims=1, use_scale_shift_norm=True) for _ in range(3)]) + + self.out = nn.Sequential( + normalization(model_channels), + nn.SiLU(), + nn.Conv1d(model_channels, out_channels, 3, padding=1), + ) + + def get_grad_norm_parameter_groups(self): + groups = { + 'minicoder': list(self.contextual_embedder.parameters()), + 'layers': list(self.layers.parameters()), + 'code_converters': list(self.code_embedding.parameters()) + list(self.code_converter.parameters()) + list(self.latent_conditioner.parameters()) + list(self.latent_conditioner.parameters()), + 'timestep_integrator': list(self.conditioning_timestep_integrator.parameters()) + list(self.integrating_conv.parameters()), + 'time_embed': list(self.time_embed.parameters()), + } + return groups + + def get_conditioning(self, conditioning_input): + speech_conditioning_input = conditioning_input.unsqueeze(1) if len( + conditioning_input.shape) == 3 else conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.contextual_embedder(speech_conditioning_input[:, j])) + conds = torch.cat(conds, dim=-1) + conds = conds.mean(dim=-1) + return conds + + def timestep_independent(self, aligned_conditioning, conditioning_latent, expected_seq_len, return_code_pred): + # Shuffle aligned_latent to BxCxS format + if is_latent(aligned_conditioning): + aligned_conditioning = aligned_conditioning.permute(0, 2, 1) + + cond_scale, cond_shift = 
torch.chunk(conditioning_latent, 2, dim=1) + if is_latent(aligned_conditioning): + code_emb = self.latent_conditioner(aligned_conditioning) + else: + code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1) + code_emb = self.code_converter(code_emb) + code_emb = self.code_norm(code_emb) * (1 + cond_scale.unsqueeze(-1)) + cond_shift.unsqueeze(-1) + + unconditioned_batches = torch.zeros((code_emb.shape[0], 1, 1), device=code_emb.device) + # Mask out the conditioning branch for whole batch elements, implementing something similar to classifier-free guidance. + if self.training and self.unconditioned_percentage > 0: + unconditioned_batches = torch.rand((code_emb.shape[0], 1, 1), + device=code_emb.device) < self.unconditioned_percentage + code_emb = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(aligned_conditioning.shape[0], 1, 1), + code_emb) + expanded_code_emb = F.interpolate(code_emb, size=expected_seq_len, mode='nearest') + + if not return_code_pred: + return expanded_code_emb + else: + mel_pred = self.mel_head(expanded_code_emb) + # Multiply mel_pred by !unconditioned_branches, which drops the gradient on unconditioned branches. This is because we don't want that gradient being used to train parameters through the codes_embedder as it unbalances contributions to that network from the MSE loss. + mel_pred = mel_pred * unconditioned_batches.logical_not() + return expanded_code_emb, mel_pred + + def forward(self, x, timesteps, aligned_conditioning=None, conditioning_latent=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced. + :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning(). + :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent() + :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered. + :return: an [N x C x ...] Tensor of outputs. + """ + assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_latent is not None) + assert not (return_code_pred and precomputed_aligned_embeddings is not None) # These two are mutually exclusive. 
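+ # forward() accepts three mutually exclusive conditioning modes:
+ #   1. conditioning_free=True: the learned unconditioned_embedding stands in for the code
+ #      embedding (the unconditional branch used for classifier-free guidance).
+ #   2. precomputed_aligned_embeddings: reuse the output of timestep_independent() so it is
+ #      not recomputed at every diffusion step.
+ #   3. otherwise: build the embedding here from aligned_conditioning and conditioning_latent
+ #      (optionally also returning the auxiliary mel prediction).
+ # Parameters of whichever branch was not exercised are collected in unused_params and folded
+ # into the output with a zero multiplier so DistributedDataParallel does not complain about
+ # parameters that received no gradient.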
+ + unused_params = [] + if conditioning_free: + code_emb = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1]) + unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters())) + unused_params.extend(list(self.latent_conditioner.parameters())) + else: + if precomputed_aligned_embeddings is not None: + code_emb = precomputed_aligned_embeddings + else: + code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_latent, x.shape[-1], True) + if is_latent(aligned_conditioning): + unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters())) + else: + unused_params.extend(list(self.latent_conditioner.parameters())) + + unused_params.append(self.unconditioned_embedding) + + time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + code_emb = self.conditioning_timestep_integrator(code_emb, time_emb) + x = self.inp_block(x) + x = torch.cat([x, code_emb], dim=1) + x = self.integrating_conv(x) + for i, lyr in enumerate(self.layers): + # Do layer drop where applicable. Do not drop first and last layers. + if self.training and self.layer_drop > 0 and i != 0 and i != (len(self.layers)-1) and random.random() < self.layer_drop: + unused_params.extend(list(lyr.parameters())) + else: + # First and last blocks will have autocast disabled for improved precision. + with autocast(x.device.type, enabled=self.enable_fp16 and i != 0): + x = lyr(x, time_emb) + + x = x.float() + out = self.out(x) + + # Involve probabilistic or possibly unused parameters in loss so we don't get DDP errors. + extraneous_addition = 0 + for p in unused_params: + extraneous_addition = extraneous_addition + p.mean() + out = out + extraneous_addition * 0 + + if return_code_pred: + return out, mel_pred + return out + + +if __name__ == '__main__': + clip = torch.randn(2, 100, 400) + aligned_latent = torch.randn(2,388,512) + aligned_sequence = torch.randint(0,8192,(2,100)) + cond = torch.randn(2, 100, 400) + ts = torch.LongTensor([600, 600]) + model = DiffusionTts(512, layer_drop=.3, unconditioned_percentage=.5) + # Test with latent aligned conditioning + #o = model(clip, ts, aligned_latent, cond) + # Test with sequence aligned conditioning + o = model(clip, ts, aligned_sequence, cond) + diff --git a/tortoise/models/random_latent_generator.py b/tortoise/models/random_latent_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..e90ef2130a47ec52160709877972716352e04c9c --- /dev/null +++ b/tortoise/models/random_latent_generator.py @@ -0,0 +1,55 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5): + if bias is not None: + rest_dim = [1] * (input.ndim - bias.ndim - 1) + return ( + F.leaky_relu( + input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope + ) + * scale + ) + else: + return F.leaky_relu(input, negative_slope=0.2) * scale + + +class EqualLinear(nn.Module): + def __init__( + self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1 + ): + super().__init__() + self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) + if bias: + self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) + else: + self.bias = None + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + out = F.linear(input, self.weight * self.scale) + out = fused_leaky_relu(out, self.bias * self.lr_mul) + return 
out + + +class RandomLatentConverter(nn.Module): + def __init__(self, channels): + super().__init__() + self.layers = nn.Sequential(*[EqualLinear(channels, channels, lr_mul=.1) for _ in range(5)], + nn.Linear(channels, channels)) + self.channels = channels + + def forward(self, ref): + r = torch.randn(ref.shape[0], self.channels, device=ref.device) + y = self.layers(r) + return y + + +if __name__ == '__main__': + model = RandomLatentConverter(512) + model(torch.randn(5,512)) \ No newline at end of file diff --git a/tortoise/models/transformer.py b/tortoise/models/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..aa59b462a3f9c2680f28ceb1b87480258f0293f0 --- /dev/null +++ b/tortoise/models/transformer.py @@ -0,0 +1,219 @@ +from functools import partial + +import torch +import torch.nn.functional as F +from einops import rearrange +from rotary_embedding_torch import RotaryEmbedding, broadcat +from torch import nn + + +# helpers + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def cast_tuple(val, depth = 1): + if isinstance(val, list): + val = tuple(val) + return val if isinstance(val, tuple) else (val,) * depth + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def stable_softmax(t, dim = -1, alpha = 32 ** 2): + t = t / alpha + t = t - torch.amax(t, dim = dim, keepdim = True).detach() + return (t * alpha).softmax(dim = dim) + + +def route_args(router, args, depth): + routed_args = [(dict(), dict()) for _ in range(depth)] + matched_keys = [key for key in args.keys() if key in router] + + for key in matched_keys: + val = args[key] + for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])): + new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes) + routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args}) + return routed_args + + +# classes +class SequentialSequence(nn.Module): + def __init__(self, layers, args_route = {}, layer_dropout = 0.): + super().__init__() + assert all(len(route) == len(layers) for route in args_route.values()), 'each argument route map must have the same depth as the number of sequential layers' + self.layers = layers + self.args_route = args_route + self.layer_dropout = layer_dropout + + def forward(self, x, **kwargs): + args = route_args(self.args_route, kwargs, len(self.layers)) + layers_and_args = list(zip(self.layers, args)) + + for (f, g), (f_args, g_args) in layers_and_args: + x = x + f(x, **f_args) + x = x + g(x, **g_args) + return x + + +class DivideMax(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + maxes = x.amax(dim = self.dim, keepdim = True).detach() + return x / maxes + + +# https://arxiv.org/abs/2103.17239 +class LayerScale(nn.Module): + def __init__(self, dim, depth, fn): + super().__init__() + if depth <= 18: + init_eps = 0.1 + elif depth > 18 and depth <= 24: + init_eps = 1e-5 + else: + init_eps = 1e-6 + + scale = torch.zeros(1, 1, dim).fill_(init_eps) + self.scale = nn.Parameter(scale) + self.fn = fn + def forward(self, x, **kwargs): + return self.fn(x, **kwargs) * self.scale + +# layer norm + + +class PreNorm(nn.Module): + def __init__(self, dim, fn, sandwich = False): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.norm_out = nn.LayerNorm(dim) if sandwich else nn.Identity() + self.fn = fn + + def forward(self, x, **kwargs): + x = self.norm(x) + x = self.fn(x, **kwargs) + return self.norm_out(x) + +# 
feed forward + + +class GEGLU(nn.Module): + def forward(self, x): + x, gates = x.chunk(2, dim = -1) + return x * F.gelu(gates) + + +class FeedForward(nn.Module): + def __init__(self, dim, dropout = 0., mult = 4.): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, dim * mult * 2), + GEGLU(), + nn.Dropout(dropout), + nn.Linear(dim * mult, dim) + ) + + def forward(self, x): + return self.net(x) + +# Attention + + +class Attention(nn.Module): + def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0.): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.seq_len = seq_len + self.scale = dim_head ** -0.5 + + self.causal = causal + + self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) + self.to_out = nn.Sequential( + nn.Linear(inner_dim, dim), + nn.Dropout(dropout) + ) + + def forward(self, x, mask = None): + b, n, _, h, device = *x.shape, self.heads, x.device + softmax = torch.softmax + + qkv = self.to_qkv(x).chunk(3, dim = -1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv) + + q = q * self.scale + + dots = torch.einsum('b h i d, b h j d -> b h i j', q, k) + mask_value = max_neg_value(dots) + + if exists(mask): + mask = rearrange(mask, 'b j -> b () () j') + dots.masked_fill_(~mask, mask_value) + del mask + + if self.causal: + i, j = dots.shape[-2:] + mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool() + dots.masked_fill_(mask, mask_value) + + attn = softmax(dots, dim=-1) + + out = torch.einsum('b h i j, b h j d -> b h i d', attn, v) + out = rearrange(out, 'b h n d -> b n (h d)') + out = self.to_out(out) + return out + + +# main transformer class +class Transformer(nn.Module): + def __init__( + self, + *, + dim, + depth, + seq_len, + causal = True, + heads = 8, + dim_head = 64, + ff_mult = 4, + attn_dropout = 0., + ff_dropout = 0., + sparse_attn = False, + sandwich_norm = False, + ): + super().__init__() + layers = nn.ModuleList([]) + sparse_layer = cast_tuple(sparse_attn, depth) + + for ind, sparse_attn in zip(range(depth), sparse_layer): + attn = Attention(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout) + + ff = FeedForward(dim, mult = ff_mult, dropout = ff_dropout) + + layers.append(nn.ModuleList([ + LayerScale(dim, ind + 1, PreNorm(dim, attn, sandwich = sandwich_norm)), + LayerScale(dim, ind + 1, PreNorm(dim, ff, sandwich = sandwich_norm)) + ])) + + execute_type = SequentialSequence + route_attn = ((True, False),) * depth + attn_route_map = {'mask': route_attn} + + self.layers = execute_type(layers, args_route = attn_route_map) + + def forward(self, x, **kwargs): + return self.layers(x, **kwargs) \ No newline at end of file diff --git a/tortoise/models/vocoder.py b/tortoise/models/vocoder.py new file mode 100644 index 0000000000000000000000000000000000000000..d38fb56699c035b3d4a86ace67c567d3f1d51fa9 --- /dev/null +++ b/tortoise/models/vocoder.py @@ -0,0 +1,325 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +MAX_WAV_VALUE = 32768.0 + +class KernelPredictor(torch.nn.Module): + ''' Kernel predictor for the location-variable convolutions''' + + def __init__( + self, + cond_channels, + conv_in_channels, + conv_out_channels, + conv_layers, + conv_kernel_size=3, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + kpnet_nonlinear_activation="LeakyReLU", + kpnet_nonlinear_activation_params={"negative_slope": 0.1}, + ): + ''' + Args: + cond_channels (int): number of channel for the 
conditioning sequence, + conv_in_channels (int): number of channel for the input sequence, + conv_out_channels (int): number of channel for the output sequence, + conv_layers (int): number of layers + ''' + super().__init__() + + self.conv_in_channels = conv_in_channels + self.conv_out_channels = conv_out_channels + self.conv_kernel_size = conv_kernel_size + self.conv_layers = conv_layers + + kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w + kpnet_bias_channels = conv_out_channels * conv_layers # l_b + + self.input_conv = nn.Sequential( + nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)), + getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + + self.residual_convs = nn.ModuleList() + padding = (kpnet_conv_size - 1) // 2 + for _ in range(3): + self.residual_convs.append( + nn.Sequential( + nn.Dropout(kpnet_dropout), + nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, + bias=True)), + getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, + bias=True)), + getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + ) + self.kernel_conv = nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_kernel_channels, kpnet_conv_size, padding=padding, bias=True)) + self.bias_conv = nn.utils.weight_norm( + nn.Conv1d(kpnet_hidden_channels, kpnet_bias_channels, kpnet_conv_size, padding=padding, bias=True)) + + def forward(self, c): + ''' + Args: + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + ''' + batch, _, cond_length = c.shape + c = self.input_conv(c) + for residual_conv in self.residual_convs: + residual_conv.to(c.device) + c = c + residual_conv(c) + k = self.kernel_conv(c) + b = self.bias_conv(c) + kernels = k.contiguous().view( + batch, + self.conv_layers, + self.conv_in_channels, + self.conv_out_channels, + self.conv_kernel_size, + cond_length, + ) + bias = b.contiguous().view( + batch, + self.conv_layers, + self.conv_out_channels, + cond_length, + ) + + return kernels, bias + + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.input_conv[0]) + nn.utils.remove_weight_norm(self.kernel_conv) + nn.utils.remove_weight_norm(self.bias_conv) + for block in self.residual_convs: + nn.utils.remove_weight_norm(block[1]) + nn.utils.remove_weight_norm(block[3]) + + +class LVCBlock(torch.nn.Module): + '''the location-variable convolutions''' + + def __init__( + self, + in_channels, + cond_channels, + stride, + dilations=[1, 3, 9, 27], + lReLU_slope=0.2, + conv_kernel_size=3, + cond_hop_length=256, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + ): + super().__init__() + + self.cond_hop_length = cond_hop_length + self.conv_layers = len(dilations) + self.conv_kernel_size = conv_kernel_size + + self.kernel_predictor = KernelPredictor( + cond_channels=cond_channels, + conv_in_channels=in_channels, + conv_out_channels=2 * in_channels, + conv_layers=len(dilations), + conv_kernel_size=conv_kernel_size, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=kpnet_dropout, + kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope} + ) + + self.convt_pre = nn.Sequential( + nn.LeakyReLU(lReLU_slope), + 
nn.utils.weight_norm(nn.ConvTranspose1d(in_channels, in_channels, 2 * stride, stride=stride, + padding=stride // 2 + stride % 2, output_padding=stride % 2)), + ) + + self.conv_blocks = nn.ModuleList() + for dilation in dilations: + self.conv_blocks.append( + nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.Conv1d(in_channels, in_channels, conv_kernel_size, + padding=dilation * (conv_kernel_size - 1) // 2, dilation=dilation)), + nn.LeakyReLU(lReLU_slope), + ) + ) + + def forward(self, x, c): + ''' forward propagation of the location-variable convolutions. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length) + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + + Returns: + Tensor: the output sequence (batch, in_channels, in_length) + ''' + _, in_channels, _ = x.shape # (B, c_g, L') + + x = self.convt_pre(x) # (B, c_g, stride * L') + kernels, bias = self.kernel_predictor(c) + + for i, conv in enumerate(self.conv_blocks): + output = conv(x) # (B, c_g, stride * L') + + k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) + b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) + + output = self.location_variable_convolution(output, k, b, + hop_size=self.cond_hop_length) # (B, 2 * c_g, stride * L'): LVC + x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( + output[:, in_channels:, :]) # (B, c_g, stride * L'): GAU + + return x + + def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): + ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. + Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length). + kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) + bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) + dilation (int): the dilation of convolution. + hop_size (int): the hop_size of the conditioning sequence. + Returns: + (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
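+ Implementation note: the input is padded, unfolded into kernel_length overlapping
+ frames (hop_size plus padding samples each), convolved frame-by-frame with the
+ predicted kernels via einsum, offset by the per-frame bias, and finally reshaped
+ back to (batch, out_channels, in_length).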
+ ''' + batch, _, in_length = x.shape + batch, _, out_channels, kernel_size, kernel_length = kernel.shape + assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" + + padding = dilation * int((kernel_size - 1) / 2) + x = F.pad(x, (padding, padding), 'constant', 0) # (batch, in_channels, in_length + 2*padding) + x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) + + if hop_size < dilation: + x = F.pad(x, (0, dilation), 'constant', 0) + x = x.unfold(3, dilation, + dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) + x = x[:, :, :, :, :hop_size] + x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) + x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) + + o = torch.einsum('bildsk,biokl->bolsd', x, kernel) + o = o.to(memory_format=torch.channels_last_3d) + bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) + o = o + bias + o = o.contiguous().view(batch, out_channels, -1) + + return o + + def remove_weight_norm(self): + self.kernel_predictor.remove_weight_norm() + nn.utils.remove_weight_norm(self.convt_pre[1]) + for block in self.conv_blocks: + nn.utils.remove_weight_norm(block[1]) + + +class UnivNetGenerator(nn.Module): + """UnivNet Generator""" + + def __init__(self, noise_dim=64, channel_size=32, dilations=[1,3,9,27], strides=[8,8,4], lReLU_slope=.2, kpnet_conv_size=3, + # Below are MEL configurations options that this generator requires. + hop_length=256, n_mel_channels=100): + super(UnivNetGenerator, self).__init__() + self.mel_channel = n_mel_channels + self.noise_dim = noise_dim + self.hop_length = hop_length + channel_size = channel_size + kpnet_conv_size = kpnet_conv_size + + self.res_stack = nn.ModuleList() + hop_length = 1 + for stride in strides: + hop_length = stride * hop_length + self.res_stack.append( + LVCBlock( + channel_size, + n_mel_channels, + stride=stride, + dilations=dilations, + lReLU_slope=lReLU_slope, + cond_hop_length=hop_length, + kpnet_conv_size=kpnet_conv_size + ) + ) + + self.conv_pre = \ + nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode='reflect')) + + self.conv_post = nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode='reflect')), + nn.Tanh(), + ) + + def forward(self, c, z): + ''' + Args: + c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) + z (Tensor): the noise sequence (batch, noise_dim, in_length) + + ''' + z = self.conv_pre(z) # (B, c_g, L) + + for res_block in self.res_stack: + res_block.to(z.device) + z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) + + z = self.conv_post(z) # (B, 1, L * 256) + + return z + + def eval(self, inference=False): + super(UnivNetGenerator, self).eval() + # don't remove weight norm while validation in training loop + if inference: + self.remove_weight_norm() + + def remove_weight_norm(self): + print('Removing weight norm...') + + nn.utils.remove_weight_norm(self.conv_pre) + + for layer in self.conv_post: + if len(layer.state_dict()) != 0: + nn.utils.remove_weight_norm(layer) + + for res_block in self.res_stack: + res_block.remove_weight_norm() + + def inference(self, c, z=None): + # pad input mel with zeros to cut artifact + # see https://github.com/seungwonpark/melgan/issues/8 + zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) + mel = torch.cat((c, zero), dim=2) + + if z is None: + z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) + + audio = self.forward(mel, z) + audio = audio[:, :, :-(self.hop_length * 10)] + audio = audio.clamp(min=-1, max=1) + return audio + + +if __name__ == '__main__': + model = UnivNetGenerator() + + c = torch.randn(3, 100, 10) + z = torch.randn(3, 64, 10) + print(c.shape) + + y = model(c, z) + print(y.shape) + assert y.shape == torch.Size([3, 1, 2560]) + + pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(pytorch_total_params) diff --git a/tortoise/models/xtransformers.py b/tortoise/models/xtransformers.py new file mode 100644 index 0000000000000000000000000000000000000000..df9ee25131ffae50047fe4bbc7659c67de6537a3 --- /dev/null +++ b/tortoise/models/xtransformers.py @@ -0,0 +1,1252 @@ +import functools +import math +import torch +from torch import nn, einsum +import torch.nn.functional as F +from functools import partial +from inspect import isfunction +from collections import namedtuple + +from einops import rearrange, repeat, reduce +from einops.layers.torch import Rearrange + +from torch.utils.checkpoint import checkpoint + +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple('Intermediates', [ + 'pre_softmax_attn', + 'post_softmax_attn' +]) + +LayerIntermediates = namedtuple('Intermediates', [ + 'hiddens', + 'attn_intermediates', + 'past_key_values', +]) + + +# helpers + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def cast_tuple(val, depth): + return val if isinstance(val, tuple) else (val,) * depth + + +class always(): + def __init__(self, val): + self.val = val + + def __call__(self, *args, **kwargs): + return self.val + + +class not_equals(): + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x != self.val + + +class equals(): + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x == self.val + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +def l2norm(t): + return F.normalize(t, p=2, dim=-1) + + +# init helpers + +def init_zero_(layer): + nn.init.constant_(layer.weight, 0.) + if exists(layer.bias): + nn.init.constant_(layer.bias, 0.) 
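+ # Usage sketch for the helpers above (illustrative values only, not used elsewhere here):
+ #   default(None, lambda: build())   # evaluates the lambda only because val is missing
+ #   default(0, lambda: build())      # returns 0 without calling the lambda
+ #   cast_tuple(0.1, 4)               # -> (0.1, 0.1, 0.1, 0.1), broadcasts per-layer settings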
+ + +# keyword argument helpers + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + + +def group_dict_by_key(cond, d): + return_val = [dict(), dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val,) + + +def string_begins_with(prefix, str): + return str.startswith(prefix) + + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) + kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items()))) + return kwargs_without_prefix, kwargs + + +# activations + +class ReluSquared(nn.Module): + def forward(self, x): + return F.relu(x) ** 2 + + +# positional embeddings + +class AbsolutePositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + self.scale = dim ** -0.5 + self.emb = nn.Embedding(max_seq_len, dim) + + def forward(self, x): + n = torch.arange(x.shape[1], device=x.device) + pos_emb = self.emb(n) + pos_emb = rearrange(pos_emb, 'n d -> () n d') + return pos_emb * self.scale + + +class FixedPositionalEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, x, seq_dim=1, offset=0): + t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset + sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) + return rearrange(emb, 'n d -> () n d') + + +class RelativePositionBias(nn.Module): + def __init__(self, scale, causal=False, num_buckets=32, max_distance=128, heads=8): + super().__init__() + self.scale = scale + self.causal = causal + self.num_buckets = num_buckets + self.max_distance = max_distance + self.relative_attention_bias = nn.Embedding(num_buckets, heads) + + @staticmethod + def _relative_position_bucket(relative_position, causal=True, num_buckets=32, max_distance=128): + ret = 0 + n = -relative_position + if not causal: + num_buckets //= 2 + ret += (n < 0).long() * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, torch.zeros_like(n)) + + max_exact = num_buckets // 2 + is_small = n < max_exact + + val_if_large = max_exact + ( + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).long() + val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) + + ret += torch.where(is_small, n, val_if_large) + return ret + + def forward(self, qk_dots): + i, j, device = *qk_dots.shape[-2:], qk_dots.device + q_pos = torch.arange(i, dtype=torch.long, device=device) + k_pos = torch.arange(j, dtype=torch.long, device=device) + rel_pos = k_pos[None, :] - q_pos[:, None] + rp_bucket = self._relative_position_bucket(rel_pos, causal=self.causal, num_buckets=self.num_buckets, + max_distance=self.max_distance) + values = self.relative_attention_bias(rp_bucket) + bias = rearrange(values, 'i j h -> () h i j') + return qk_dots + (bias * self.scale) + + +class AlibiPositionalBias(nn.Module): + def __init__(self, heads, **kwargs): + super().__init__() + self.heads = heads + slopes = torch.Tensor(self._get_slopes(heads)) + slopes = rearrange(slopes, 'h -> () h () ()') + 
self.register_buffer('slopes', slopes, persistent=False) + self.register_buffer('bias', None, persistent=False) + + @staticmethod + def _get_slopes(heads): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(heads).is_integer(): + return get_slopes_power_of_2(heads) + + closest_power_of_2 = 2 ** math.floor(math.log2(heads)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][ + :heads - closest_power_of_2] + + def forward(self, qk_dots): + h, i, j, device = *qk_dots.shape[-3:], qk_dots.device + + if exists(self.bias) and self.bias.shape[-1] >= j: + return qk_dots + self.bias[..., :j] + + bias = torch.arange(j, device=device) + bias = rearrange(bias, 'j -> () () () j') + bias = bias * self.slopes + + num_heads_unalibied = h - bias.shape[1] + bias = F.pad(bias, (0, 0, 0, 0, 0, num_heads_unalibied)) + + self.register_buffer('bias', bias, persistent=False) + return qk_dots + self.bias + + +class LearnedAlibiPositionalBias(AlibiPositionalBias): + def __init__(self, heads, bidirectional=False): + super().__init__(heads) + los_slopes = torch.log(self.slopes) + self.learned_logslopes = nn.Parameter(los_slopes) + + self.bidirectional = bidirectional + if self.bidirectional: + self.learned_logslopes_future = nn.Parameter(los_slopes) + + def forward(self, qk_dots): + h, i, j, device = *qk_dots.shape[-3:], qk_dots.device + + def get_slopes(param): + return F.pad(param.exp(), (0, 0, 0, 0, 0, h - param.shape[1])) + + if exists(self.bias) and self.bias.shape[-1] >= j: + bias = self.bias[..., :i, :j] + else: + i_arange = torch.arange(i, device=device) + j_arange = torch.arange(j, device=device) + bias = rearrange(j_arange, 'j -> 1 1 1 j') - rearrange(i_arange, 'i -> 1 1 i 1') + self.register_buffer('bias', bias, persistent=False) + + if self.bidirectional: + past_slopes = get_slopes(self.learned_logslopes) + future_slopes = get_slopes(self.learned_logslopes_future) + bias = torch.tril(bias * past_slopes) + torch.triu(bias * future_slopes) + else: + slopes = get_slopes(self.learned_logslopes) + bias = bias * slopes + + return qk_dots + bias + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, max_seq_len, device): + t = torch.arange(max_seq_len, device=device).type_as(self.inv_freq) + freqs = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + return rearrange(emb, 'n d -> () () n d') + + +def rotate_half(x): + x = rearrange(x, '... (j d) -> ... 
j d', j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t, freqs): + seq_len = t.shape[-2] + freqs = freqs[:, :, -seq_len:] + return (t * freqs.cos()) + (rotate_half(t) * freqs.sin()) + + +# norms + +class Scale(nn.Module): + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + scale_fn = lambda t: t * self.value + + if not isinstance(out, tuple): + return scale_fn(out) + + return (scale_fn(out[0]), *out[1:]) + + +class Rezero(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + self.g = nn.Parameter(torch.zeros(1)) + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + rezero_fn = lambda t: t * self.g + + if not isinstance(out, tuple): + return rezero_fn(out) + + return (rezero_fn(out[0]), *out[1:]) + + +class ScaleNorm(nn.Module): + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSNorm(nn.Module): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSScaleShiftNorm(nn.Module): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + self.scale_shift_process = nn.Linear(dim * 2, dim * 2) + + def forward(self, x, norm_scale_shift_inp): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + norm = x / norm.clamp(min=self.eps) * self.g + + ss_emb = self.scale_shift_process(norm_scale_shift_inp) + scale, shift = torch.chunk(ss_emb, 2, dim=1) + h = norm * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + return h + + +# residual and residual gates + +class Residual(nn.Module): + def __init__(self, dim, scale_residual=False): + super().__init__() + self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None + + def forward(self, x, residual): + if exists(self.residual_scale): + residual = residual * self.residual_scale + + return x + residual + + +class GRUGating(nn.Module): + def __init__(self, dim, scale_residual=False): + super().__init__() + self.gru = nn.GRUCell(dim, dim) + self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None + + def forward(self, x, residual): + if exists(self.residual_scale): + residual = residual * self.residual_scale + + gated_output = self.gru( + rearrange(x, 'b n d -> (b n) d'), + rearrange(residual, 'b n d -> (b n) d') + ) + + return gated_output.reshape_as(x) + + +# token shifting + +def shift(t, amount, mask=None): + if amount == 0: + return t + + if exists(mask): + t = t.masked_fill(~mask[..., None], 0.) + + return F.pad(t, (0, 0, amount, -amount), value=0.) 
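+ # A minimal sketch (illustrative shapes only) of how the rotary helpers above compose:
+ # RotaryEmbedding yields per-position angles and apply_rotary_pos_emb rotates query/key
+ # features pairwise along the last dimension. In Attention.forward only the first
+ # rotary_emb_dim features of q/k/v are rotated.
+ #
+ #     rot = RotaryEmbedding(dim=32)
+ #     q = torch.randn(1, 8, 16, 32)                 # (batch, heads, seq, rotary dim)
+ #     freqs = rot(16, q.device)                     # angles, broadcastable to q
+ #     q_rot = apply_rotary_pos_emb(q, freqs)        # same shape, position encoded in phase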
+ + +class ShiftTokens(nn.Module): + def __init__(self, shifts, fn): + super().__init__() + self.fn = fn + self.shifts = tuple(shifts) + + def forward(self, x, **kwargs): + mask = kwargs.get('mask', None) + shifts = self.shifts + segments = len(shifts) + feats_per_shift = x.shape[-1] // segments + splitted = x.split(feats_per_shift, dim=-1) + segments_to_shift, rest = splitted[:segments], splitted[segments:] + segments_to_shift = list(map(lambda args: shift(*args, mask=mask), zip(segments_to_shift, shifts))) + x = torch.cat((*segments_to_shift, *rest), dim=-1) + return self.fn(x, **kwargs) + + +# feedforward + +class GLU(nn.Module): + def __init__(self, dim_in, dim_out, activation): + super().__init__() + self.act = activation + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * self.act(gate) + + +class FeedForward(nn.Module): + def __init__( + self, + dim, + dim_out=None, + mult=4, + glu=False, + relu_squared=False, + post_act_ln=False, + dropout=0., + zero_init_output=False + ): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + activation = ReluSquared() if relu_squared else nn.GELU() + + project_in = nn.Sequential( + nn.Linear(dim, inner_dim), + activation + ) if not glu else GLU(dim, inner_dim, activation) + + self.net = nn.Sequential( + project_in, + nn.LayerNorm(inner_dim) if post_act_ln else nn.Identity(), + nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out) + ) + + # init last linear layer to 0 + if zero_init_output: + init_zero_(self.net[-1]) + + def forward(self, x): + return self.net(x) + + +# attention. + +class Attention(nn.Module): + def __init__( + self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + talking_heads=False, + head_scale=False, + collab_heads=False, + collab_compression=.3, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0., + on_attn=False, + gate_values=False, + zero_init_output=False, + max_attend_past=None, + qk_norm=False, + scale_init_value=None, + rel_pos_bias=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + ): + super().__init__() + self.scale = dim_head ** -0.5 + + self.heads = heads + self.causal = causal + self.max_attend_past = max_attend_past + + qk_dim = v_dim = dim_head * heads + + # collaborative heads + self.collab_heads = collab_heads + if self.collab_heads: + qk_dim = int(collab_compression * qk_dim) + self.collab_mixing = nn.Parameter(torch.randn(heads, qk_dim)) + + self.to_q = nn.Linear(dim, qk_dim, bias=False) + self.to_k = nn.Linear(dim, qk_dim, bias=False) + self.to_v = nn.Linear(dim, v_dim, bias=False) + + self.dropout = nn.Dropout(dropout) + + # add GLU gating for aggregated values, from alphafold2 + self.to_v_gate = None + if gate_values: + self.to_v_gate = nn.Linear(dim, v_dim) + nn.init.constant_(self.to_v_gate.weight, 0) + nn.init.constant_(self.to_v_gate.bias, 1) + + # cosine sim attention + self.qk_norm = qk_norm + if qk_norm: + scale_init_value = default(scale_init_value, + -3) # if not provided, initialize as though it were sequence length of 1024 + self.scale = nn.Parameter(torch.ones(1, heads, 1, 1) * scale_init_value) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + + # head scaling + self.head_scale = head_scale + if head_scale: + self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1)) + + # 
explicit topk sparse attention + self.sparse_topk = sparse_topk + + # entmax + self.attn_fn = F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear(v_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(v_dim, dim) + + self.rel_pos_bias = rel_pos_bias + if rel_pos_bias: + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance' + self.rel_pos = RelativePositionBias(scale=dim_head ** 0.5, causal=causal, heads=heads, + num_buckets=rel_pos_num_buckets, max_distance=rel_pos_max_distance) + + # init output projection 0 + if zero_init_output: + init_zero_(self.to_out) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + attn_mask=None, + sinusoidal_emb=None, + rotary_pos_emb=None, + prev_attn=None, + mem=None, + layer_past=None, + ): + b, n, _, h, talking_heads, collab_heads, head_scale, scale, device, has_context = *x.shape, self.heads, self.talking_heads, self.collab_heads, self.head_scale, self.scale, x.device, exists( + context) + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = torch.cat((mem, k_input), dim=-2) + v_input = torch.cat((mem, v_input), dim=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + if not collab_heads: + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) + else: + q = einsum('b i d, h d -> b h i d', q, self.collab_mixing) + k = rearrange(k, 'b n d -> b () n d') + v = rearrange(v, 'b n (h d) -> b h n d', h=h) + + if layer_past is not None: + past_key, past_value = layer_past + k = torch.cat([past_key, k], dim=-2) + v = torch.cat([past_value, v], dim=-2) + k_cache = k + v_cache = v + + if exists(rotary_pos_emb) and not has_context: + l = rotary_pos_emb.shape[-1] + (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v)) + ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl)) + q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, qr), (kl, kr), (vl, vr))) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool()) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool()) + q_mask = rearrange(q_mask, 'b i -> b () i ()') + k_mask = rearrange(k_mask, 'b j -> b () () j') + input_mask = q_mask * k_mask + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v)) + k = torch.cat((mem_k, k), dim=-2) + v = torch.cat((mem_v, v), dim=-2) + if exists(input_mask): + input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True) + + if collab_heads: + k = k.expand(-1, h, -1, -1) + + if self.qk_norm: + q, k = map(l2norm, (q, k)) + scale = 1 / (self.scale.exp().clamp(min=1e-2)) + + dots = einsum('b h i d, b h j d -> 
b h i j', q, k) * scale + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots.clone() + + if talking_heads: + dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous() + + if self.rel_pos_bias: + dots = self.rel_pos(dots) + + if exists(input_mask): + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if exists(attn_mask): + assert 2 <= attn_mask.ndim <= 4, 'attention mask must have greater than 2 dimensions but less than or equal to 4' + if attn_mask.ndim == 2: + attn_mask = rearrange(attn_mask, 'i j -> () () i j') + elif attn_mask.ndim == 3: + attn_mask = rearrange(attn_mask, 'h i j -> () h i j') + dots.masked_fill_(~attn_mask, mask_value) + + if exists(self.max_attend_past): + i, j = dots.shape[-2:] + range_q = torch.arange(j - i, j, device=device) + range_k = torch.arange(j, device=device) + dist = rearrange(range_q, 'i -> () () i ()') - rearrange(range_k, 'j -> () () () j') + mask = dist > self.max_attend_past + dots.masked_fill_(mask, mask_value) + del mask + + if self.causal: + i, j = dots.shape[-2:] + r = torch.arange(i, device=device) + mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j') + mask = F.pad(mask, (j - i, 0), value=False) + dots.masked_fill_(mask, mask_value) + del mask + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, dim=-1) + post_softmax_attn = attn.clone() + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous() + + out = einsum('b h i j, b h j d -> b h i d', attn, v) + + if head_scale: + out = out * self.head_scale_params + + out = rearrange(out, 'b h n d -> b n (h d)') + + if exists(self.to_v_gate): + gates = self.to_v_gate(x) + out = out * gates.sigmoid() + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, + post_softmax_attn=post_softmax_attn + ) + + return self.to_out(out), intermediates, k_cache, v_cache + + +class AttentionLayers(nn.Module): + def __init__( + self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rms_scaleshift_norm=False, + use_rmsnorm=False, + use_rezero=False, + alibi_pos_bias=False, + alibi_num_heads=None, + alibi_learned=False, + position_infused_attn=False, + rotary_pos_emb=False, + rotary_emb_dim=None, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + scale_residual=False, + shift_tokens=0, + sandwich_norm=False, + use_qk_norm_attn=False, + qk_norm_attn_seq_len=None, + zero_init_branch_output=False, + **kwargs + ): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) + + dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + self.causal = causal + + rel_pos_bias = 'rel_pos_bias' in attn_kwargs + self.has_pos_emb = position_infused_attn or rel_pos_bias or rotary_pos_emb + self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None + + rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) + self.rotary_pos_emb = 
RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None + + assert not ( + alibi_pos_bias and rel_pos_bias), 'you can only choose Alibi positional bias or T5 relative positional bias, not both' + + if alibi_pos_bias: + alibi_num_heads = default(alibi_num_heads, heads) + assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads' + alibi_pos_klass = LearnedAlibiPositionalBias if alibi_learned or not causal else AlibiPositionalBias + self.rel_pos = alibi_pos_klass(heads=alibi_num_heads, bidirectional=not causal) + else: + self.rel_pos = None + + assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm' + self.pre_norm = pre_norm + self.sandwich_norm = sandwich_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + self.cross_attend = cross_attend + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_class = RMSScaleShiftNorm if use_rms_scaleshift_norm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f',) + default_block + + # qk normalization + + if use_qk_norm_attn: + attn_scale_init_value = -math.log(math.log2(qk_norm_attn_seq_len ** 2 - qk_norm_attn_seq_len)) if exists( + qk_norm_attn_seq_len) else None + attn_kwargs = {**attn_kwargs, 'qk_norm': True, 'scale_init_value': attn_scale_init_value} + + # zero init + + if zero_init_branch_output: + attn_kwargs = {**attn_kwargs, 'zero_init_output': True} + ff_kwargs = {**ff_kwargs, 'zero_init_output': True} + + # calculate layer block order + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len(default_block) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f',) * (par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f',) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals('a'), layer_types))) + + # calculate token shifting + + shift_tokens = cast_tuple(shift_tokens, len(layer_types)) + + # iterate and construct layers + + for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)): + is_last_layer = ind == (len(self.layer_types) - 1) + + if layer_type == 'a': + layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads=heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron 
else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if layer_shift_tokens > 0: + shift_range_upper = layer_shift_tokens + 1 + shift_range_lower = -layer_shift_tokens if not causal else 0 + layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer) + + if exists(branch_fn): + layer = branch_fn(layer) + + residual_fn = GRUGating if gate_residual else Residual + residual = residual_fn(dim, scale_residual=scale_residual) + + layer_uses_qk_norm = use_qk_norm_attn and layer_type in ('a', 'c') + + pre_branch_norm = norm_fn() if pre_norm and not layer_uses_qk_norm else None + post_branch_norm = norm_fn() if sandwich_norm or layer_uses_qk_norm else None + post_main_norm = norm_fn() if not pre_norm and not is_last_layer else None + + norms = nn.ModuleList([ + pre_branch_norm, + post_branch_norm, + post_main_norm + ]) + + self.layers.append(nn.ModuleList([ + norms, + layer, + residual + ])) + + def forward( + self, + x, + context=None, + full_context=None, # for passing a list of hidden states from an encoder + mask=None, + context_mask=None, + attn_mask=None, + mems=None, + return_hiddens=False, + norm_scale_shift_inp=None, + past_key_values=None, + expected_seq_len=None, + ): + + assert not (self.cross_attend ^ (exists(context) or exists( + full_context))), 'context must be passed in if cross_attend is set to True' + assert context is None or full_context is None, 'only one of full_context or context can be provided' + + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + norm_args = {} + if exists(norm_scale_shift_inp): + norm_args['norm_scale_shift_inp'] = norm_scale_shift_inp + + rotary_pos_emb = None + if exists(self.rotary_pos_emb): + if not self.training and self.causal: + assert expected_seq_len is not None, "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + elif expected_seq_len is None: + expected_seq_len = 0 + seq_len = x.shape[1] + if past_key_values is not None: + seq_len += past_key_values[0][0].shape[-2] + max_rotary_emb_length = max(list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len]) + rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device) + + present_key_values = [] + cross_attn_count = 0 + for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)): + if layer_type == 'a': + layer_mem = mems.pop(0) if mems else None + + residual = x + + pre_branch_norm, post_branch_norm, post_main_norm = norm + + if exists(pre_branch_norm): + x = pre_branch_norm(x, **norm_args) + + if layer_type == 'a' or layer_type == 'c': + if past_key_values is not None: + layer_kv = past_key_values.pop(0) + layer_past = tuple(s.to(x.device) for s in layer_kv) + else: + layer_past = None + + if layer_type == 'a': + out, inter, k, v = checkpoint(block, x, None, mask, None, attn_mask, self.pia_pos_emb, rotary_pos_emb, + prev_attn, layer_mem, layer_past) + elif layer_type == 'c': + if exists(full_context): + out, inter, k, v = checkpoint(block, x, full_context[cross_attn_count], mask, context_mask, None, None, + None, prev_attn, None, layer_past) + else: + out, inter, k, v = checkpoint(block, x, context, mask, context_mask, None, None, None, prev_attn, None, layer_past) + elif layer_type == 'f': + out = checkpoint(block, x) + + if layer_type == 'a' or layer_type == 'c' and present_key_values is not None: + 
present_key_values.append((k.detach(), v.detach())) + + if exists(post_branch_norm): + out = post_branch_norm(out, **norm_args) + + x = residual_fn(out, residual) + + if layer_type in ('a', 'c'): + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if exists(post_main_norm): + x = post_main_norm(x, **norm_args) + + if layer_type == 'c': + cross_attn_count += 1 + + if layer_type == 'f': + hiddens.append(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, + attn_intermediates=intermediates, + past_key_values=present_key_values + ) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on encoder' + super().__init__(causal=False, **kwargs) + + +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on decoder' + super().__init__(causal=True, **kwargs) + + +class CrossAttender(AttentionLayers): + def __init__(self, **kwargs): + super().__init__(cross_attend=True, only_cross=True, **kwargs) + + +class ViTransformerWrapper(nn.Module): + def __init__( + self, + *, + image_size, + patch_size, + attn_layers, + num_classes=None, + dropout=0., + emb_dropout=0. + ): + super().__init__() + assert isinstance(attn_layers, Encoder), 'attention layers must be an Encoder' + assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size' + dim = attn_layers.dim + num_patches = (image_size // patch_size) ** 2 + patch_dim = 3 * patch_size ** 2 + + self.patch_size = patch_size + + self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim)) + self.patch_to_embedding = nn.Linear(patch_dim, dim) + self.cls_token = nn.Parameter(torch.randn(1, 1, dim)) + self.dropout = nn.Dropout(emb_dropout) + + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + self.mlp_head = FeedForward(dim, dim_out=num_classes, dropout=dropout) if exists(num_classes) else None + + def forward( + self, + img, + return_embeddings=False + ): + p = self.patch_size + + x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p) + x = self.patch_to_embedding(x) + b, n, _ = x.shape + + cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embedding[:, :(n + 1)] + x = self.dropout(x) + + x = self.attn_layers(x) + x = self.norm(x) + + if not exists(self.mlp_head) or return_embeddings: + return x + + return self.mlp_head(x[:, 0]) + + +class TransformerWrapper(nn.Module): + def __init__( + self, + *, + num_tokens, + max_seq_len, + attn_layers, + emb_dim=None, + max_mem_len=0., + shift_mem_down=0, + emb_dropout=0., + num_memory_tokens=None, + tie_embedding=False, + use_pos_emb=True + ): + super().__init__() + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + self.shift_mem_down = shift_mem_down + + self.token_emb = nn.Embedding(num_tokens, emb_dim) + self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != 
dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.init_() + + self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim)) + + def init_(self): + nn.init.kaiming_normal_(self.token_emb.weight) + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_hiddens=False, + return_attn=False, + mems=None, + use_cache=False, + **kwargs + ): + b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens + x = self.token_emb(x) + x = x + self.pos_emb(x) + x = self.emb_dropout(x) + + x = self.project_emb(x) + + if num_mem > 0: + mem = repeat(self.memory_tokens, 'n d -> b n d', b=b) + x = torch.cat((mem, x), dim=1) + + # auto-handle masking after appending memory tokens + if exists(mask): + mask = F.pad(mask, (num_mem, 0), value=True) + + if self.shift_mem_down and exists(mems): + mems_l, mems_r = mems[:self.shift_mem_down], mems[self.shift_mem_down:] + mems = [*mems_r, *mems_l] + + x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + mem, x = x[:, :num_mem], x[:, num_mem:] + + out = self.to_logits(x) if not return_embeddings else x + + if return_hiddens: + hiddens = intermediates.hiddens + return out, hiddens + + res = [out] + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + res.append(attn_maps) + if use_cache: + res.append(intermediates.past_key_values) + + if len(res) > 1: + return tuple(res) + return res[0] + + +class ContinuousTransformerWrapper(nn.Module): + def __init__( + self, + *, + max_seq_len, + attn_layers, + dim_in=None, + dim_out=None, + emb_dim=None, + emb_dropout=0., + use_pos_emb=True + ): + super().__init__() + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + + self.max_seq_len = max_seq_len + + self.pos_emb = AbsolutePositionalEmbedding(dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_in = nn.Linear(dim_in, dim) if exists(dim_in) else nn.Identity() + + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.project_out = nn.Linear(dim, dim_out) if exists(dim_out) else nn.Identity() + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_attn=False, + mems=None, + use_cache=False, + **kwargs + ): + b, n, _, device = *x.shape, x.device + + x = self.project_in(x) + x = x + self.pos_emb(x) + x = self.emb_dropout(x) + + x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + out = self.project_out(x) if not return_embeddings else x + + res = [out] + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + res.append(attn_maps) + if use_cache: + res.append(intermediates.past_key_values) + + if len(res) > 1: + return tuple(res) + return res[0] + diff --git a/tortoise/read.py b/tortoise/read.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee9ad6e1c79d4a1953f7f80fcb511f8296554ab --- /dev/null +++ b/tortoise/read.py @@ -0,0 +1,77 @@ +import argparse 
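+# Long-form reading script: split a text file into sentence-sized chunks, synthesize each chunk
+# with the selected voice(s), and concatenate the clips into one recording, e.g.
+#   python tortoise/read.py --textfile tortoise/data/riding_hood.txt --voice pat --preset standard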
+import os + +import torch +import torchaudio + +from api import TextToSpeech +from tortoise.utils.audio import load_audio, get_voices, load_voices + + +def split_and_recombine_text(texts, desired_length=200, max_len=300): + # TODO: also split across '!' and '?'. Attempt to keep quotations together. + texts = [s.strip() + "." for s in texts.split('.')] + + i = 0 + while i < len(texts): + ltxt = texts[i] + if len(ltxt) >= desired_length or i == len(texts)-1: + i += 1 + continue + if len(ltxt) + len(texts[i+1]) > max_len: + i += 1 + continue + texts[i] = f'{ltxt} {texts[i+1]}' + texts.pop(i+1) + return texts + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="tortoise/data/riding_hood.txt") + parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) ' + 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat') + parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/') + parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard') + parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None) + parser.add_argument('--voice_diversity_intelligibility_slider', type=float, + help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility', + default=.5) + parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. 
Tortoise automatically downloads these to .models, so this' + 'should only be specified if you have custom checkpoints.', default='.models') + args = parser.parse_args() + tts = TextToSpeech(models_dir=args.model_dir) + + outpath = args.output_path + selected_voices = args.voice.split(',') + regenerate = args.regenerate + if regenerate is not None: + regenerate = [int(e) for e in regenerate.split(',')] + + for selected_voice in selected_voices: + voice_outpath = os.path.join(outpath, selected_voice) + os.makedirs(voice_outpath, exist_ok=True) + + with open(args.textfile, 'r', encoding='utf-8') as f: + text = ''.join([l for l in f.readlines()]) + texts = split_and_recombine_text(text) + + if '&' in selected_voice: + voice_sel = selected_voice.split('&') + else: + voice_sel = [selected_voice] + + voice_samples, conditioning_latents = load_voices(voice_sel) + all_parts = [] + for j, text in enumerate(texts): + if regenerate is not None and j not in regenerate: + all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000)) + continue + gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, + preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider) + gen = gen.squeeze(0).cpu() + torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000) + all_parts.append(gen) + full_audio = torch.cat(all_parts, dim=-1) + torchaudio.save(os.path.join(voice_outpath, 'combined.wav'), full_audio, 24000) + diff --git a/tortoise/utils/__init__.py b/tortoise/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..e402910c4b3dcafac82f77740256873324ff735d --- /dev/null +++ b/tortoise/utils/audio.py @@ -0,0 +1,179 @@ +import os +from glob import glob + +import librosa +import torch +import torchaudio +import numpy as np +from scipy.io.wavfile import read + +from tortoise.utils.stft import STFT + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + if data.dtype == np.int32: + norm_fix = 2 ** 31 + elif data.dtype == np.int16: + norm_fix = 2 ** 15 + elif data.dtype == np.float16 or data.dtype == np.float32: + norm_fix = 1. + else: + raise NotImplemented(f"Provided data dtype not supported: {data.dtype}") + return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate) + + +def load_audio(audiopath, sampling_rate): + if audiopath[-4:] == '.wav': + audio, lsr = load_wav_to_torch(audiopath) + elif audiopath[-4:] == '.mp3': + audio, lsr = librosa.load(audiopath, sr=sampling_rate) + audio = torch.FloatTensor(audio) + + # Remove any channel data. + if len(audio.shape) > 1: + if audio.shape[0] < 5: + audio = audio[0] + else: + assert audio.shape[1] < 5 + audio = audio[:, 0] + + if lsr != sampling_rate: + audio = torchaudio.functional.resample(audio, lsr, sampling_rate) + + # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk. + # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds. + if torch.any(audio > 2) or not torch.any(audio < 0): + print(f"Error with {audiopath}. 
Max={audio.max()} min={audio.min()}") + audio.clip_(-1, 1) + + return audio.unsqueeze(0) + + +TACOTRON_MEL_MAX = 2.3143386840820312 +TACOTRON_MEL_MIN = -11.512925148010254 + + +def denormalize_tacotron_mel(norm_mel): + return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN + + +def normalize_tacotron_mel(mel): + return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def get_voices(): + subs = os.listdir('tortoise/voices') + voices = {} + for sub in subs: + subj = os.path.join('tortoise/voices', sub) + if os.path.isdir(subj): + voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth')) + return voices + + +def load_voice(voice): + if voice == 'random': + return None, None + + voices = get_voices() + paths = voices[voice] + if len(paths) == 1 and paths[0].endswith('.pth'): + return None, torch.load(paths[0]) + else: + conds = [] + for cond_path in paths: + c = load_audio(cond_path, 22050) + conds.append(c) + return conds, None + + +def load_voices(voices): + latents = [] + clips = [] + for voice in voices: + if voice == 'random': + print("Cannot combine a random voice with a non-random voice. Just using a random voice.") + return None, None + clip, latent = load_voice(voice) + if latent is None: + assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + clips.extend(clip) + elif clip is None: + assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
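+            # Latent-style voices (a single .pth per voice) are collected here and averaged below.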
+ latents.append(latent) + if len(latents) == 0: + return clips, None + else: + latents = torch.stack(latents, dim=0) + return None, latents.mean(dim=0) + + +class TacotronSTFT(torch.nn.Module): + def __init__(self, filter_length=1024, hop_length=256, win_length=1024, + n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, + mel_fmax=8000.0): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.stft_fn = STFT(filter_length, hop_length, win_length) + from librosa.filters import mel as librosa_mel_fn + mel_basis = librosa_mel_fn( + sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer('mel_basis', mel_basis) + + def spectral_normalize(self, magnitudes): + output = dynamic_range_compression(magnitudes) + return output + + def spectral_de_normalize(self, magnitudes): + output = dynamic_range_decompression(magnitudes) + return output + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert(torch.min(y.data) >= -10) + assert(torch.max(y.data) <= 10) + y = torch.clip(y, min=-1, max=1) + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output + + +def wav_to_univnet_mel(wav, do_normalization=False): + stft = TacotronSTFT(1024, 256, 1024, 100, 24000, 0, 12000) + stft = stft.cuda() + mel = stft.mel_spectrogram(wav) + if do_normalization: + mel = normalize_tacotron_mel(mel) + return mel \ No newline at end of file diff --git a/tortoise/utils/diffusion.py b/tortoise/utils/diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..e877ff22de75c407f067ff2a6280e912eebf7a84 --- /dev/null +++ b/tortoise/utils/diffusion.py @@ -0,0 +1,1250 @@ +""" +This is an almost carbon copy of gaussian_diffusion.py from OpenAI's ImprovedDiffusion repo, which itself: + +This code started out as a PyTorch port of Ho et al's diffusion models: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py + +Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules. +""" + +import enum +import math + +import numpy as np +import torch +import torch as th +from tqdm import tqdm + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, th.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for th.exp(). + logvar1, logvar2 = [ + x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + th.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * th.exp(-logvar2) + ) + + +def approx_standard_normal_cdf(x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. 
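+    Uses the tanh approximation: Phi(x) ~= 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).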
+ """ + return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x, *, means, log_scales): + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. + + :param x: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + :param means: the Gaussian mean Tensor. + :param log_scales: the Gaussian log stddev Tensor. + :return: a tensor like x of log probabilities (in nats). + """ + assert x.shape == means.shape == log_scales.shape + centered_x = x - means + inv_stdv = th.exp(-log_scales) + plus_in = inv_stdv * (centered_x + 1.0 / 255.0) + cdf_plus = approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - 1.0 / 255.0) + cdf_min = approx_standard_normal_cdf(min_in) + log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = th.where( + x < -0.999, + log_cdf_plus, + th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), + ) + assert log_probs.shape == x.shape + return log_probs + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. + scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace( + beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64 + ) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. + """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +class ModelMeanType(enum.Enum): + """ + Which type of output the model predicts. + """ + + PREVIOUS_X = 'previous_x' # the model predicts x_{t-1} + START_X = 'start_x' # the model predicts x_0 + EPSILON = 'epsilon' # the model predicts epsilon + + +class ModelVarType(enum.Enum): + """ + What is used as the model's output variance. + + The LEARNED_RANGE option has been added to allow the model to predict + values between FIXED_SMALL and FIXED_LARGE, making its job easier. 
+ """ + + LEARNED = 'learned' + FIXED_SMALL = 'fixed_small' + FIXED_LARGE = 'fixed_large' + LEARNED_RANGE = 'learned_range' + + +class LossType(enum.Enum): + MSE = 'mse' # use raw MSE loss (and KL when learning variances) + RESCALED_MSE = 'rescaled_mse' # use raw MSE loss (with RESCALED_KL when learning variances) + KL = 'kl' # use the variational lower-bound + RESCALED_KL = 'rescaled_kl' # like KL, but rescale to estimate the full VLB + + def is_vb(self): + return self == LossType.KL or self == LossType.RESCALED_KL + + +class GaussianDiffusion: + """ + Utilities for training and sampling diffusion models. + + Ported directly from here, and then adapted over time to further experimentation. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 + + :param betas: a 1-D numpy array of betas for each diffusion timestep, + starting at T and going to 1. + :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + + def __init__( + self, + *, + betas, + model_mean_type, + model_var_type, + loss_type, + rescale_timesteps=False, + conditioning_free=False, + conditioning_free_k=1, + ramp_conditioning_free=True, + ): + self.model_mean_type = ModelMeanType(model_mean_type) + self.model_var_type = ModelVarType(model_var_type) + self.loss_type = LossType(loss_type) + self.rescale_timesteps = rescale_timesteps + self.conditioning_free = conditioning_free + self.conditioning_free_k = conditioning_free_k + self.ramp_conditioning_free = ramp_conditioning_free + + # Use float64 for accuracy. + betas = np.array(betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = ( + betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log( + np.append(self.posterior_variance[1], self.posterior_variance[1:]) + ) + self.posterior_mean_coef1 = ( + betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + self.posterior_mean_coef2 = ( + (1.0 - self.alphas_cumprod_prev) + * np.sqrt(alphas) + / (1.0 - self.alphas_cumprod) + ) + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] 
tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + ) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = _extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. + """ + if noise is None: + noise = th.randn_like(x_start) + assert noise.shape == x_start.shape + return ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) + * noise + ) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = ( + _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + assert ( + posterior_mean.shape[0] + == posterior_variance.shape[0] + == posterior_log_variance_clipped.shape[0] + == x_start.shape[0] + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance( + self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None + ): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. 
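+             When conditioning_free is enabled, the model is evaluated a second time without
+             conditioning and the two outputs are blended (classifier-free-guidance style)
+             before the mean and pred_xstart are derived.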
+ """ + if model_kwargs is None: + model_kwargs = {} + + B, C = x.shape[:2] + assert t.shape == (B,) + model_output = model(x, self._scale_timesteps(t), **model_kwargs) + if self.conditioning_free: + model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs) + + if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: + assert model_output.shape == (B, C * 2, *x.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + if self.conditioning_free: + model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1) + if self.model_var_type == ModelVarType.LEARNED: + model_log_variance = model_var_values + model_variance = th.exp(model_log_variance) + else: + min_log = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x.shape + ) + max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) + # The model_var_values is [-1, 1] for [min_var, max_var]. + frac = (model_var_values + 1) / 2 + model_log_variance = frac * max_log + (1 - frac) * min_log + model_variance = th.exp(model_log_variance) + else: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. + ModelVarType.FIXED_LARGE: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log(np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.FIXED_SMALL: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) + + if self.conditioning_free: + if self.ramp_conditioning_free: + assert t.shape[0] == 1 # This should only be used in inference. 
+ cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps) + else: + cfk = self.conditioning_free_k + model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning + + def process_xstart(x): + if denoised_fn is not None: + x = denoised_fn(x) + if clip_denoised: + return x.clamp(-1, 1) + return x + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + pred_xstart = process_xstart( + self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output) + ) + model_mean = model_output + elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: + if self.model_mean_type == ModelMeanType.START_X: + pred_xstart = process_xstart(model_output) + else: + pred_xstart = process_xstart( + self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) + ) + model_mean, _, _ = self.q_posterior_mean_variance( + x_start=pred_xstart, x_t=x, t=t + ) + else: + raise NotImplementedError(self.model_mean_type) + + assert ( + model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape + ) + return { + "mean": model_mean, + "variance": model_variance, + "log_variance": model_log_variance, + "pred_xstart": pred_xstart, + } + + def _predict_xstart_from_eps(self, x_t, t, eps): + assert x_t.shape == eps.shape + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps + ) + + def _predict_xstart_from_xprev(self, x_t, t, xprev): + assert x_t.shape == xprev.shape + return ( # (xprev - coef2*x_t) / coef1 + _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev + - _extract_into_tensor( + self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape + ) + * x_t + ) + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - pred_xstart + ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + return t.float() * (1000.0 / self.num_timesteps) + return t + + def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). + """ + gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) + new_mean = ( + p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() + ) + return new_mean + + def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). 
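+        Concretely, eps is shifted by -sqrt(1 - alpha_bar_t) * cond_fn(x, t) and the mean is
+        recomputed from the adjusted x_0 prediction.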
+ """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, self._scale_timesteps(t), **model_kwargs + ) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t + ) + return out + + def p_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. + """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. 
+ + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + + for i in tqdm(indices, disable=not progress): + t = th.tensor([i] * shape[0], device=device) + with th.no_grad(): + out = self.p_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = ( + eta + * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) + * th.sqrt(1 - alpha_bar / alpha_bar_prev) + ) + # Equation 12. + noise = th.randn_like(x) + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_prev) + + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps + ) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def ddim_reverse_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x + - out["pred_xstart"] + ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) + + # Equation 12. reversed + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_next) + + th.sqrt(1 - alpha_bar_next) * eps + ) + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). 
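+        With eta=0.0 (the default) the sampling trajectory is deterministic given the initial noise.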
+ """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices, disable=not progress) + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + with th.no_grad(): + out = self.ddim_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + ) + yield out + img = out["sample"] + + def _vb_terms_bpd( + self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None + ): + """ + Get a term for the variational lower-bound. + + The resulting units are bits (rather than nats, as one might expect). + This allows for comparison to other papers. + + :return: a dict with the following keys: + - 'output': a shape [N] tensor of NLLs or KLs. + - 'pred_xstart': the x_0 predictions. + """ + true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + ) + out = self.p_mean_variance( + model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs + ) + kl = normal_kl( + true_mean, true_log_variance_clipped, out["mean"], out["log_variance"] + ) + kl = mean_flat(kl) / np.log(2.0) + + decoder_nll = -discretized_gaussian_log_likelihood( + x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] + ) + assert decoder_nll.shape == x_start.shape + decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + + # At the first timestep return the decoder NLL, + # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) + output = th.where((t == 0), decoder_nll, kl) + return {"output": output, "pred_xstart": out["pred_xstart"]} + + def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. + :return: a dict with the key "loss" containing a tensor of shape [N]. + Some mean or variance settings may also have other keys. + """ + if model_kwargs is None: + model_kwargs = {} + if noise is None: + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start, t, noise=noise) + + terms = {} + + if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: + # TODO: support multiple model outputs for this mode. 
+ terms["loss"] = self._vb_terms_bpd( + model=model, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + model_kwargs=model_kwargs, + )["output"] + if self.loss_type == LossType.RESCALED_KL: + terms["loss"] *= self.num_timesteps + elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: + model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs) + if isinstance(model_outputs, tuple): + model_output = model_outputs[0] + terms['extra_outputs'] = model_outputs[1:] + else: + model_output = model_outputs + + if self.model_var_type in [ + ModelVarType.LEARNED, + ModelVarType.LEARNED_RANGE, + ]: + B, C = x_t.shape[:2] + assert model_output.shape == (B, C * 2, *x_t.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + # Learn the variance using the variational bound, but don't let + # it affect our mean prediction. + frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) + terms["vb"] = self._vb_terms_bpd( + model=lambda *args, r=frozen_out: r, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + )["output"] + if self.loss_type == LossType.RESCALED_MSE: + # Divide by 1000 for equivalence with initial implementation. + # Without a factor of 1/1000, the VB term hurts the MSE term. + terms["vb"] *= self.num_timesteps / 1000.0 + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + target = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + )[0] + x_start_pred = torch.zeros(x_start) # Not supported. + elif self.model_mean_type == ModelMeanType.START_X: + target = x_start + x_start_pred = model_output + elif self.model_mean_type == ModelMeanType.EPSILON: + target = noise + x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) + else: + raise NotImplementedError(self.model_mean_type) + assert model_output.shape == target.shape == x_start.shape + terms["mse"] = mean_flat((target - model_output) ** 2) + terms["x_start_predicted"] = x_start_pred + if "vb" in terms: + terms["loss"] = terms["mse"] + terms["vb"] + else: + terms["loss"] = terms["mse"] + else: + raise NotImplementedError(self.loss_type) + + return terms + + def autoregressive_training_losses(self, model, x_start, t, model_output_keys, gd_out_key, model_kwargs=None, noise=None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. + :return: a dict with the key "loss" containing a tensor of shape [N]. + Some mean or variance settings may also have other keys. + """ + if model_kwargs is None: + model_kwargs = {} + if noise is None: + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start, t, noise=noise) + terms = {} + if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: + assert False # not currently supported for this type of diffusion. 
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: + model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) + terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) + model_output = terms[gd_out_key] + if self.model_var_type in [ + ModelVarType.LEARNED, + ModelVarType.LEARNED_RANGE, + ]: + B, C = x_t.shape[:2] + assert model_output.shape == (B, C, 2, *x_t.shape[2:]) + model_output, model_var_values = model_output[:, :, 0], model_output[:, :, 1] + # Learn the variance using the variational bound, but don't let + # it affect our mean prediction. + frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) + terms["vb"] = self._vb_terms_bpd( + model=lambda *args, r=frozen_out: r, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + )["output"] + if self.loss_type == LossType.RESCALED_MSE: + # Divide by 1000 for equivalence with initial implementation. + # Without a factor of 1/1000, the VB term hurts the MSE term. + terms["vb"] *= self.num_timesteps / 1000.0 + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + target = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + )[0] + x_start_pred = torch.zeros(x_start) # Not supported. + elif self.model_mean_type == ModelMeanType.START_X: + target = x_start + x_start_pred = model_output + elif self.model_mean_type == ModelMeanType.EPSILON: + target = noise + x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) + else: + raise NotImplementedError(self.model_mean_type) + assert model_output.shape == target.shape == x_start.shape + terms["mse"] = mean_flat((target - model_output) ** 2) + terms["x_start_predicted"] = x_start_pred + if "vb" in terms: + terms["loss"] = terms["mse"] + terms["vb"] + else: + terms["loss"] = terms["mse"] + else: + raise NotImplementedError(self.loss_type) + + return terms + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + + This term can't be optimized, as it only depends on the encoder. + + :param x_start: the [N x C x ...] tensor of inputs. + :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl( + mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0 + ) + return mean_flat(kl_prior) / np.log(2.0) + + def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): + """ + Compute the entire variational lower-bound, measured in bits-per-dim, + as well as other related quantities. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param clip_denoised: if True, clip denoised samples. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + + :return: a dict containing the following keys: + - total_bpd: the total variational lower-bound, per batch element. + - prior_bpd: the prior term in the lower-bound. + - vb: an [N x T] tensor of terms in the lower-bound. + - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. + - mse: an [N x T] tensor of epsilon MSEs for each timestep. 
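+                 total_bpd equals prior_bpd plus the sum over timesteps of the vb terms.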
+ """ + device = x_start.device + batch_size = x_start.shape[0] + + vb = [] + xstart_mse = [] + mse = [] + for t in list(range(self.num_timesteps))[::-1]: + t_batch = th.tensor([t] * batch_size, device=device) + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) + # Calculate VLB term at the current timestep + with th.no_grad(): + out = self._vb_terms_bpd( + model, + x_start=x_start, + x_t=x_t, + t=t_batch, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + ) + vb.append(out["output"]) + xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) + eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) + mse.append(mean_flat((eps - noise) ** 2)) + + vb = th.stack(vb, dim=1) + xstart_mse = th.stack(xstart_mse, dim=1) + mse = th.stack(mse, dim=1) + + prior_bpd = self._prior_bpd(x_start) + total_bpd = vb.sum(dim=1) + prior_bpd + return { + "total_bpd": total_bpd, + "prior_bpd": prior_bpd, + "vb": vb, + "xstart_mse": xstart_mse, + "mse": mse, + } + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. + scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace( + beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64 + ) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +class SpacedDiffusion(GaussianDiffusion): + """ + A diffusion process which can skip steps in a base diffusion process. + + :param use_timesteps: a collection (sequence or set) of timesteps from the + original diffusion process to retain. + :param kwargs: the kwargs to create the base diffusion process. 
+ """ + + def __init__(self, use_timesteps, **kwargs): + self.use_timesteps = set(use_timesteps) + self.timestep_map = [] + self.original_num_steps = len(kwargs["betas"]) + + base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa + last_alpha_cumprod = 1.0 + new_betas = [] + for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): + if i in self.use_timesteps: + new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) + last_alpha_cumprod = alpha_cumprod + self.timestep_map.append(i) + kwargs["betas"] = np.array(new_betas) + super().__init__(**kwargs) + + def p_mean_variance( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) + + def training_losses( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().training_losses(self._wrap_model(model), *args, **kwargs) + + def autoregressive_training_losses( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs) + + def condition_mean(self, cond_fn, *args, **kwargs): + return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) + + def _wrap_model(self, model, autoregressive=False): + if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel): + return model + mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel + return mod( + model, self.timestep_map, self.rescale_timesteps, self.original_num_steps + ) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. + return t + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. + :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. 
+ """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim") :]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError( + f"cannot create exactly {num_timesteps} steps with an integer stride" + ) + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError( + f"cannot divide section of {size} steps into {section_count}" + ) + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +class _WrappedModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, new_ts, **kwargs) + + +class _WrappedAutoregressiveModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, x0, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, x0, new_ts, **kwargs) + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. + """ + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) \ No newline at end of file diff --git a/tortoise/utils/samples_generator.py b/tortoise/utils/samples_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..61d30141e1fe652fe1abcad61e5d21db11f88298 --- /dev/null +++ b/tortoise/utils/samples_generator.py @@ -0,0 +1,51 @@ +import os + +# This script builds the sample webpage. + +if __name__ == '__main__': + result = "These words were never spoken.

<h2>Handpicked results</h2>"
+    for fv in os.listdir('../../results/favorites'):
+        url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/favorites/{fv}'
+        result = result + f'<audio controls src="{url}"></audio><br>\n'
+
+    result = result + "<h2>Handpicked longform result:</h2>"
+    url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/favorite_riding_hood.mp3'
+    result = result + f'<audio controls src="{url}"></audio><br>\n'
+
+    result = result + "<h2>Compared to Tacotron2 (with the LJSpeech voice):</h2><table>"
+    for k in range(2,5,1):
+        url1 = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/tacotron_comparison/{k}-tacotron2.mp3'
+        url2 = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/tacotron_comparison/{k}-tortoise.mp3'
+        result = result + f'<tr><td><audio controls src="{url1}"></audio></td>' \
+                          f'<td><audio controls src="{url2}"></audio></td></tr>'
+    result = result + "<tr><th>Tacotron2+Waveglow</th><th>TorToiSe</th></tr>\n</table>\n"
+
+    result = result + "<h2>Various spoken texts for all voices:</h2><table>"
+    voices = ['angie', 'daniel', 'deniro', 'emma', 'freeman', 'geralt', 'halle', 'jlaw', 'lj', 'myself',
+              'pat', 'snakes', 'tom', 'train_atkins', 'train_dotrice', 'train_kennard', 'weaver', 'william']
+    lines = ['<tr><th>text</th>' + ''.join([f'<th>{v}</th>' for v in voices])]
+    line = f'<tr><td>reference clip</td>'
+    for v in voices:
+        url = f'https://github.com/neonbjb/tortoise-tts/raw/main/voices/{v}/1.wav'
+        line = line + f'<td><audio controls src="{url}"></audio></td>'
+    line = line + "</tr>"
+    lines.append(line)
+    for txt in os.listdir('../../results/various/'):
+        if 'desktop' in txt:
+            continue
+        line = f'<tr><td>{txt}</td>'
+        for v in voices:
+            url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/various/{txt}/{v}.mp3'
+            line = line + f'<td><audio controls src="{url}"></audio></td>'
+        line = line + "</tr>"
+        lines.append(line)
+    result = result + '\n'.join(lines) + "</table>"
+
+    result = result + "<h2>Longform result for all voices:</h2>"
+    for lf in os.listdir('../../results/riding_hood'):
+        url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/riding_hood/{lf}'
+        result = result + f'<audio controls src="{url}"></audio><br>
\n' + + result = result + "" + with open('result.html', 'w', encoding='utf-8') as f: + f.write(result) diff --git a/tortoise/utils/stft.py b/tortoise/utils/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..f54eb968225cfe5928cca6d7686abbcc3728a674 --- /dev/null +++ b/tortoise/utils/stft.py @@ -0,0 +1,193 @@ +""" +BSD 3-Clause License + +Copyright (c) 2017, Prem Seetharaman +All rights reserved. + +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +import librosa.util as librosa_util + + +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. 
+ + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm)**2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + + +class STFT(torch.nn.Module): + """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" + def __init__(self, filter_length=800, hop_length=200, win_length=800, + window='hann'): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])]) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :]) + + if window is not None: + assert(filter_length >= win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float()) + self.register_buffer('inverse_basis', inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + Variable(self.inverse_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + 
torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction \ No newline at end of file diff --git a/tortoise/utils/tokenizer.py b/tortoise/utils/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2f36a064f71388645b0a2f4a7a60eff983c683de --- /dev/null +++ b/tortoise/utils/tokenizer.py @@ -0,0 +1,187 @@ +import re + +import inflect +import torch +from tokenizers import Tokenizer + + +# Regular expression matching whitespace: +from unidecode import unidecode + +_whitespace_re = re.compile(r'\s+') + + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + 
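    # Substitution order matters: strip thousands separators first, then expand
    # currency amounts, decimals and ordinals, and only then spell out bare integers.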
text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + text = text.replace('"', '') + return text + +def lev_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + +class VoiceBpeTokenizer: + def __init__(self, vocab_file='tortoise/data/tokenizer.json'): + if vocab_file is not None: + self.tokenizer = Tokenizer.from_file(vocab_file) + + def preprocess_text(self, txt): + txt = english_cleaners(txt) + return txt + + def encode(self, txt): + txt = self.preprocess_text(txt) + txt = txt.replace(' ', '[SPACE]') + return self.tokenizer.encode(txt).ids + + def decode(self, seq): + if isinstance(seq, torch.Tensor): + seq = seq.cpu().numpy() + txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '') + txt = txt.replace('[SPACE]', ' ') + txt = txt.replace('[STOP]', '') + txt = txt.replace('[UNK]', '') + return txt \ No newline at end of file diff --git a/tortoise/utils/typical_sampling.py b/tortoise/utils/typical_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..ff6bf487947e88a55fa45f2ffec1b9540df1d4fd --- /dev/null +++ b/tortoise/utils/typical_sampling.py @@ -0,0 +1,33 @@ +import torch +from transformers import LogitsWarper + + +class TypicalLogitsWarper(LogitsWarper): + def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + self.filter_value = filter_value + self.mass = mass + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + # calculate entropy + normalized = torch.nn.functional.log_softmax(scores, dim=-1) + p = torch.exp(normalized) + ent = -(normalized * p).nansum(-1, keepdim=True) + + # shift and sort + shifted_scores = torch.abs((-normalized) - ent) + sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False) + sorted_logits = scores.gather(-1, sorted_indices) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Remove tokens with cumulative mass above the 
threshold + last_ind = (cumulative_probs < self.mass).sum(dim=1) + last_ind[last_ind < 0] = 0 + sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1)) + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores \ No newline at end of file diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4a3fbd0d8fcbcc5cc4eddfbb864d833cf69f63 --- /dev/null +++ b/tortoise/utils/wav2vec_alignment.py @@ -0,0 +1,145 @@ +import re + +import torch +import torchaudio +from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor + +from tortoise.utils.audio import load_audio + + +def max_alignment(s1, s2, skip_character='~', record={}): + """ + A clever function that aligns s1 to s2 as best it can. Wherever a character from s1 is not found in s2, a '~' is + used to replace that character. + + Finally got to use my DP skills! + """ + assert skip_character not in s1, f"Found the skip character {skip_character} in the provided string, {s1}" + if len(s1) == 0: + return '' + if len(s2) == 0: + return skip_character * len(s1) + if s1 == s2: + return s1 + if s1[0] == s2[0]: + return s1[0] + max_alignment(s1[1:], s2[1:], skip_character, record) + + take_s1_key = (len(s1), len(s2) - 1) + if take_s1_key in record: + take_s1, take_s1_score = record[take_s1_key] + else: + take_s1 = max_alignment(s1, s2[1:], skip_character, record) + take_s1_score = len(take_s1.replace(skip_character, '')) + record[take_s1_key] = (take_s1, take_s1_score) + + take_s2_key = (len(s1) - 1, len(s2)) + if take_s2_key in record: + take_s2, take_s2_score = record[take_s2_key] + else: + take_s2 = max_alignment(s1[1:], s2, skip_character, record) + take_s2_score = len(take_s2.replace(skip_character, '')) + record[take_s2_key] = (take_s2, take_s2_score) + + return take_s1 if take_s1_score > take_s2_score else skip_character + take_s2 + + +class Wav2VecAlignment: + """ + Uses wav2vec2 to perform audio<->text alignment. + """ + def __init__(self): + self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu() + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h") + self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('jbetker/tacotron_symbols') + + def align(self, audio, expected_text, audio_sample_rate=24000): + orig_len = audio.shape[-1] + + with torch.no_grad(): + self.model = self.model.cuda() + audio = audio.to('cuda') + audio = torchaudio.functional.resample(audio, audio_sample_rate, 16000) + clip_norm = (audio - audio.mean()) / torch.sqrt(audio.var() + 1e-7) + logits = self.model(clip_norm).logits + self.model = self.model.cpu() + + logits = logits[0] + pred_string = self.tokenizer.decode(logits.argmax(-1).tolist()) + + fixed_expectation = max_alignment(expected_text, pred_string) + w2v_compression = orig_len // logits.shape[0] + expected_tokens = self.tokenizer.encode(fixed_expectation) + expected_chars = list(fixed_expectation) + if len(expected_tokens) == 1: + return [0] # The alignment is simple; there is only one token. 
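        # Greedy alignment pass: walk the wav2vec2 logits and record, for each expected
        # token, the audio offset (frame index times the compression factor) of the first
        # frame whose argmax matches it. Tokens that correspond to the skip character '~'
        # get a placeholder of -1 and are linearly interpolated further below.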
+ expected_tokens.pop(0) # The first token is a given. + expected_chars.pop(0) + + alignments = [0] + def pop_till_you_win(): + if len(expected_tokens) == 0: + return None + popped = expected_tokens.pop(0) + popped_char = expected_chars.pop(0) + while popped_char == '~': + alignments.append(-1) + if len(expected_tokens) == 0: + return None + popped = expected_tokens.pop(0) + popped_char = expected_chars.pop(0) + return popped + + next_expected_token = pop_till_you_win() + for i, logit in enumerate(logits): + top = logit.argmax() + if next_expected_token == top: + alignments.append(i * w2v_compression) + if len(expected_tokens) > 0: + next_expected_token = pop_till_you_win() + else: + break + + pop_till_you_win() + assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks." + + # Now fix up alignments. Anything with -1 should be interpolated. + alignments.append(orig_len) # This'll get removed but makes the algorithm below more readable. + for i in range(len(alignments)): + if alignments[i] == -1: + for j in range(i+1, len(alignments)): + if alignments[j] != -1: + next_found_token = j + break + for j in range(i, next_found_token): + gap = alignments[next_found_token] - alignments[i-1] + alignments[j] = (j-i+1) * gap // (next_found_token-i+1) + alignments[i-1] + + return alignments[:-1] + + def redact(self, audio, expected_text, audio_sample_rate=24000): + if '[' not in expected_text: + return audio + splitted = expected_text.split('[') + fully_split = [splitted[0]] + for spl in splitted[1:]: + assert ']' in spl, 'Every "[" character must be paired with a "]" with no nesting.' + fully_split.extend(spl.split(']')) + + # At this point, fully_split is a list of strings, with every other string being something that should be redacted. + non_redacted_intervals = [] + last_point = 0 + for i in range(len(fully_split)): + if i % 2 == 0: + end_interval = max(0, last_point + len(fully_split[i]) - 1) + non_redacted_intervals.append((last_point, end_interval)) + last_point += len(fully_split[i]) + + bare_text = ''.join(fully_split) + alignments = self.align(audio, bare_text, audio_sample_rate) + + output_audio = [] + for nri in non_redacted_intervals: + start, stop = nri + output_audio.append(audio[:, alignments[start]:alignments[stop]]) + return torch.cat(output_audio, dim=-1) + diff --git a/tortoise/voices/.gitattributes b/tortoise/voices/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..28b59546b1e2607855f07463d38b51ec0968c2ec --- /dev/null +++ b/tortoise/voices/.gitattributes @@ -0,0 +1,23 @@ +halle filter=lfs diff=lfs merge=lfs -text +train_dotrice filter=lfs diff=lfs merge=lfs -text +train_kennard filter=lfs diff=lfs merge=lfs -text +train_atkins filter=lfs diff=lfs merge=lfs -text +daniel filter=lfs diff=lfs merge=lfs -text +geralt filter=lfs diff=lfs merge=lfs -text +pat filter=lfs diff=lfs merge=lfs -text +snakes filter=lfs diff=lfs merge=lfs -text +train_grace filter=lfs diff=lfs merge=lfs -text +angie filter=lfs diff=lfs merge=lfs -text +deniro filter=lfs diff=lfs merge=lfs -text +emma filter=lfs diff=lfs merge=lfs -text +freeman filter=lfs diff=lfs merge=lfs -text +jlaw filter=lfs diff=lfs merge=lfs -text +lj filter=lfs diff=lfs merge=lfs -text +mol filter=lfs diff=lfs merge=lfs -text +myself filter=lfs diff=lfs merge=lfs -text +pat2 filter=lfs diff=lfs merge=lfs -text +tim_reynolds filter=lfs diff=lfs merge=lfs -text +tom filter=lfs diff=lfs merge=lfs -text +train_lescault filter=lfs diff=lfs merge=lfs -text +weaver filter=lfs diff=lfs 
merge=lfs -text +william filter=lfs diff=lfs merge=lfs -text diff --git a/tortoise/voices/angie/1.wav b/tortoise/voices/angie/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..6f8480a81995bd81880933e4429ad031004c9766 Binary files /dev/null and b/tortoise/voices/angie/1.wav differ diff --git a/tortoise/voices/angie/3.wav b/tortoise/voices/angie/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..a5e3fc1fe8fd7b57179e4b06cd0414bd7b46ce57 Binary files /dev/null and b/tortoise/voices/angie/3.wav differ diff --git a/tortoise/voices/daniel/1.wav b/tortoise/voices/daniel/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..eb0881ef39ff4246ef08f38e805d17ef16295d61 Binary files /dev/null and b/tortoise/voices/daniel/1.wav differ diff --git a/tortoise/voices/daniel/2.wav b/tortoise/voices/daniel/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..677435944afc62d0592a1bf945e6e7d689343c14 Binary files /dev/null and b/tortoise/voices/daniel/2.wav differ diff --git a/tortoise/voices/daniel/3.wav b/tortoise/voices/daniel/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..9c1d7e928ee278396756d5c080c426e6abaaf403 Binary files /dev/null and b/tortoise/voices/daniel/3.wav differ diff --git a/tortoise/voices/daniel/4.wav b/tortoise/voices/daniel/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..bf34df273602c8daaad1245945784f827e3ddea0 Binary files /dev/null and b/tortoise/voices/daniel/4.wav differ diff --git a/tortoise/voices/deniro/1.wav b/tortoise/voices/deniro/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..391a4878e9f76a84b5f917de20fc8bd303071ad5 Binary files /dev/null and b/tortoise/voices/deniro/1.wav differ diff --git a/tortoise/voices/deniro/3.wav b/tortoise/voices/deniro/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..bbb5737b06b5bb75f76d105cadc3907575c5a880 Binary files /dev/null and b/tortoise/voices/deniro/3.wav differ diff --git a/tortoise/voices/deniro/4.wav b/tortoise/voices/deniro/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..9847b6edecbed48746eecfb7ce24f187817525a8 Binary files /dev/null and b/tortoise/voices/deniro/4.wav differ diff --git a/tortoise/voices/emma/1.wav b/tortoise/voices/emma/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..5acab208fc1e9bcc6281fad4e8feb7996ee604a2 Binary files /dev/null and b/tortoise/voices/emma/1.wav differ diff --git a/tortoise/voices/emma/2.wav b/tortoise/voices/emma/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..ca7bfe92d369763ade070af32f43d487cb5ba674 Binary files /dev/null and b/tortoise/voices/emma/2.wav differ diff --git a/tortoise/voices/emma/3.wav b/tortoise/voices/emma/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..5b065fce7b70073d713e7b40856d1ba3b0f6c919 Binary files /dev/null and b/tortoise/voices/emma/3.wav differ diff --git a/tortoise/voices/freeman/1.wav b/tortoise/voices/freeman/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..0b6941e43e50025d177986f7176452144fb057c1 Binary files /dev/null and b/tortoise/voices/freeman/1.wav differ diff --git a/tortoise/voices/freeman/2.wav b/tortoise/voices/freeman/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..7377fd0852bfe9f35d559c05ab5342e598e25bd2 Binary files /dev/null and b/tortoise/voices/freeman/2.wav differ diff --git 
a/tortoise/voices/freeman/3.wav b/tortoise/voices/freeman/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..889cee806a08efc2de73caf488b6bc4bf352f497 Binary files /dev/null and b/tortoise/voices/freeman/3.wav differ diff --git a/tortoise/voices/geralt/1.wav b/tortoise/voices/geralt/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..b263cf4908ddb8c1a8b2cde3583304e2a0623fdd Binary files /dev/null and b/tortoise/voices/geralt/1.wav differ diff --git a/tortoise/voices/geralt/2.wav b/tortoise/voices/geralt/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..953459a8c2d8e5139225c63abac101521c3e212a Binary files /dev/null and b/tortoise/voices/geralt/2.wav differ diff --git a/tortoise/voices/geralt/3.wav b/tortoise/voices/geralt/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..5a40836146bb2cab346df20a5ffb90ba9cbd63e0 Binary files /dev/null and b/tortoise/voices/geralt/3.wav differ diff --git a/tortoise/voices/halle/1.wav b/tortoise/voices/halle/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..a023dab1c784225c7501d8bf3a3c5b67d0e46109 Binary files /dev/null and b/tortoise/voices/halle/1.wav differ diff --git a/tortoise/voices/halle/2.wav b/tortoise/voices/halle/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..07f738a7fe4e37d9e12a48ebbc563eaee65a7012 Binary files /dev/null and b/tortoise/voices/halle/2.wav differ diff --git a/tortoise/voices/halle/3.wav b/tortoise/voices/halle/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..8b7914492d254157f95d0f1d00095abf6b7521c7 Binary files /dev/null and b/tortoise/voices/halle/3.wav differ diff --git a/tortoise/voices/jlaw/1.wav b/tortoise/voices/jlaw/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..e749d0e494cf771b332437b924a4af0b8b92977d Binary files /dev/null and b/tortoise/voices/jlaw/1.wav differ diff --git a/tortoise/voices/jlaw/2.wav b/tortoise/voices/jlaw/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..7dd51de3468e815baf65e343ea4362614120bd11 Binary files /dev/null and b/tortoise/voices/jlaw/2.wav differ diff --git a/tortoise/voices/jlaw/3.wav b/tortoise/voices/jlaw/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..429230f47cba63ffc0282f4af0e1e19962069c25 Binary files /dev/null and b/tortoise/voices/jlaw/3.wav differ diff --git a/tortoise/voices/jlaw/4.wav b/tortoise/voices/jlaw/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..e475993dd2ebfbe01293f8abaffba3482e7edc09 Binary files /dev/null and b/tortoise/voices/jlaw/4.wav differ diff --git a/tortoise/voices/lj/1.wav b/tortoise/voices/lj/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..5d86776a4dd406fee2cfb07f87ddf09431f075a0 Binary files /dev/null and b/tortoise/voices/lj/1.wav differ diff --git a/tortoise/voices/lj/2.wav b/tortoise/voices/lj/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..75d66e4355401924a79260b7f990145288fd0c14 Binary files /dev/null and b/tortoise/voices/lj/2.wav differ diff --git a/tortoise/voices/mol/1.wav b/tortoise/voices/mol/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..b3244a5456f94172f199d4853fffcddc9351caf3 Binary files /dev/null and b/tortoise/voices/mol/1.wav differ diff --git a/tortoise/voices/mol/2.wav b/tortoise/voices/mol/2.wav new file mode 100644 index 
0000000000000000000000000000000000000000..b6d3928e189780584870292d43f4e1d53503a2bd Binary files /dev/null and b/tortoise/voices/mol/2.wav differ diff --git a/tortoise/voices/myself/1.wav b/tortoise/voices/myself/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..83b7f804f43ca482e68c569bfafc92d5bd4121a0 Binary files /dev/null and b/tortoise/voices/myself/1.wav differ diff --git a/tortoise/voices/myself/2.wav b/tortoise/voices/myself/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..faec7ddf078e7a340cb54aa399da21c0f9b21c0d Binary files /dev/null and b/tortoise/voices/myself/2.wav differ diff --git a/tortoise/voices/myself/3.wav b/tortoise/voices/myself/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..374c799874a4d0940fe5df2bd590daf2d5244e78 Binary files /dev/null and b/tortoise/voices/myself/3.wav differ diff --git a/tortoise/voices/pat/1.wav b/tortoise/voices/pat/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..8c80c24e1bb9fe59713c9b3f7c320a8568aed7ae Binary files /dev/null and b/tortoise/voices/pat/1.wav differ diff --git a/tortoise/voices/pat/2.wav b/tortoise/voices/pat/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..5503b1c0f2fb1b327eb7b8f7e43123702f916389 Binary files /dev/null and b/tortoise/voices/pat/2.wav differ diff --git a/tortoise/voices/pat/3.wav b/tortoise/voices/pat/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..ec4e85396b5eaa59d12967212cdf621346707a37 Binary files /dev/null and b/tortoise/voices/pat/3.wav differ diff --git a/tortoise/voices/pat/4.wav b/tortoise/voices/pat/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..5949dd2d1200b8103aeaaada8262ee359cad4669 Binary files /dev/null and b/tortoise/voices/pat/4.wav differ diff --git a/tortoise/voices/pat2/00100.mp3 b/tortoise/voices/pat2/00100.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..fd50dc458de5ee97b79124fda28c87ff3a28cbe3 Binary files /dev/null and b/tortoise/voices/pat2/00100.mp3 differ diff --git a/tortoise/voices/pat2/00112.mp3 b/tortoise/voices/pat2/00112.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..4b27bef85112dc59da0f44d7f4ee5a65851bf4c9 Binary files /dev/null and b/tortoise/voices/pat2/00112.mp3 differ diff --git a/tortoise/voices/pat2/00130.mp3 b/tortoise/voices/pat2/00130.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..36b5e5487dffbeade03737e87fe207aef971981d Binary files /dev/null and b/tortoise/voices/pat2/00130.mp3 differ diff --git a/tortoise/voices/pat2/00159.mp3 b/tortoise/voices/pat2/00159.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..65b41e0dbc043527eafb11862d98022039f2df46 Binary files /dev/null and b/tortoise/voices/pat2/00159.mp3 differ diff --git a/tortoise/voices/snakes/00115.mp3 b/tortoise/voices/snakes/00115.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..e9770ba39eb810f439b7f417acf8d10f2c60b931 Binary files /dev/null and b/tortoise/voices/snakes/00115.mp3 differ diff --git a/tortoise/voices/snakes/00162.mp3 b/tortoise/voices/snakes/00162.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..503aa7274d23ce55d64d5bbcc9afaf331370b8e0 Binary files /dev/null and b/tortoise/voices/snakes/00162.mp3 differ diff --git a/tortoise/voices/snakes/03504.mp3 b/tortoise/voices/snakes/03504.mp3 new file mode 100644 index 
0000000000000000000000000000000000000000..bd4f03946102f92f89a487a1421efaccfbea344d Binary files /dev/null and b/tortoise/voices/snakes/03504.mp3 differ diff --git a/tortoise/voices/tim_reynolds/1.mp3 b/tortoise/voices/tim_reynolds/1.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..445db30cf554422be9b76b20617d264cb715f173 Binary files /dev/null and b/tortoise/voices/tim_reynolds/1.mp3 differ diff --git a/tortoise/voices/tim_reynolds/2.mp3 b/tortoise/voices/tim_reynolds/2.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6f09722dedffdd6829a13327d7536bb6938e3aea Binary files /dev/null and b/tortoise/voices/tim_reynolds/2.mp3 differ diff --git a/tortoise/voices/tim_reynolds/3.mp3 b/tortoise/voices/tim_reynolds/3.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..717a7ed0e6fd7488903043abade196c933ebad2e Binary files /dev/null and b/tortoise/voices/tim_reynolds/3.mp3 differ diff --git a/tortoise/voices/tim_reynolds/4.mp3 b/tortoise/voices/tim_reynolds/4.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..458d8121e810881589fd3f0d96ab27a4749e81f8 Binary files /dev/null and b/tortoise/voices/tim_reynolds/4.mp3 differ diff --git a/tortoise/voices/tom/1.wav b/tortoise/voices/tom/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..4e91bf9d20e2cbea2cdced3ad1133d8d7f2f6765 Binary files /dev/null and b/tortoise/voices/tom/1.wav differ diff --git a/tortoise/voices/tom/2.wav b/tortoise/voices/tom/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..fb3d38d47cfcc4debd4d1cab7c4788ba8c39d0d7 Binary files /dev/null and b/tortoise/voices/tom/2.wav differ diff --git a/tortoise/voices/tom/3.wav b/tortoise/voices/tom/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..07b0b14ceed1c0ff414a7473c9255bb31667ea40 Binary files /dev/null and b/tortoise/voices/tom/3.wav differ diff --git a/tortoise/voices/tom/4.wav b/tortoise/voices/tom/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..0c64b0ebeb84cf4dd489b1073596f3df0b145538 Binary files /dev/null and b/tortoise/voices/tom/4.wav differ diff --git a/tortoise/voices/train_atkins/1.wav b/tortoise/voices/train_atkins/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..cf721d3f6c9998e000f1b30ce27f6c86c698cf05 Binary files /dev/null and b/tortoise/voices/train_atkins/1.wav differ diff --git a/tortoise/voices/train_atkins/2.wav b/tortoise/voices/train_atkins/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..096b0b277908ff857c33e3de43d98339306e1651 Binary files /dev/null and b/tortoise/voices/train_atkins/2.wav differ diff --git a/tortoise/voices/train_dotrice/1.wav b/tortoise/voices/train_dotrice/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..7babde7c9827dbdeb3bdbd30b0e7f62e98fe0f33 Binary files /dev/null and b/tortoise/voices/train_dotrice/1.wav differ diff --git a/tortoise/voices/train_dotrice/2.wav b/tortoise/voices/train_dotrice/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..8f41a826e2de82263e22e49ef67ec6f05d79a74f Binary files /dev/null and b/tortoise/voices/train_dotrice/2.wav differ diff --git a/tortoise/voices/train_grace/1.wav b/tortoise/voices/train_grace/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..b2a243cea416cad34ea374b3964ecb0f8774211e Binary files /dev/null and b/tortoise/voices/train_grace/1.wav differ diff --git a/tortoise/voices/train_grace/2.wav 
b/tortoise/voices/train_grace/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..41ca66e8b6ef9d65235c5a6bf42bdff8afeee42d Binary files /dev/null and b/tortoise/voices/train_grace/2.wav differ diff --git a/tortoise/voices/train_kennard/1.wav b/tortoise/voices/train_kennard/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..d98ca272441703d70a195f2c098a78a4ff6f100e Binary files /dev/null and b/tortoise/voices/train_kennard/1.wav differ diff --git a/tortoise/voices/train_kennard/2.wav b/tortoise/voices/train_kennard/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..9548fb907cadf579d96393b8e2d99c8d378eeaa4 Binary files /dev/null and b/tortoise/voices/train_kennard/2.wav differ diff --git a/tortoise/voices/train_lescault/1.wav b/tortoise/voices/train_lescault/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..f64a714fa1b719e1ceeef45bfe80255d83a3db07 Binary files /dev/null and b/tortoise/voices/train_lescault/1.wav differ diff --git a/tortoise/voices/train_lescault/2.wav b/tortoise/voices/train_lescault/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..cb42f94f1f1009a113fc1278163357ef028423ef Binary files /dev/null and b/tortoise/voices/train_lescault/2.wav differ diff --git a/tortoise/voices/weaver/1.wav b/tortoise/voices/weaver/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..7283087f36455bf350198051b1ecbfce5132df39 Binary files /dev/null and b/tortoise/voices/weaver/1.wav differ diff --git a/tortoise/voices/weaver/2.wav b/tortoise/voices/weaver/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..de7206e1e8c691995a357d25854547f4bd26d367 Binary files /dev/null and b/tortoise/voices/weaver/2.wav differ diff --git a/tortoise/voices/weaver/3.wav b/tortoise/voices/weaver/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..6b4b4feb435dda18d8c8b8d27622c32c70819d3a Binary files /dev/null and b/tortoise/voices/weaver/3.wav differ diff --git a/tortoise/voices/william/1.wav b/tortoise/voices/william/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..15ef32bc8f2f26ccfba93bb8ae69a408192171ac Binary files /dev/null and b/tortoise/voices/william/1.wav differ diff --git a/tortoise/voices/william/2.wav b/tortoise/voices/william/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..f72eb62174838d2fd690b90122646da6d284603b Binary files /dev/null and b/tortoise/voices/william/2.wav differ diff --git a/tortoise/voices/william/3.wav b/tortoise/voices/william/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..d9b4002061110042e0713d8386e97334226d3de4 Binary files /dev/null and b/tortoise/voices/william/3.wav differ diff --git a/tortoise/voices/william/4.wav b/tortoise/voices/william/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..e03c1812224bbbd9cd54e260e25f953e92448a44 Binary files /dev/null and b/tortoise/voices/william/4.wav differ diff --git a/tortoise_tts.ipynb b/tortoise_tts.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b0230e3967130986dc6cce034649258ffb0b27d8 --- /dev/null +++ b/tortoise_tts.ipynb @@ -0,0 +1,185 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "tortoise-tts.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + 
"accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Welcome to Tortoise! 🐢🐢🐢🐢\n", + "\n", + "Before you begin, I **strongly** recommend you turn on a GPU runtime.\n", + "\n", + "There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU." + ], + "metadata": { + "id": "_pIZ3ZXNp7cf" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JrK20I32grP6" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/neonbjb/tortoise-tts.git\n", + "%cd tortoise-tts\n", + "!pip3 install -r requirements.txt\n", + "!python3 setup.py install" + ] + }, + { + "cell_type": "code", + "source": [ + "# Imports used through the rest of the notebook.\n", + "import torch\n", + "import torchaudio\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import IPython\n", + "\n", + "from tortoise.api import TextToSpeech\n", + "from tortoise.utils.audio import load_audio, load_voice, load_voices\n", + "\n", + "# This will download all the models used by Tortoise from the HF hub.\n", + "tts = TextToSpeech()" + ], + "metadata": { + "id": "Gen09NM4hONQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# This is the text that will be spoken.\n", + "text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\"\n", + "\n", + "# Here's something for the poetically inclined.. (set text=)\n", + "\"\"\"\n", + "Then took the other, as just as fair,\n", + "And having perhaps the better claim,\n", + "Because it was grassy and wanted wear;\n", + "Though as for that the passing there\n", + "Had worn them really about the same,\"\"\"\n", + "\n", + "# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n", + "preset = \"fast\"" + ], + "metadata": { + "id": "bt_aoxONjfL2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Tortoise will attempt to mimic voices you provide. It comes pre-packaged\n", + "# with some voices you might recognize.\n", + "\n", + "# Let's list all the voices available. These are just some random clips I've gathered\n", + "# from the internet as well as a few voices from the training dataset.\n", + "# Feel free to add your own clips to the voices/ folder.\n", + "%ls tortoise/voices\n", + "\n", + "IPython.display.Audio('tortoise/voices/tom/1.wav')" + ], + "metadata": { + "id": "SSleVnRAiEE2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Pick one of the voices from the output above\n", + "voice = 'tom'\n", + "\n", + "# Load it and send it through Tortoise.\n", + "voice_samples, conditioning_latents = load_voice(voice)\n", + "gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n", + " preset=preset)\n", + "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n", + "IPython.display.Audio('generated.wav')" + ], + "metadata": { + "id": "KEXOKjIvn6NW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Tortoise can also generate speech using a random voice. 
The voice changes each time you execute this!\n", + "# (Note: random voices can be prone to strange utterances)\n", + "gen = tts.tts_with_preset(text, voice_samples=None, conditioning_latents=None, preset=preset)\n", + "torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n", + "IPython.display.Audio('generated.wav')" + ], + "metadata": { + "id": "16Xs2SSC3BXa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# You can also combine conditioning voices. Combining voices produces a new voice\n", + "# with traits from all the parents.\n", + "#\n", + "# Lets see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:\n", + "voice_samples, conditioning_latents = load_voices(['pat', 'william'])\n", + "\n", + "gen = tts.tts_with_preset(\"They used to say that if man was meant to fly, he’d have wings. But he did fly. He discovered he had to.\", \n", + " voice_samples=None, conditioning_latents=None, preset=preset)\n", + "torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)\n", + "IPython.display.Audio('captain_kirkard.wav')" + ], + "metadata": { + "id": "fYTk8KUezUr5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "del tts # Will break other cells, but necessary to conserve RAM if you want to run this cell.\n", + "\n", + "# Tortoise comes with some scripts that does a lot of the lifting for you. For example,\n", + "# read.py will read a text file for you.\n", + "!python3 tortoise/read.py --voice=train_atkins --textfile=tortoise/data/riding_hood.txt --preset=ultra_fast --output_path=.\n", + "\n", + "IPython.display.Audio('train_atkins/combined.wav')\n", + "# This will take awhile.." + ], + "metadata": { + "id": "t66yqWgu68KL" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/tortoise_v2_examples.html b/tortoise_v2_examples.html new file mode 100644 index 0000000000000000000000000000000000000000..088c349ae605f9ada48986da8f2de3f9e2e6e741 --- /dev/null +++ b/tortoise_v2_examples.html @@ -0,0 +1,62 @@ +These words were never spoken.

Handpicked results
[audio clips]

Handpicked longform result:
[audio clip]

Compared to Tacotron2 (with the LJSpeech voice):
Tacotron2+Waveglow | TorToiSe
[paired audio clips, one column per system]

Various spoken texts for all voices:
Columns: text, angie, daniel, deniro, emma, freeman, geralt, halle, jlaw, lj, myself, pat, snakes, tom, train_atkins, train_dotrice, train_kennard, weaver, william
Rows: reference clip, autoregressive_ml, bengio_it_needs_to_know_what_is_bad, dickinson_stop_for_death, espn_basketball, frost_oar_to_oar, frost_road_not_taken, gatsby_and_so_we_beat_on, harrypotter_differences_of_habit_and_language, i_am_a_language_model, melodie_kao, nyt_covid, real_courage_is_when_you_know_your_licked, rolling_stone_review, spacecraft_interview, tacotron2_sample1, tacotron2_sample2, tacotron2_sample3, tacotron2_sample4, watts_this_is_the_real_secret_of_life, wilde_nowadays_people_know_the_price
[one generated clip per voice/text cell]

Longform result for all voices:
[audio clips]
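Clips like the ones indexed above can be produced with the same public API that the notebook earlier in this diff walks through. As a point of reference, here is a minimal sketch for generating one such clip; the voice, preset, and output filename are illustrative choices, not the exact script used to build this page.

import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()  # downloads the pretrained models on first use

# Any folder under tortoise/voices works here; 'freeman' is just an example.
voice_samples, conditioning_latents = load_voice('freeman')
gen = tts.tts_with_preset(
    "These words were never spoken.",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset='standard',  # one of: ultra_fast, fast, standard, high_quality
)
torchaudio.save('freeman_demo.wav', gen.squeeze(0).cpu(), 24000)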
+ \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/audio.py b/utils/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..cb86566d9fb777343a1b854dabdf8709fba33dc7 --- /dev/null +++ b/utils/audio.py @@ -0,0 +1,143 @@ +import os +from glob import glob + +import torch +import torchaudio +import numpy as np +from scipy.io.wavfile import read + +from utils.stft import STFT + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + if data.dtype == np.int32: + norm_fix = 2 ** 31 + elif data.dtype == np.int16: + norm_fix = 2 ** 15 + elif data.dtype == np.float16 or data.dtype == np.float32: + norm_fix = 1. + else: + raise NotImplemented(f"Provided data dtype not supported: {data.dtype}") + return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate) + + +def load_audio(audiopath, sampling_rate): + if audiopath[-4:] == '.wav': + audio, lsr = load_wav_to_torch(audiopath) + elif audiopath[-4:] == '.mp3': + # https://github.com/neonbjb/pyfastmp3decoder - Definitely worth it. + from pyfastmp3decoder.mp3decoder import load_mp3 + audio, lsr = load_mp3(audiopath, sampling_rate) + audio = torch.FloatTensor(audio) + + # Remove any channel data. + if len(audio.shape) > 1: + if audio.shape[0] < 5: + audio = audio[0] + else: + assert audio.shape[1] < 5 + audio = audio[:, 0] + + if lsr != sampling_rate: + audio = torchaudio.functional.resample(audio, lsr, sampling_rate) + + # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk. + # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds. + if torch.any(audio > 2) or not torch.any(audio < 0): + print(f"Error with {audiopath}. 
Max={audio.max()} min={audio.min()}") + audio.clip_(-1, 1) + + return audio.unsqueeze(0) + + +TACOTRON_MEL_MAX = 2.3143386840820312 +TACOTRON_MEL_MIN = -11.512925148010254 + + +def denormalize_tacotron_mel(norm_mel): + return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN + + +def normalize_tacotron_mel(mel): + return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def get_voices(): + subs = os.listdir('voices') + voices = {} + for sub in subs: + subj = os.path.join('voices', sub) + if os.path.isdir(subj): + voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + return voices + + +class TacotronSTFT(torch.nn.Module): + def __init__(self, filter_length=1024, hop_length=256, win_length=1024, + n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, + mel_fmax=8000.0): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.stft_fn = STFT(filter_length, hop_length, win_length) + from librosa.filters import mel as librosa_mel_fn + mel_basis = librosa_mel_fn( + sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer('mel_basis', mel_basis) + + def spectral_normalize(self, magnitudes): + output = dynamic_range_compression(magnitudes) + return output + + def spectral_de_normalize(self, magnitudes): + output = dynamic_range_decompression(magnitudes) + return output + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert(torch.min(y.data) >= -10) + assert(torch.max(y.data) <= 10) + y = torch.clip(y, min=-1, max=1) + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output + + +def wav_to_univnet_mel(wav, do_normalization=False): + stft = TacotronSTFT(1024, 256, 1024, 100, 24000, 0, 12000) + stft = stft.cuda() + mel = stft.mel_spectrogram(wav) + if do_normalization: + mel = normalize_tacotron_mel(mel) + return mel \ No newline at end of file diff --git a/utils/diffusion.py b/utils/diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..e877ff22de75c407f067ff2a6280e912eebf7a84 --- /dev/null +++ b/utils/diffusion.py @@ -0,0 +1,1250 @@ +""" +This is an almost carbon copy of gaussian_diffusion.py from OpenAI's ImprovedDiffusion repo, which itself: + +This code started out as a PyTorch port of Ho et al's diffusion models: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py + +Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules. +""" + +import enum +import math + +import numpy as np +import torch +import torch as th +from tqdm import tqdm + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. 
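    For diagonal Gaussians this reduces to the elementwise expression
    0.5 * (logvar2 - logvar1 + exp(logvar1 - logvar2) + (mean1 - mean2)**2 * exp(-logvar2) - 1),
    returned in nats.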
+ + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, th.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for th.exp(). + logvar1, logvar2 = [ + x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + th.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * th.exp(-logvar2) + ) + + +def approx_standard_normal_cdf(x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. + """ + return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x, *, means, log_scales): + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. + + :param x: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + :param means: the Gaussian mean Tensor. + :param log_scales: the Gaussian log stddev Tensor. + :return: a tensor like x of log probabilities (in nats). + """ + assert x.shape == means.shape == log_scales.shape + centered_x = x - means + inv_stdv = th.exp(-log_scales) + plus_in = inv_stdv * (centered_x + 1.0 / 255.0) + cdf_plus = approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - 1.0 / 255.0) + cdf_min = approx_standard_normal_cdf(min_in) + log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = th.where( + x < -0.999, + log_cdf_plus, + th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), + ) + assert log_probs.shape == x.shape + return log_probs + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. + scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace( + beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64 + ) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. 
+ :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. + """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +class ModelMeanType(enum.Enum): + """ + Which type of output the model predicts. + """ + + PREVIOUS_X = 'previous_x' # the model predicts x_{t-1} + START_X = 'start_x' # the model predicts x_0 + EPSILON = 'epsilon' # the model predicts epsilon + + +class ModelVarType(enum.Enum): + """ + What is used as the model's output variance. + + The LEARNED_RANGE option has been added to allow the model to predict + values between FIXED_SMALL and FIXED_LARGE, making its job easier. + """ + + LEARNED = 'learned' + FIXED_SMALL = 'fixed_small' + FIXED_LARGE = 'fixed_large' + LEARNED_RANGE = 'learned_range' + + +class LossType(enum.Enum): + MSE = 'mse' # use raw MSE loss (and KL when learning variances) + RESCALED_MSE = 'rescaled_mse' # use raw MSE loss (with RESCALED_KL when learning variances) + KL = 'kl' # use the variational lower-bound + RESCALED_KL = 'rescaled_kl' # like KL, but rescale to estimate the full VLB + + def is_vb(self): + return self == LossType.KL or self == LossType.RESCALED_KL + + +class GaussianDiffusion: + """ + Utilities for training and sampling diffusion models. + + Ported directly from here, and then adapted over time to further experimentation. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 + + :param betas: a 1-D numpy array of betas for each diffusion timestep, + starting at T and going to 1. + :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + + def __init__( + self, + *, + betas, + model_mean_type, + model_var_type, + loss_type, + rescale_timesteps=False, + conditioning_free=False, + conditioning_free_k=1, + ramp_conditioning_free=True, + ): + self.model_mean_type = ModelMeanType(model_mean_type) + self.model_var_type = ModelVarType(model_var_type) + self.loss_type = LossType(loss_type) + self.rescale_timesteps = rescale_timesteps + self.conditioning_free = conditioning_free + self.conditioning_free_k = conditioning_free_k + self.ramp_conditioning_free = ramp_conditioning_free + + # Use float64 for accuracy. 
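        # Everything cached below is a deterministic function of the beta schedule:
        # the cumulative alpha products used by q(x_t | x_0) and the coefficients of
        # the Gaussian posterior q(x_{t-1} | x_t, x_0).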
+ betas = np.array(betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = ( + betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log( + np.append(self.posterior_variance[1], self.posterior_variance[1:]) + ) + self.posterior_mean_coef1 = ( + betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + self.posterior_mean_coef2 = ( + (1.0 - self.alphas_cumprod_prev) + * np.sqrt(alphas) + / (1.0 - self.alphas_cumprod) + ) + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + ) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = _extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. 
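        Concretely, x_t = sqrt(alpha_bar_t) * x_start + sqrt(1 - alpha_bar_t) * noise.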
+ """ + if noise is None: + noise = th.randn_like(x_start) + assert noise.shape == x_start.shape + return ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) + * noise + ) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = ( + _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + assert ( + posterior_mean.shape[0] + == posterior_variance.shape[0] + == posterior_log_variance_clipped.shape[0] + == x_start.shape[0] + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance( + self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None + ): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. + """ + if model_kwargs is None: + model_kwargs = {} + + B, C = x.shape[:2] + assert t.shape == (B,) + model_output = model(x, self._scale_timesteps(t), **model_kwargs) + if self.conditioning_free: + model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs) + + if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: + assert model_output.shape == (B, C * 2, *x.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + if self.conditioning_free: + model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1) + if self.model_var_type == ModelVarType.LEARNED: + model_log_variance = model_var_values + model_variance = th.exp(model_log_variance) + else: + min_log = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x.shape + ) + max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) + # The model_var_values is [-1, 1] for [min_var, max_var]. + frac = (model_var_values + 1) / 2 + model_log_variance = frac * max_log + (1 - frac) * min_log + model_variance = th.exp(model_log_variance) + else: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. 
+ ModelVarType.FIXED_LARGE: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log(np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.FIXED_SMALL: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) + + if self.conditioning_free: + if self.ramp_conditioning_free: + assert t.shape[0] == 1 # This should only be used in inference. + cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps) + else: + cfk = self.conditioning_free_k + model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning + + def process_xstart(x): + if denoised_fn is not None: + x = denoised_fn(x) + if clip_denoised: + return x.clamp(-1, 1) + return x + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + pred_xstart = process_xstart( + self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output) + ) + model_mean = model_output + elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: + if self.model_mean_type == ModelMeanType.START_X: + pred_xstart = process_xstart(model_output) + else: + pred_xstart = process_xstart( + self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) + ) + model_mean, _, _ = self.q_posterior_mean_variance( + x_start=pred_xstart, x_t=x, t=t + ) + else: + raise NotImplementedError(self.model_mean_type) + + assert ( + model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape + ) + return { + "mean": model_mean, + "variance": model_variance, + "log_variance": model_log_variance, + "pred_xstart": pred_xstart, + } + + def _predict_xstart_from_eps(self, x_t, t, eps): + assert x_t.shape == eps.shape + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps + ) + + def _predict_xstart_from_xprev(self, x_t, t, xprev): + assert x_t.shape == xprev.shape + return ( # (xprev - coef2*x_t) / coef1 + _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev + - _extract_into_tensor( + self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape + ) + * x_t + ) + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - pred_xstart + ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + return t.float() * (1000.0 / self.num_timesteps) + return t + + def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). + """ + gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) + new_mean = ( + p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() + ) + return new_mean + + def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. 
+ + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, self._scale_timesteps(t), **model_kwargs + ) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t + ) + return out + + def p_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. 
+ """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. + + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + + for i in tqdm(indices, disable=not progress): + t = th.tensor([i] * shape[0], device=device) + with th.no_grad(): + out = self.p_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = ( + eta + * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) + * th.sqrt(1 - alpha_bar / alpha_bar_prev) + ) + # Equation 12. + noise = th.randn_like(x) + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_prev) + + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps + ) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def ddim_reverse_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x + - out["pred_xstart"] + ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) + + # Equation 12. 
reversed + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_next) + + th.sqrt(1 - alpha_bar_next) * eps + ) + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices, disable=not progress) + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + with th.no_grad(): + out = self.ddim_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + ) + yield out + img = out["sample"] + + def _vb_terms_bpd( + self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None + ): + """ + Get a term for the variational lower-bound. + + The resulting units are bits (rather than nats, as one might expect). + This allows for comparison to other papers. + + :return: a dict with the following keys: + - 'output': a shape [N] tensor of NLLs or KLs. + - 'pred_xstart': the x_0 predictions. + """ + true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + ) + out = self.p_mean_variance( + model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs + ) + kl = normal_kl( + true_mean, true_log_variance_clipped, out["mean"], out["log_variance"] + ) + kl = mean_flat(kl) / np.log(2.0) + + decoder_nll = -discretized_gaussian_log_likelihood( + x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] + ) + assert decoder_nll.shape == x_start.shape + decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + + # At the first timestep return the decoder NLL, + # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) + output = th.where((t == 0), decoder_nll, kl) + return {"output": output, "pred_xstart": out["pred_xstart"]} + + def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. 
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise is None:
+            noise = th.randn_like(x_start)
+        x_t = self.q_sample(x_start, t, noise=noise)
+
+        terms = {}
+
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            # TODO: support multiple model outputs for this mode.
+            terms["loss"] = self._vb_terms_bpd(
+                model=model,
+                x_start=x_start,
+                x_t=x_t,
+                t=t,
+                clip_denoised=False,
+                model_kwargs=model_kwargs,
+            )["output"]
+            if self.loss_type == LossType.RESCALED_KL:
+                terms["loss"] *= self.num_timesteps
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs)
+            if isinstance(model_outputs, tuple):
+                model_output = model_outputs[0]
+                terms['extra_outputs'] = model_outputs[1:]
+            else:
+                model_output = model_outputs
+
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                B, C = x_t.shape[:2]
+                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
+                model_output, model_var_values = th.split(model_output, C, dim=1)
+                # Learn the variance using the variational bound, but don't let
+                # it affect our mean prediction.
+                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+                terms["vb"] = self._vb_terms_bpd(
+                    model=lambda *args, r=frozen_out: r,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t,
+                    clip_denoised=False,
+                )["output"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    # Divide by 1000 for equivalence with initial implementation.
+                    # Without a factor of 1/1000, the VB term hurts the MSE term.
+                    terms["vb"] *= self.num_timesteps / 1000.0
+
+            if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+                target = self.q_posterior_mean_variance(
+                    x_start=x_start, x_t=x_t, t=t
+                )[0]
+                x_start_pred = th.zeros_like(x_start)  # Not supported.
+            elif self.model_mean_type == ModelMeanType.START_X:
+                target = x_start
+                x_start_pred = model_output
+            elif self.model_mean_type == ModelMeanType.EPSILON:
+                target = noise
+                x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
+            else:
+                raise NotImplementedError(self.model_mean_type)
+            assert model_output.shape == target.shape == x_start.shape
+            terms["mse"] = mean_flat((target - model_output) ** 2)
+            terms["x_start_predicted"] = x_start_pred
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"]
+        else:
+            raise NotImplementedError(self.loss_type)
+
+        return terms
+
+    def autoregressive_training_losses(self, model, x_start, t, model_output_keys, gd_out_key, model_kwargs=None, noise=None):
+        """
+        Compute training losses for a single timestep.
+
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param t: a batch of timestep indices.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param noise: if specified, the specific Gaussian noise to try to remove.
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise is None:
+            noise = th.randn_like(x_start)
+        x_t = self.q_sample(x_start, t, noise=noise)
+        terms = {}
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            assert False  # not currently supported for this type of diffusion.
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs)
+            terms.update({k: o for k, o in zip(model_output_keys, model_outputs)})
+            model_output = terms[gd_out_key]
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                B, C = x_t.shape[:2]
+                assert model_output.shape == (B, C, 2, *x_t.shape[2:])
+                model_output, model_var_values = model_output[:, :, 0], model_output[:, :, 1]
+                # Learn the variance using the variational bound, but don't let
+                # it affect our mean prediction.
+                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+                terms["vb"] = self._vb_terms_bpd(
+                    model=lambda *args, r=frozen_out: r,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t,
+                    clip_denoised=False,
+                )["output"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    # Divide by 1000 for equivalence with initial implementation.
+                    # Without a factor of 1/1000, the VB term hurts the MSE term.
+                    terms["vb"] *= self.num_timesteps / 1000.0
+
+            if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+                target = self.q_posterior_mean_variance(
+                    x_start=x_start, x_t=x_t, t=t
+                )[0]
+                x_start_pred = th.zeros_like(x_start)  # Not supported.
+            elif self.model_mean_type == ModelMeanType.START_X:
+                target = x_start
+                x_start_pred = model_output
+            elif self.model_mean_type == ModelMeanType.EPSILON:
+                target = noise
+                x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
+            else:
+                raise NotImplementedError(self.model_mean_type)
+            assert model_output.shape == target.shape == x_start.shape
+            terms["mse"] = mean_flat((target - model_output) ** 2)
+            terms["x_start_predicted"] = x_start_pred
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"]
+        else:
+            raise NotImplementedError(self.loss_type)
+
+        return terms
+
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+
+        This term can't be optimized, as it only depends on the encoder.
+
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+
+    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities.
+
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. + - mse: an [N x T] tensor of epsilon MSEs for each timestep. + """ + device = x_start.device + batch_size = x_start.shape[0] + + vb = [] + xstart_mse = [] + mse = [] + for t in list(range(self.num_timesteps))[::-1]: + t_batch = th.tensor([t] * batch_size, device=device) + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) + # Calculate VLB term at the current timestep + with th.no_grad(): + out = self._vb_terms_bpd( + model, + x_start=x_start, + x_t=x_t, + t=t_batch, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + ) + vb.append(out["output"]) + xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) + eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) + mse.append(mean_flat((eps - noise) ** 2)) + + vb = th.stack(vb, dim=1) + xstart_mse = th.stack(xstart_mse, dim=1) + mse = th.stack(mse, dim=1) + + prior_bpd = self._prior_bpd(x_start) + total_bpd = vb.sum(dim=1) + prior_bpd + return { + "total_bpd": total_bpd, + "prior_bpd": prior_bpd, + "vb": vb, + "xstart_mse": xstart_mse, + "mse": mse, + } + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. + scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace( + beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64 + ) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +class SpacedDiffusion(GaussianDiffusion): + """ + A diffusion process which can skip steps in a base diffusion process. + + :param use_timesteps: a collection (sequence or set) of timesteps from the + original diffusion process to retain. + :param kwargs: the kwargs to create the base diffusion process. 
+ """ + + def __init__(self, use_timesteps, **kwargs): + self.use_timesteps = set(use_timesteps) + self.timestep_map = [] + self.original_num_steps = len(kwargs["betas"]) + + base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa + last_alpha_cumprod = 1.0 + new_betas = [] + for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): + if i in self.use_timesteps: + new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) + last_alpha_cumprod = alpha_cumprod + self.timestep_map.append(i) + kwargs["betas"] = np.array(new_betas) + super().__init__(**kwargs) + + def p_mean_variance( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) + + def training_losses( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().training_losses(self._wrap_model(model), *args, **kwargs) + + def autoregressive_training_losses( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs) + + def condition_mean(self, cond_fn, *args, **kwargs): + return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) + + def _wrap_model(self, model, autoregressive=False): + if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel): + return model + mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel + return mod( + model, self.timestep_map, self.rescale_timesteps, self.original_num_steps + ) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. + return t + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. + :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. 
+ """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim") :]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError( + f"cannot create exactly {num_timesteps} steps with an integer stride" + ) + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError( + f"cannot divide section of {size} steps into {section_count}" + ) + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +class _WrappedModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, new_ts, **kwargs) + + +class _WrappedAutoregressiveModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, x0, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, x0, new_ts, **kwargs) + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. + """ + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) \ No newline at end of file diff --git a/utils/stft.py b/utils/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..8de6bfb090c77e0de2c99dd05fde1f8bfd726b51 --- /dev/null +++ b/utils/stft.py @@ -0,0 +1,193 @@ +""" +BSD 3-Clause License + +Copyright (c) 2017, Prem Seetharaman +All rights reserved. + +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +import librosa.util as librosa_util + + +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. 
+ + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm)**2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + + +class STFT(torch.nn.Module): + """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" + def __init__(self, filter_length=800, hop_length=200, win_length=800, + window='hann'): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])]) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :]) + + if window is not None: + assert(filter_length >= win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float()) + self.register_buffer('inverse_basis', inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + Variable(self.inverse_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + 
torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction \ No newline at end of file diff --git a/utils/tokenizer.py b/utils/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..ed7e4cdf079f59c75b38e5b6cfa77c652d23618b --- /dev/null +++ b/utils/tokenizer.py @@ -0,0 +1,187 @@ +import re + +import inflect +import torch +from tokenizers import Tokenizer + + +# Regular expression matching whitespace: +from unidecode import unidecode + +_whitespace_re = re.compile(r'\s+') + + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + text = 
re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + text = text.replace('"', '') + return text + +def lev_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + +class VoiceBpeTokenizer: + def __init__(self, vocab_file='data/tokenizer.json'): + if vocab_file is not None: + self.tokenizer = Tokenizer.from_file(vocab_file) + + def preprocess_text(self, txt): + txt = english_cleaners(txt) + return txt + + def encode(self, txt): + txt = self.preprocess_text(txt) + txt = txt.replace(' ', '[SPACE]') + return self.tokenizer.encode(txt).ids + + def decode(self, seq): + if isinstance(seq, torch.Tensor): + seq = seq.cpu().numpy() + txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '') + txt = txt.replace('[SPACE]', ' ') + txt = txt.replace('[STOP]', '') + txt = txt.replace('[UNK]', '') + return txt \ No newline at end of file diff --git a/utils/typical_sampling.py b/utils/typical_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..ff6bf487947e88a55fa45f2ffec1b9540df1d4fd --- /dev/null +++ b/utils/typical_sampling.py @@ -0,0 +1,33 @@ +import torch +from transformers import LogitsWarper + + +class TypicalLogitsWarper(LogitsWarper): + def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + self.filter_value = filter_value + self.mass = mass + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + # calculate entropy + normalized = torch.nn.functional.log_softmax(scores, dim=-1) + p = torch.exp(normalized) + ent = -(normalized * p).nansum(-1, keepdim=True) + + # shift and sort + shifted_scores = torch.abs((-normalized) - ent) + sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False) + sorted_logits = scores.gather(-1, sorted_indices) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Remove tokens with cumulative mass above the threshold + last_ind = (cumulative_probs < 
self.mass).sum(dim=1)
+        last_ind[last_ind < 0] = 0
+        sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
+        if self.min_tokens_to_keep > 1:
+            # Never filter out the min_tokens_to_keep most "typical" (lowest shifted-score) tokens.
+            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
\ No newline at end of file
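
The diffusion hunk above is easiest to read end to end, so here is a minimal usage sketch (not part of the patch) that wires get_named_beta_schedule, space_timesteps, and SpacedDiffusion into a short DDIM sampling run. The constructor keywords are assumed from the attributes this hunk references (model_mean_type, model_var_type, loss_type); DummyDenoiser, the 4000-step schedule, and the (1, 100, 1024) shape are illustrative placeholders, not the project's actual models or dimensions.

# Illustrative sketch only; assumes the names below are importable from the diffusion module in this patch.
import torch as th

class DummyDenoiser(th.nn.Module):
    """Stand-in denoiser: with ModelVarType.LEARNED_RANGE the model must emit 2*C channels."""
    def __init__(self, channels=100):
        super().__init__()
        self.proj = th.nn.Conv1d(channels, channels * 2, kernel_size=1)

    def forward(self, x, timesteps, **kwargs):
        return self.proj(x)

train_steps = 4000  # assumed length of the training schedule
sampler = SpacedDiffusion(
    use_timesteps=space_timesteps(train_steps, "ddim50"),   # 50-step DDIM striding
    betas=get_named_beta_schedule("linear", train_steps),
    model_mean_type=ModelMeanType.EPSILON,                  # model predicts epsilon
    model_var_type=ModelVarType.LEARNED_RANGE,              # plus an interpolated variance channel
    loss_type=LossType.MSE,
)

model = DummyDenoiser(channels=100)
with th.no_grad():
    sample = sampler.ddim_sample_loop(
        model,
        shape=(1, 100, 1024),   # (batch, channels, length) - illustrative only
        progress=True,
    )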
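
Similarly, a small sketch of the text front-end and the typical-sampling warper added in utils/tokenizer.py and utils/typical_sampling.py. The vocab path matches the class's default argument but must exist on disk, and applying the warper directly to random logits is only meant to show its input/output contract, not how the repository invokes it during generation.

# Illustrative sketch only; assumes data/tokenizer.json is present.
import torch

tok = VoiceBpeTokenizer(vocab_file='data/tokenizer.json')
ids = tok.encode("Dr. Smith paid $2.50 on the 3rd.")  # cleaned, numbers/abbreviations expanded, then BPE-encoded
print(tok.decode(ids))                                # round-trips through the [SPACE] tokens

warper = TypicalLogitsWarper(mass=0.9)
logits = torch.randn(1, 256)                          # (batch, vocab) scores
dummy_ids = torch.zeros(1, 1, dtype=torch.long)       # input_ids are unused by this warper
filtered = warper(dummy_ids, logits)                  # atypical tokens are set to -inf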