Thefrudi78 committed on
Commit
b0be382
1 Parent(s): 68b29a9

Upload 552 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .editorconfig +11 -0
  2. .gitignore +140 -0
  3. LICENSE +24 -0
  4. README.md +548 -10
  5. api_key.txt +1 -0
  6. constants.py +49 -0
  7. data/models/coqui/.placeholder +2 -0
  8. data/models/rvc/.placeholder +3 -0
  9. data/tmp/.placeholder +2 -0
  10. docker/Dockerfile +35 -0
  11. docker/docker-compose.yml +23 -0
  12. docker/readme.md +10 -0
  13. modules/classify/classify_module.py +41 -0
  14. modules/speech_recognition/streaming_module.py +121 -0
  15. modules/speech_recognition/vosk_module.py +77 -0
  16. modules/speech_recognition/whisper_module.py +56 -0
  17. modules/text_to_speech/coqui/coqui_module.py +333 -0
  18. modules/utils.py +15 -0
  19. modules/voice_conversion/fairseq/LICENSE +21 -0
  20. modules/voice_conversion/fairseq/__init__.py +45 -0
  21. modules/voice_conversion/fairseq/binarizer.py +381 -0
  22. modules/voice_conversion/fairseq/checkpoint_utils.py +905 -0
  23. modules/voice_conversion/fairseq/data/__init__.py +130 -0
  24. modules/voice_conversion/fairseq/data/add_target_dataset.py +83 -0
  25. modules/voice_conversion/fairseq/data/append_token_dataset.py +41 -0
  26. modules/voice_conversion/fairseq/data/audio/__init__.py +93 -0
  27. modules/voice_conversion/fairseq/data/audio/audio_utils.py +389 -0
  28. modules/voice_conversion/fairseq/data/audio/data_cfg.py +387 -0
  29. modules/voice_conversion/fairseq/data/audio/dataset_transforms/__init__.py +53 -0
  30. modules/voice_conversion/fairseq/data/audio/dataset_transforms/concataugment.py +61 -0
  31. modules/voice_conversion/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py +105 -0
  32. modules/voice_conversion/fairseq/data/audio/feature_transforms/__init__.py +43 -0
  33. modules/voice_conversion/fairseq/data/audio/feature_transforms/delta_deltas.py +37 -0
  34. modules/voice_conversion/fairseq/data/audio/feature_transforms/global_cmvn.py +29 -0
  35. modules/voice_conversion/fairseq/data/audio/feature_transforms/specaugment.py +131 -0
  36. modules/voice_conversion/fairseq/data/audio/feature_transforms/utterance_cmvn.py +41 -0
  37. modules/voice_conversion/fairseq/data/audio/frm_text_to_speech_dataset.py +205 -0
  38. modules/voice_conversion/fairseq/data/audio/hubert_dataset.py +356 -0
  39. modules/voice_conversion/fairseq/data/audio/multi_modality_dataset.py +284 -0
  40. modules/voice_conversion/fairseq/data/audio/raw_audio_dataset.py +393 -0
  41. modules/voice_conversion/fairseq/data/audio/speech_to_speech_dataset.py +379 -0
  42. modules/voice_conversion/fairseq/data/audio/speech_to_text_dataset.py +733 -0
  43. modules/voice_conversion/fairseq/data/audio/speech_to_text_joint_dataset.py +359 -0
  44. modules/voice_conversion/fairseq/data/audio/text_to_speech_dataset.py +250 -0
  45. modules/voice_conversion/fairseq/data/audio/waveform_transforms/__init__.py +48 -0
  46. modules/voice_conversion/fairseq/data/audio/waveform_transforms/noiseaugment.py +201 -0
  47. modules/voice_conversion/fairseq/data/backtranslation_dataset.py +165 -0
  48. modules/voice_conversion/fairseq/data/base_wrapper_dataset.py +78 -0
  49. modules/voice_conversion/fairseq/data/bucket_pad_length_dataset.py +78 -0
  50. modules/voice_conversion/fairseq/data/codedataset.py +576 -0
.editorconfig ADDED
@@ -0,0 +1,11 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ insert_final_newline = true
6
+ trim_trailing_whitespace = true
7
+
8
+ [*.{py,js,html,css,scss,md}]
9
+ charset = utf-8
10
+ indent_style = space
11
+ indent_size = 4
.gitignore ADDED
@@ -0,0 +1,140 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ data/
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
91
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
92
+ # install all needed dependencies.
93
+ #Pipfile.lock
94
+
95
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96
+ __pypackages__/
97
+
98
+ # Celery stuff
99
+ celerybeat-schedule
100
+ celerybeat.pid
101
+
102
+ # SageMath parsed files
103
+ *.sage.py
104
+
105
+ # Environments
106
+ .env
107
+ .venv
108
+ env/
109
+ venv/
110
+ ENV/
111
+ env.bak/
112
+ venv.bak/
113
+
114
+ # Spyder project settings
115
+ .spyderproject
116
+ .spyproject
117
+
118
+ # Rope project settings
119
+ .ropeproject
120
+
121
+ # mkdocs documentation
122
+ /site
123
+
124
+ # mypy
125
+ .mypy_cache/
126
+ .dmypy.json
127
+ dmypy.json
128
+
129
+ # Pyre type checker
130
+ .pyre/
131
+
132
+ debug.png
133
+ test.wav
134
+ /tts_samples
135
+ model.pt
136
+ .DS_Store
137
+ .chroma
138
+ /.chroma_db
139
+ api_key.txt
140
+ .vscode
LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ This is free and unencumbered software released into the public domain.
2
+
3
+ Anyone is free to copy, modify, publish, use, compile, sell, or
4
+ distribute this software, either in source code form or as a compiled
5
+ binary, for any purpose, commercial or non-commercial, and by any
6
+ means.
7
+
8
+ In jurisdictions that recognize copyright laws, the author or authors
9
+ of this software dedicate any and all copyright interest in the
10
+ software to the public domain. We make this dedication for the benefit
11
+ of the public at large and to the detriment of our heirs and
12
+ successors. We intend this dedication to be an overt act of
13
+ relinquishment in perpetuity of all present and future rights to this
14
+ software under copyright law.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
23
+
24
+ For more information, please refer to <https://unlicense.org>
README.md CHANGED
@@ -1,10 +1,548 @@
1
- ---
2
- title: Extra
3
- emoji: 🔥
4
- colorFrom: green
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # SillyTavern - Extras
2
+
3
+ ## Recent news
4
+
5
+ * July 25 2023 - Extras now requires Python 3.11 to run; some of the new modules will be incompatible with old Python 3.10 installs. To migrate using conda, remove the old environment with `conda remove --name extras --all` and reinstall using the instructions below.
6
+
7
+ ## What is this
8
+ A set of APIs for various SillyTavern extensions.
9
+
10
+ **You need to run the latest version of SillyTavern. Grab it here: [How to install](https://docs.sillytavern.app/installation/windows/), [Git repository](https://github.com/SillyTavern/SillyTavern)**
11
+
12
+ All modules except Stable Diffusion run on the CPU by default, but they can be configured to use CUDA (with the `--cuda` command-line option). When running all modules simultaneously, expect approximately 6 GB of RAM usage. Loading Stable Diffusion adds another couple of GB.
13
+
14
+ Try on Colab (will give you a link to Extras API): <a target="_blank" href="https://colab.research.google.com/github/SillyTavern/SillyTavern/blob/release/colab/GPU.ipynb">
15
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
16
+ </a>
17
+
18
+ Colab link:
19
+ https://colab.research.google.com/github/SillyTavern/SillyTavern/blob/release/colab/GPU.ipynb
20
+
21
+ Documentation:
22
+ https://docs.sillytavern.app/
23
+
24
+ ## How to run
25
+ ### :exclamation: **IMPORTANT!**
26
+ The default **requirements.txt** contains only the basic packages for text processing.
27
+
28
+ If you want to use the most advanced features (like Stable Diffusion or TTS), change that to **requirements-complete.txt** in the commands below. See the [Modules](#modules) section for more details.
29
+
30
+ If you run on Apple Silicon (M1/M2), use the **requirements-silicon.txt** file instead.
31
+
32
+ ### Getting an error when installing from requirements-complete.txt?
33
+
34
+ > ERROR: Could not build wheels for hnswlib, which is required to install pyproject.toml-based projects
35
+
36
+ Installing the chromadb package requires one of the following:
37
+
38
+ 1. Have Visual C++ build tools installed: https://visualstudio.microsoft.com/visual-cpp-build-tools/
39
+ 2. Install hnswlib from conda: `conda install -c conda-forge hnswlib`
40
+
41
+ ### Missing modules reported by SillyTavern extensions menu?
42
+
43
+ You must specify a list of module names to run via the `--enable-modules` argument (`caption` is provided as an example). See the [Modules](#modules) section.
44
+
45
+ ### ☁️ Colab
46
+ * Open colab link
47
+ * Select desired "extra" options and start the cell
48
+ * Wait for it to finish
49
+ * Get an API URL link from colab output under the `### SillyTavern Extensions LINK ###` title
50
+ * Start SillyTavern with extensions support: set `enableExtensions` to `true` in config.conf
51
+ * Navigate to the SillyTavern extensions menu, enter the API URL, and tap "Connect" to load the extensions
52
+
53
+ ### What about mobile/Android/Termux? 🤔
54
+
55
+ There are some folks in the community having success running Extras on their phones via Ubuntu on Termux. This project wasn't made with mobile support in mind, so this guide is provided strictly for your information only: https://rentry.org/STAI-Termux#downloading-and-running-tai-extras
56
+
57
+ #### ❗ IMPORTANT!
58
+
59
+ We will NOT provide any support for running this on Android. Direct all your questions to the creator of this guide.
60
+
61
+ #### Talkinghead module on Linux
62
+
63
+ The talkinghead module requires an additional package that is not installed automatically because it is incompatible with Colab. Run this after installing the other requirements:
64
+
65
+ `pip install wxpython==4.2.1`
66
+
67
+ ### 💻 Locally
68
+ #### Option 1 - Conda (recommended) 🐍
69
+
70
+ **PREREQUISITES**
71
+ * Install Miniconda: https://docs.conda.io/en/latest/miniconda.html
72
+ * _(Important!) Read how to use Conda: https://conda.io/projects/conda/en/latest/user-guide/getting-started.html_
73
+ * Install git: https://git-scm.com/downloads
74
+
75
+ **EXECUTE THESE COMMANDS ONE BY ONE IN THE _CONDA COMMAND PROMPT_.**
76
+
77
+ **TYPE/PASTE EACH COMMAND INTO THE PROMPT, HIT ENTER AND WAIT FOR IT TO FINISH!**
78
+
79
+ * Before the first run, create an environment (let's call it `extras`):
80
+ ```
81
+ conda create -n extras
82
+ ```
83
+ * Now activate the newly created env
84
+ ```
85
+ conda activate extras
86
+ ```
87
+ * Install Python 3.11
88
+ ```
89
+ conda install python=3.11
90
+ ```
91
+ * Install the required system packages
92
+ ```
93
+ conda install git
94
+ ```
95
+ * Clone this repository
96
+ ```
97
+ git clone https://github.com/SillyTavern/SillyTavern-extras
98
+ ```
99
+ * Navigate to the freshly cloned repository
100
+ ```
101
+ cd SillyTavern-extras
102
+ ```
103
+ * Install the project requirements
104
+ ```
105
+ pip install -r requirements.txt
106
+ ```
107
+ * Run the Extensions API server
108
+ ```
109
+ python server.py --enable-modules=caption,summarize,classify
110
+ ```
111
+ * Copy the Extras server API URL listed in the console window once it finishes loading. On local installs, this defaults to `http://localhost:5100`.
112
+ * Open your SillyTavern config.conf file (located in the base install folder), and look for a line "`const enableExtensions`". Make sure that line has "`= true`", and not "`= false`".
113
+ * Start your SillyTavern server
114
+ * Open the Extensions panel (via the 'Stacked Blocks' icon at the top of the page), paste the API URL into the input box, and click "Connect" to connect to the Extras extension server.
115
+ * To run again, simply activate the environment and run these commands. Be sure to add the additional options for server.py (see below) that your setup requires.
116
+ ```
117
+ conda activate extras
118
+ python server.py
119
+ ```
120
+
121
+ #### Option 2 - Vanilla 🍦
122
+ * Install Python 3.11: https://www.python.org/downloads/release/python-3114/
123
+ * Install git: https://git-scm.com/downloads
124
+ * Clone the repo:
125
+ ```
126
+ git clone https://github.com/SillyTavern/SillyTavern-extras
127
+ cd SillyTavern-extras
128
+ ```
129
+ * Run `python -m pip install -r requirements.txt`
130
+ * Run `python server.py --enable-modules=caption,summarize,classify`
131
+ * Get the API URL. Defaults to `http://localhost:5100` if you run locally.
132
+ * Start SillyTavern with extensions support: set `enableExtensions` to `true` in config.conf
133
+ * Navigate to the SillyTavern extensions menu, enter the API URL, and tap "Connect" to load the extensions
134
+
135
+ ## Modules
136
+
137
+ | Name | Description | Included in default requirements.txt |
138
+ | ----------- | --------------------------------- | ------ |
139
+ | `caption` | Image captioning | ✔️ Yes |
140
+ | `summarize` | Text summarization | ✔️ Yes |
141
+ | `classify` | Text sentiment classification | ✔️ Yes |
142
+ | `sd` | Stable Diffusion image generation | :x: No (✔️ remote) |
143
+ | `silero-tts` | [Silero TTS server](https://github.com/ouoertheo/silero-api-server) | :x: No |
144
+ | `edge-tts` | [Microsoft Edge TTS client](https://github.com/rany2/edge-tts) | ✔️ Yes |
145
+ | `coqui-tts` | [Coqui TTS server](https://github.com/coqui-ai/TTS) | :x: No |
146
+ | `chromadb` | Infinity context server | :x: No |
147
+ | `talkinghead` | Talking Head Sprites | :x: No |
148
+
149
+ ## Additional options
150
+ | Flag | Description |
151
+ | ------------------------ | ---------------------------------------------------------------------- |
152
+ | `--enable-modules` | **Required option**. Provide a list of enabled modules.<br>Expects a comma-separated list of module names. See [Modules](#modules)<br>Example: `--enable-modules=caption,sd` |
153
+ | `--port` | Specify the port on which the application is hosted. Default: **5100** |
154
+ | `--listen` | Host the app on the local network |
155
+ | `--share` | Share the app on CloudFlare tunnel |
156
+ | `--secure` | Adds API key authentication requirements. Highly recommended when paired with share! |
157
+ | `--cpu` | Run the models on the CPU instead of CUDA. Enabled by default. |
158
+ | `--mps` or `--m1` | Run the models on Apple Silicon. Only for M1 and M2 processors. |
159
+ | `--cuda` | Uses CUDA (GPU+VRAM) to run modules if it is available. Otherwise, falls back to using CPU. |
160
+ | `--cuda-device` | Specifies a CUDA device to use. Defaults to `cuda:0` (first available GPU). |
161
+ | `--talkinghead-gpu` | Uses GPU for talkinghead (10x FPS increase in animation). |
162
+ | `--coqui-gpu` | Uses GPU for coqui TTS (if available). |
163
+ | `--coqui-model` | If provided, downloads and preloads a coqui TTS model. Default: none.<br>Example: `tts_models/multilingual/multi-dataset/bark` |
164
+ | `--summarization-model` | Load a custom summarization model.<br>Expects a HuggingFace model ID.<br>Default: [Qiliang/bart-large-cnn-samsum-ChatGPT_v3](https://huggingface.co/Qiliang/bart-large-cnn-samsum-ChatGPT_v3) |
165
+ | `--classification-model` | Load a custom sentiment classification model.<br>Expects a HuggingFace model ID.<br>Default (6 emotions): [nateraw/bert-base-uncased-emotion](https://huggingface.co/nateraw/bert-base-uncased-emotion)<br>Other solid option is (28 emotions): [joeddav/distilbert-base-uncased-go-emotions-student](https://huggingface.co/joeddav/distilbert-base-uncased-go-emotions-student)<br>For Chinese language: [touch20032003/xuyuan-trial-sentiment-bert-chinese](https://huggingface.co/touch20032003/xuyuan-trial-sentiment-bert-chinese) |
166
+ | `--captioning-model` | Load a custom captioning model.<br>Expects a HuggingFace model ID.<br>Default: [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) |
167
+ | `--embedding-model` | Load a custom text embedding model.<br>Expects a HuggingFace model ID.<br>Default: [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) |
168
+ | `--chroma-host` | Specifies a host IP for a remote ChromaDB server. |
169
+ | `--chroma-port` | Specifies an HTTP port for a remote ChromaDB server.<br>Default: `8000` |
170
+ | `--sd-model` | Load a custom Stable Diffusion image generation model.<br>Expects a HuggingFace model ID.<br>Default: [ckpt/anything-v4.5-vae-swapped](https://huggingface.co/ckpt/anything-v4.5-vae-swapped)<br>*Must have VAE pre-baked in PyTorch format or the output will look drab!* |
171
+ | `--sd-cpu` | Force the Stable Diffusion generation pipeline to run on the CPU.<br>**SLOW!** |
172
+ | `--sd-remote` | Use a remote SD backend.<br>**Supported APIs: [sd-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)** |
173
+ | `--sd-remote-host` | Specify the host of the remote SD backend<br>Default: **127.0.0.1** |
174
+ | `--sd-remote-port` | Specify the port of the remote SD backend<br>Default: **7860** |
175
+ | `--sd-remote-ssl` | Use SSL for the remote SD backend<br>Default: **False** |
176
+ | `--sd-remote-auth` | Specify the `username:password` for the remote SD backend (if required) |
177
+
178
+ ## Coqui TTS
179
+
180
+ ### Running on Mac M1
181
+
182
+ #### ImportError: symbol not found
183
+
184
+ If you're getting the following error when running the coqui-tts module on an M1 Mac:
185
+
186
+ ```
187
+ ImportError: dlopen(/Users/user/.../lib/python3.11/site-packages/MeCab/_MeCab.cpython-311-darwin.so, 0x0002): symbol not found in flat namespace '__ZN5MeCab11createModelEPKc'
188
+ ```
189
+
190
+ Do the following:
191
+
192
+ 1. Install homebrew: https://brew.sh/
193
+ 2. Build and install the `mecab` package
194
+
195
+ ```
196
+ brew install --build-from-source mecab
197
+ ARCHFLAGS='-arch arm64' pip install --no-binary :all: --compile --use-pep517 --no-cache-dir --force mecab-python3
198
+ ```
199
+
200
+ ## ChromaDB
201
+ ChromaDB is a blazing-fast, open-source database used for long-term memory when chatting with characters. It can run in-memory or on a local server on your LAN.
202
+
203
+ NOTE: You should **NOT** run ChromaDB on a cloud server. There are no methods for authentication (yet), so unless you want to expose an unauthenticated ChromaDB to the world, run this on a local server in your LAN.
204
+
205
+ ### In-memory setup
206
+
207
+ Run the extras server with the `chromadb` module enabled (recommended).
208
+
209
+ ### Remote setup
210
+
211
+ Use this if you want to use ChromaDB with docker or host it remotely. If you don't know what that means and only want to use ChromaDB with ST on your local device, use the 'in-memory' instructions instead.
212
+
213
+ Prerequisites: Docker, Docker compose (make sure you're running in rootless mode with the systemd service enabled if on Linux).
214
+
215
+ Steps:
216
+
217
+ 1. Run `git clone https://github.com/chroma-core/chroma chromadb` and `cd chromadb`
218
+ 2. Run `docker-compose up -d --build` to build ChromaDB. This may take a long time depending on your system
219
+ 3. Once the build process is finished, ChromaDB should be running in the background. You can check with the command `docker ps`
220
+ 4. On your client machine, specify your local server ip in the `--chroma-host` argument (ex. `--chroma-host=192.168.1.10`)
221
+
222
+
223
+ If you are running ChromaDB on the same machine as SillyTavern, you will have to change the port of one of the services. To do this for ChromaDB:
224
+
225
+ 1. Run `docker ps` to get the container ID and then `docker container stop <container ID>`
226
+ 2. Enter the ChromaDB git repository `cd chromadb`
227
+ 3. Open `docker-compose.yml` and look for the line starting with `uvicorn chromadb.app:app`
228
+ 4. Change the `--port` argument to whatever port you want.
229
+ 5. Look for the `ports` category and change the occurrences of `8000` to whatever port you chose in step 4.
230
+ 6. Save and exit. Then run `docker-compose up --detach`
231
+ 7. On your client machine, make sure to specify the `--chroma-port` argument (ex. `--chroma-port=<your-port-here>`) along with the `--chroma-host` argument.
232
+
233
+ ## API Endpoints
234
+ ### Get active list
235
+ `GET /api/modules`
236
+ #### **Input**
237
+ None
238
+ #### **Output**
239
+ ```
240
+ {"modules":["caption", "classify", "summarize"]}
241
+ ```
242
+
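+ As a quick sanity check, you can query this endpoint from Python. This is a minimal sketch, assuming the server runs locally on the default port and without `--secure` (no API key header is sent):
+
+ ```python
+ import requests
+
+ BASE_URL = "http://localhost:5100"  # assumption: default local install
+
+ # Ask the server which modules were enabled via --enable-modules
+ response = requests.get(f"{BASE_URL}/api/modules")
+ response.raise_for_status()
+ print(response.json()["modules"])  # e.g. ['caption', 'classify', 'summarize']
+ ```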
243
+ ### Image captioning
244
+ `POST /api/caption`
245
+ #### **Input**
246
+ ```
247
+ { "image": "base64 encoded image" }
248
+ ```
249
+ #### **Output**
250
+ ```
251
+ { "caption": "caption of the posted image" }
252
+ ```
253
+
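+ A minimal Python sketch of calling this endpoint, assuming a local server with the `caption` module enabled and a hypothetical `photo.png` in the working directory:
+
+ ```python
+ import base64
+ import requests
+
+ # Encode the image file as base64 text, as the endpoint expects
+ with open("photo.png", "rb") as f:
+     image_b64 = base64.b64encode(f.read()).decode("utf-8")
+
+ response = requests.post(
+     "http://localhost:5100/api/caption",
+     json={"image": image_b64},
+ )
+ print(response.json()["caption"])
+ ```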
254
+ ### Text summarization
255
+ `POST /api/summarize`
256
+ #### **Input**
257
+ ```
258
+ { "text": "text to be summarize", "params": {} }
259
+ ```
260
+ #### **Output**
261
+ ```
262
+ { "summary": "summarized text" }
263
+ ```
264
+ #### Optional: `params` object for control over summarization:
265
+ | Name | Default value |
266
+ | --------------------- | ------------------------------------------------------------- |
267
+ | `temperature` | 1.0 |
268
+ | `repetition_penalty` | 1.0 |
269
+ | `max_length` | 500 |
270
+ | `min_length` | 200 |
271
+ | `length_penalty` | 1.5 |
272
+ | `bad_words` | ["\n", '"', "*", "[", "]", "{", "}", ":", "(", ")", "<", ">"] |
273
+
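+ For illustration, a hedged Python sketch that posts a long text with a couple of overridden `params` (local server, `summarize` module enabled):
+
+ ```python
+ import requests
+
+ payload = {
+     "text": "A very long chat log or article to compress...",
+     # Any omitted parameter falls back to the defaults listed above
+     "params": {"max_length": 300, "min_length": 100},
+ }
+ response = requests.post("http://localhost:5100/api/summarize", json=payload)
+ print(response.json()["summary"])
+ ```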
274
+ ### Text sentiment classification
275
+ `POST /api/classify`
276
+ #### **Input**
277
+ ```
278
+ { "text": "text to classify sentiment of" }
279
+ ```
280
+ #### **Output**
281
+ ```
282
+ {
283
+ "classification": [
284
+ {
285
+ "label": "joy",
286
+ "score": 1.0
287
+ },
288
+ {
289
+ "label": "anger",
290
+ "score": 0.7
291
+ },
292
+ {
293
+ "label": "love",
294
+ "score": 0.6
295
+ },
296
+ {
297
+ "label": "sadness",
298
+ "score": 0.5
299
+ },
300
+ {
301
+ "label": "fear",
302
+ "score": 0.4
303
+ },
304
+ {
305
+ "label": "surprise",
306
+ "score": 0.3
307
+ }
308
+ ]
309
+ }
310
+ ```
311
+ > **NOTES**
312
+ > 1. Sorted by descending score order
313
+ > 2. The list of categories is defined by the classification model
314
+ > 3. Values range from 0.0 to 1.0
315
+
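+ Since the list is sorted by score, picking the first entry gives the dominant emotion. A minimal sketch, assuming a local server with the `classify` module enabled:
+
+ ```python
+ import requests
+
+ response = requests.post(
+     "http://localhost:5100/api/classify",
+     json={"text": "I can't believe we finally won the match!"},
+ )
+ classification = response.json()["classification"]
+ top = classification[0]  # highest-scoring label comes first
+ print(f"{top['label']} ({top['score']:.2f})")
+ ```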
316
+ ### Stable Diffusion image generation
317
+ `POST /api/image`
318
+ #### **Input**
319
+ ```
320
+ { "prompt": "prompt to be generated", "sampler": "DDIM", "steps": 20, "scale": 6, "model": "model_name" }
321
+ ```
322
+ #### **Output**
323
+ ```
324
+ { "image": "base64 encoded image" }
325
+ ```
326
+ > **NOTES**
327
+ > 1. Only the "prompt" parameter is required
328
+ > 2. Both "sampler" and "model" parameters only work when using a remote SD backend
329
+
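+ A hedged Python sketch that requests an image and writes the decoded base64 payload to disk (local server with the `sd` module enabled or a remote SD backend configured; the output filename is arbitrary):
+
+ ```python
+ import base64
+ import requests
+
+ response = requests.post(
+     "http://localhost:5100/api/image",
+     json={"prompt": "a cozy cabin in a snowy forest"},  # only "prompt" is required
+ )
+ image_b64 = response.json()["image"]
+
+ # Decode the base64 string back into binary image data
+ with open("generated.png", "wb") as f:
+     f.write(base64.b64decode(image_b64))
+ ```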
330
+ ### Get available Stable Diffusion models
331
+ `GET /api/image/models`
332
+ #### **Output**
333
+ ```
334
+ { "models": [list of all available model names] }
335
+ ```
336
+
337
+ ### Get available Stable Diffusion samplers
338
+ `GET /api/image/samplers`
339
+ #### **Output**
340
+ ```
341
+ { "samplers": [list of all available sampler names] }
342
+ ```
343
+
344
+ ### Get currently loaded Stable Diffusion model
345
+ `GET /api/image/model`
346
+ #### **Output**
347
+ ```
348
+ { "model": "name of the current loaded model" }
349
+ ```
350
+
351
+ ### Load a Stable Diffusion model (remote)
352
+ `POST /api/image/model`
353
+ #### **Input**
354
+ ```
355
+ { "model": "name of the model to load" }
356
+ ```
357
+ #### **Output**
358
+ ```
359
+ { "previous_model": "name of the previous model", "current_model": "name of the newly loaded model" }
360
+ ```
361
+
362
+ ### Generate Silero TTS voice
363
+ `POST /api/tts/generate`
364
+ #### **Input**
365
+ ```
366
+ { "speaker": "speaker voice_id", "text": "text to narrate" }
367
+ ```
368
+ #### **Output**
369
+ WAV audio file.
370
+
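+ A minimal Python sketch, assuming the `silero-tts` module is enabled and using the `en_0` voice shown in the speakers list below; the raw response body is the WAV data:
+
+ ```python
+ import requests
+
+ response = requests.post(
+     "http://localhost:5100/api/tts/generate",
+     json={"speaker": "en_0", "text": "Hello from SillyTavern Extras."},
+ )
+ with open("narration.wav", "wb") as f:
+     f.write(response.content)  # response body is the WAV file itself
+ ```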
371
+ ### Get Silero TTS voices
372
+ `GET /api/tts/speakers`
373
+ #### **Output**
374
+ ```
375
+ [
376
+ {
377
+ "name": "en_0",
378
+ "preview_url": "http://127.0.0.1:5100/api/tts/sample/en_0",
379
+ "voice_id": "en_0"
380
+ }
381
+ ]
382
+ ```
383
+
384
+ ### Get Silero TTS voice sample
385
+ `GET /api/tts/sample/<voice_id>`
386
+ #### **Output**
387
+ WAV audio file.
388
+
389
+ ### Add messages to chromadb
390
+ `POST /api/chromadb`
391
+ #### **Input**
392
+ ```
393
+ {
394
+ "chat_id": "chat1 - 2023-12-31",
395
+ "messages": [
396
+ {
397
+ "id": "633a4bd1-8350-46b5-9ef2-f5d27acdecb7",
398
+ "date": 1684164339877,
399
+ "role": "user",
400
+ "content": "Hello, AI world!",
401
+ "meta": "this is meta"
402
+ },
403
+ {
404
+ "id": "8a2ed36b-c212-4a1b-84a3-0ffbe0896506",
405
+ "date": 1684164411759,
406
+ "role": "assistant",
407
+ "content": "Hello, Hooman!"
408
+ },
409
+ ]
410
+ }
411
+ ```
412
+ #### **Output**
413
+ ```
414
+ { "count": 2 }
415
+ ```
416
+
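+ A hedged Python sketch of pushing two messages into a collection (local server, `chromadb` module enabled; the IDs and dates are arbitrary example values):
+
+ ```python
+ import requests
+ import time
+ import uuid
+
+ def make_message(role, content):
+     # Each message needs a unique id and a millisecond timestamp
+     return {
+         "id": str(uuid.uuid4()),
+         "date": int(time.time() * 1000),
+         "role": role,
+         "content": content,
+     }
+
+ payload = {
+     "chat_id": "chat1 - 2023-12-31",
+     "messages": [
+         make_message("user", "Hello, AI world!"),
+         make_message("assistant", "Hello, Hooman!"),
+     ],
+ }
+ response = requests.post("http://localhost:5100/api/chromadb", json=payload)
+ print(response.json()["count"])  # number of messages stored
+ ```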
417
+ ### Query chromadb
418
+ `POST /api/chromadb/query`
419
+ #### **Input**
420
+ ```
421
+ {
422
+ "chat_id": "chat1 - 2023-12-31",
423
+ "query": "Hello",
424
+ "n_results": 2,
425
+ }
426
+ ```
427
+ #### **Output**
428
+ ```
429
+ [
430
+ {
431
+ "id": "633a4bd1-8350-46b5-9ef2-f5d27acdecb7",
432
+ "date": 1684164339877,
433
+ "role": "user",
434
+ "content": "Hello, AI world!",
435
+ "distance": 0.31,
436
+ "meta": "this is meta"
437
+ },
438
+ {
439
+ "id": "8a2ed36b-c212-4a1b-84a3-0ffbe0896506",
440
+ "date": 1684164411759,
441
+ "role": "assistant",
442
+ "content": "Hello, Hooman!",
443
+ "distance": 0.29
444
+ },
445
+ ]
446
+ ```
447
+
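+ A matching query sketch: retrieve the messages most similar to a search string for the same `chat_id` (a lower `distance` means a closer match):
+
+ ```python
+ import requests
+
+ payload = {
+     "chat_id": "chat1 - 2023-12-31",
+     "query": "Hello",
+     "n_results": 2,
+ }
+ response = requests.post("http://localhost:5100/api/chromadb/query", json=payload)
+ for message in response.json():
+     print(message["distance"], message["role"], message["content"])
+ ```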
448
+ ### Delete the messages from chromadb
449
+ `POST /api/chromadb/purge`
450
+ #### **Input**
451
+ ```
452
+ { "chat_id": "chat1 - 2023-04-12" }
453
+ ```
454
+
455
+ ### Get a list of Edge TTS voices
456
+ `GET /api/edge-tts/list`
457
+ #### **Output**
458
+ ```
459
+ [{'Name': 'Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)', 'ShortName': 'af-ZA-AdriNeural', 'Gender': 'Female', 'Locale': 'af-ZA', 'SuggestedCodec': 'audio-24khz-48kbitrate-mono-mp3', 'FriendlyName': 'Microsoft Adri Online (Natural) - Afrikaans (South Africa)', 'Status': 'GA', 'VoiceTag': {'ContentCategories': ['General'], 'VoicePersonalities': ['Friendly', 'Positive']}}]
460
+ ```
461
+
462
+ ### Generate Edge TTS voice
463
+ `POST /api/edge-tts/generate`
464
+ #### **Input**
465
+ ```
466
+ { "text": "Text to narrate", "voice": "af-ZA-AdriNeural", "rate": 0 }
467
+ ```
468
+ #### **Output**
469
+ MP3 audio file.
470
+
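+ A minimal Python sketch that picks a voice from the list endpoint and narrates a line with it (local server, `edge-tts` module enabled; the output filename is arbitrary):
+
+ ```python
+ import requests
+
+ BASE_URL = "http://localhost:5100"
+
+ # Pick the short name of the first available voice
+ voices = requests.get(f"{BASE_URL}/api/edge-tts/list").json()
+ voice = voices[0]["ShortName"]
+
+ response = requests.post(
+     f"{BASE_URL}/api/edge-tts/generate",
+     json={"text": "Text to narrate", "voice": voice, "rate": 0},
+ )
+ with open("narration.mp3", "wb") as f:
+     f.write(response.content)  # response body is the MP3 file itself
+ ```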
471
+ ### Load a Coqui TTS model
472
+ `GET /api/coqui-tts/load`
473
+ #### **Input**
474
+ _model (string, required): The name of the Coqui TTS model to load.
475
+ _gpu (string, optional): Use the GPU to load the model.
476
+ _progress (string, optional): Show a progress bar in the terminal.
477
+ ```
478
+ { "_model": "tts_models--en--jenny--jenny\model.pth" }
479
+ { "_gpu": "False" }
480
+ { "_progress": "True" }
481
+ ```
482
+ #### **Output**
483
+ "Loaded"
484
+
485
+ ### Get a list of Coqui TTS voices
486
+ `GET /api/coqui-tts/list`
487
+ #### **Output**
488
+ ```
489
+ ["tts_models--en--jenny--jenny\\model.pth", "tts_models--en--ljspeech--fast_pitch\\model_file.pth", "tts_models--en--ljspeech--glow-tts\\model_file.pth", "tts_models--en--ljspeech--neural_hmm\\model_file.pth", "tts_models--en--ljspeech--speedy-speech\\model_file.pth", "tts_models--en--ljspeech--tacotron2-DDC\\model_file.pth", "tts_models--en--ljspeech--vits\\model_file.pth", "tts_models--en--ljspeech--vits--neon\\model_file.pth.tar", "tts_models--en--multi-dataset--tortoise-v2", "tts_models--en--vctk--vits\\model_file.pth", "tts_models--et--cv--vits\\model_file.pth.tar", "tts_models--multilingual--multi-dataset--bark", "tts_models--multilingual--multi-dataset--your_tts\\model_file.pth", "tts_models--multilingual--multi-dataset--your_tts\\model_se.pth"]
490
+ ```
491
+
492
+ ### Get a list of the loaded Coqui model speakers
493
+ `GET /api/coqui-tts/multspeaker`
494
+ #### **Output**
495
+ ```
496
+ {"0": "female-en-5", "1": "female-en-5\n", "2": "female-pt-4\n", "3": "male-en-2", "4": "male-en-2\n", "5": "male-pt-3\n"}
497
+ ```
498
+
499
+ ### Get a list of the loaded Coqui model languages
500
+ `GET /api/coqui-tts/multlang`
501
+ #### **Output**
502
+ ```
503
+ {"0": "en", "1": "fr-fr", "2": "pt-br"}
504
+ ```
505
+
506
+ ### Generate Coqui TTS voice
507
+ `POST /api/coqui-tts/generate`
508
+ #### **Input**
509
+ ```
510
+ {
511
+ "text": "Text to narrate",
512
+ "speaker_id": "0",
513
+ "mspker": null,
514
+ "language_id": null,
515
+ "style_wav": null
516
+ }
517
+ ```
518
+ #### **Output**
519
+ WAV audio file.
520
+
521
+ ### Load a talkinghead character by specifying the character's image URL
522
+ `GET /api/talkinghead/load`
523
+ #### **Parameters**
524
+ loadchar (string, required): The URL of the character's image. The URL should point to a PNG image.
525
+ { "loadchar": "http://localhost:8000/characters/Aqua.png" }
526
+ #### **Example**
527
+ 'http://localhost:5100/api/talkinghead/load?loadchar=http://localhost:8000/characters/Aqua.png'
528
+ #### **Output**
529
+ 'OK'
530
+
531
+ ### Animate the talkinghead sprite to start talking
532
+ `GET /api/talkinghead/start_talking`
533
+ #### **Example**
534
+ 'http://localhost:5100/api/talkinghead/start_talking'
535
+ #### **Output**
536
+ "started"
537
+
538
+ ### Animate the talkinghead sprite to stop talking
539
+ `GET /api/talkinghead/stop_talking`
540
+ #### **Example**
541
+ 'http://localhost:5100/api/talkinghead/stop_talking'
542
+ #### **Output**
543
+ "stopped"
544
+
545
+ ### Output the animated talkinghead sprite
546
+ `GET /api/talkinghead/result_feed`
547
+ #### **Output**
548
+ Animated transparent image
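+
+ The talkinghead endpoints are plain GETs, so they can also be driven from Python. A hedged sketch, assuming a character PNG is served at the example URL used above:
+
+ ```python
+ import requests
+
+ BASE_URL = "http://localhost:5100"
+
+ # Load a character sprite from a URL pointing to a PNG (example URL, adjust to your setup)
+ requests.get(f"{BASE_URL}/api/talkinghead/load",
+              params={"loadchar": "http://localhost:8000/characters/Aqua.png"})
+
+ # Toggle the talking animation on and off
+ requests.get(f"{BASE_URL}/api/talkinghead/start_talking")
+ requests.get(f"{BASE_URL}/api/talkinghead/stop_talking")
+ ```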
api_key.txt ADDED
@@ -0,0 +1 @@
1
+ CHANGEME
constants.py ADDED
@@ -0,0 +1,49 @@
1
+ # Constants
2
+ DEFAULT_CUDA_DEVICE = "cuda:0"
3
+ # Also try: 'Qiliang/bart-large-cnn-samsum-ElectrifAi_v10'
4
+ DEFAULT_SUMMARIZATION_MODEL = "Qiliang/bart-large-cnn-samsum-ChatGPT_v3"
5
+ # Also try: 'joeddav/distilbert-base-uncased-go-emotions-student'
6
+ DEFAULT_CLASSIFICATION_MODEL = "nateraw/bert-base-uncased-emotion"
7
+ # Also try: 'Salesforce/blip-image-captioning-base'
8
+ DEFAULT_CAPTIONING_MODEL = "Salesforce/blip-image-captioning-large"
9
+ DEFAULT_SD_MODEL = "ckpt/anything-v4.5-vae-swapped"
10
+ DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
11
+ DEFAULT_REMOTE_SD_HOST = "127.0.0.1"
12
+ DEFAULT_REMOTE_SD_PORT = 7860
13
+ DEFAULT_CHROMA_PORT = 8000
14
+ SILERO_SAMPLES_PATH = "tts_samples"
15
+ SILERO_SAMPLE_TEXT = "The quick brown fox jumps over the lazy dog"
16
+ DEFAULT_SUMMARIZE_PARAMS = {
17
+ "temperature": 1.0,
18
+ "repetition_penalty": 1.0,
19
+ "max_length": 500,
20
+ "min_length": 200,
21
+ "length_penalty": 1.5,
22
+ "bad_words": [
23
+ "\n",
24
+ '"',
25
+ "*",
26
+ "[",
27
+ "]",
28
+ "{",
29
+ "}",
30
+ ":",
31
+ "(",
32
+ ")",
33
+ "<",
34
+ ">",
35
+ "Â",
36
+ "The text ends",
37
+ "The story ends",
38
+ "The text is",
39
+ "The story is",
40
+ ],
41
+ }
42
+
43
+ PROMPT_PREFIX = "best quality, absurdres, "
44
+ NEGATIVE_PROMPT = """lowres, bad anatomy, error body, error hair, error arm,
45
+ error hands, bad hands, error fingers, bad fingers, missing fingers,
46
+ error legs, bad legs, multiple legs, missing legs, error lighting,
47
+ error shadow, error reflection, text, error, extra digit, fewer digits,
48
+ cropped, worst quality, low quality, normal quality, jpeg artifacts,
49
+ signature, watermark, username, blurry"""
data/models/coqui/.placeholder ADDED
@@ -0,0 +1,2 @@
1
+ Put Coqui models folders here.
2
+ Must contain both a "model.pth" and a "config.json" file.
data/models/rvc/.placeholder ADDED
@@ -0,0 +1,3 @@
1
+ Put RVC model folders here.
2
+ Each folder must contain a ".pth" file.
3
+ A ".index" file is optional but can help improve processing time/quality.
data/tmp/.placeholder ADDED
@@ -0,0 +1,2 @@
1
+ This is a temporary file folder.
2
+ May contain RVC input/output files for research purposes.
docker/Dockerfile ADDED
@@ -0,0 +1,35 @@
1
+ FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
2
+
3
+ EXPOSE 5100
4
+
5
+ ENV PATH="/root/miniconda3/bin:${PATH}"
6
+ ARG PATH="/root/miniconda3/bin:${PATH}"
7
+
8
+ ENV DEBIAN_FRONTEND noninteractive
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ python3 python3-venv wget build-essential
11
+
12
+ RUN wget \
13
+ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
14
+ && mkdir /root/.conda \
15
+ && bash Miniconda3-latest-Linux-x86_64.sh -b \
16
+ && rm -f Miniconda3-latest-Linux-x86_64.sh
17
+
18
+ RUN conda --version
19
+
20
+ RUN conda init
21
+
22
+ RUN conda create -n extras
23
+
24
+ RUN /bin/bash -c "source activate extras"
25
+
26
+ RUN conda install pytorch torchvision torchaudio pytorch-cuda=11.7 git -c pytorch -c nvidia -c conda-forge
27
+
28
+ WORKDIR /sillytavern-extras/
29
+ COPY . .
30
+
31
+ ARG REQUIREMENTS
32
+ RUN pip install -r $REQUIREMENTS
33
+
34
+ ARG MODULES
35
+ CMD ["python","server.py","--enable-modules=$MODULES"]
docker/docker-compose.yml ADDED
@@ -0,0 +1,23 @@
1
+ version: "3"
2
+ services:
3
+ sillytavern-extras:
4
+ runtime: nvidia
5
+ image: cohee1207/sillytavern-extras
6
+ build:
7
+ context: ../
8
+ dockerfile: docker/Dockerfile
9
+ args:
10
+ REQUIREMENTS: requirements.txt
11
+ MODULES: caption,summarize,classify
12
+ # REQUIREMENTS: requirements-complete.txt
13
+ # MODULES: caption,summarize,classify,sd,silero-tts,edge-tts,chromadb
14
+ volumes:
15
+ #- "./chromadb:/chromadb"
16
+ - "./cache:/root/.cache"
17
+ - "./api_key.txt:/sillytavern-extras/api_key.txt:rw"
18
+ ports:
19
+ - "5100:5100"
20
+ environment:
21
+ - NVIDIA_VISIBLE_DEVICES=all
22
+ command: python server.py --enable-modules=caption,summarize,classify
23
+ # command: python server.py --enable-modules=caption,summarize,classify,sd,silero-tts,edge-tts,chromadb
docker/readme.md ADDED
@@ -0,0 +1,10 @@
1
+ # Docker Usage
2
+
3
+ ## Building the image
4
+
5
+ *This is assuming you have docker and docker compose installed and running.*
6
+
7
+ 1. Open a terminal and set your current directory to the "docker" directory in your clone of this repo.
8
+ 2. Adjust the "docker-compose.yml" file to match your needs. The default selection and the selection with all modules are provided as examples.
9
+ 3. Once ready, run the command "docker compose build" to build the "cohee1207/sillytavern-extras" docker image.
10
+
modules/classify/classify_module.py ADDED
@@ -0,0 +1,41 @@
1
+ """
2
+ Classify module for SillyTavern Extras
3
+
4
+ Authors:
5
+ - Tony Ribeiro (https://github.com/Tony-sama)
6
+ - Cohee (https://github.com/Cohee1207)
7
+
8
+ Provides classification features for text
9
+
10
+ References:
11
+ - https://huggingface.co/tasks/text-classification
12
+ """
13
+
14
+ from transformers import pipeline
15
+
16
+ DEBUG_PREFIX = "<Classify module>"
17
+
18
+ # Models init
19
+
20
+ text_emotion_pipe = None
21
+
22
+ def init_text_emotion_classifier(model_name: str, device: str, torch_dtype: str) -> None:
23
+ global text_emotion_pipe
24
+
25
+ print(DEBUG_PREFIX,"Initializing text classification pipeline with model",model_name)
26
+ text_emotion_pipe = pipeline(
27
+ "text-classification",
28
+ model=model_name,
29
+ top_k=None,
30
+ device=device,
31
+ torch_dtype=torch_dtype,
32
+ )
33
+
34
+
35
+ def classify_text_emotion(text: str) -> list:
36
+ output = text_emotion_pipe(
37
+ text,
38
+ truncation=True,
39
+ max_length=text_emotion_pipe.model.config.max_position_embeddings,
40
+ )[0]
41
+ return sorted(output, key=lambda x: x["score"], reverse=True)
modules/speech_recognition/streaming_module.py ADDED
@@ -0,0 +1,121 @@
1
+ """
2
+ Speech-to-text module based on Vosk and Whisper for SillyTavern Extras
3
+ - Vosk website: https://alphacephei.com/vosk/
4
+ - Vosk api: https://github.com/alphacep/vosk-api
5
+ - Whisper github: https://github.com/openai/whisper
6
+
7
+ Authors:
8
+ - Tony Ribeiro (https://github.com/Tony-sama)
9
+
10
+ Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk
11
+
12
+ References:
13
+ - Code adapted from:
14
+ - whisper github: https://github.com/openai/whisper
15
+ - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
16
+ - vosk github: https://github.com/alphacep/vosk-api/blob/master/python/example/test_microphone.py
17
+ """
18
+ from flask import jsonify, abort
19
+
20
+ import queue
21
+ import sys
22
+ import sounddevice as sd
23
+ import soundfile as sf
24
+ import io
25
+ import numpy as np
26
+ from scipy.io.wavfile import write
27
+
28
+ import vosk
29
+ import whisper
30
+
31
+ DEBUG_PREFIX = "<stt streaming module>"
32
+ RECORDING_FILE_PATH = "stt_test.wav"
33
+
34
+ whisper_model = None
35
+ vosk_model = None
36
+ device = None
37
+
38
+ def load_model(file_path=None):
39
+ """
40
+ Load the Whisper model and the given Vosk model, or default to Whisper "base.en" and Vosk "en-us".
41
+ Models are downloaded to the user cache folder, for example: C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk
42
+ """
43
+
44
+ if file_path is None:
45
+ return (whisper.load_model("base.en"), vosk.Model(lang="en-us"))
46
+ else:
47
+ return (whisper.load_model(file_path), vosk.Model(lang="en-us"))
48
+
49
+ def convert_bytearray_to_wav_ndarray(input_bytearray: bytes, sampling_rate=16000):
50
+ """
51
+ Convert a bytearray to WAV format for writing to a file, for quality-check debugging
52
+ """
53
+ bytes_wav = bytes()
54
+ byte_io = io.BytesIO(bytes_wav)
55
+ write(byte_io, sampling_rate, np.frombuffer(input_bytearray, dtype=np.int16))
56
+ output_wav = byte_io.read()
57
+ output, _ = sf.read(io.BytesIO(output_wav))
58
+ return output
59
+
60
+ def record_and_transcript():
61
+ """
62
+ Continuously record from the microphone and transcribe the voice.
63
+ Return the transcript once no more voice is detected.
64
+ """
65
+ if whisper_model is None:
66
+ print(DEBUG_PREFIX,"Whisper model not initialized yet.")
67
+ return ""
68
+
69
+ q = queue.Queue()
70
+ stream_errors = list()
71
+
72
+ def callback(indata, frames, time, status):
73
+ """This is called (from a separate thread) for each audio block."""
74
+ if status:
75
+ print(status, file=sys.stderr)
76
+ stream_errors.append(status)
77
+ q.put(bytes(indata))
78
+
79
+ try:
80
+ device_info = sd.query_devices(device, "input")
81
+ # soundfile expects an int, sounddevice provides a float:
82
+ samplerate = int(device_info["default_samplerate"])
83
+
84
+ print(DEBUG_PREFIX, "Start recording from:", device_info["name"], "with samplerate", samplerate)
85
+
86
+ with sd.RawInputStream(samplerate=samplerate, blocksize = 8000, device=device, dtype="int16", channels=1, callback=callback):
87
+
88
+ rec = vosk.KaldiRecognizer(vosk_model, samplerate)
89
+ full_recording = bytearray()
90
+ while True:
91
+ data = q.get()
92
+ if len(stream_errors) > 0:
93
+ raise Exception(DEBUG_PREFIX+" Stream errors: "+str(stream_errors))
94
+
95
+ full_recording.extend(data)
96
+
97
+ if rec.AcceptWaveform(data):
98
+ # Extract transcript string
99
+ transcript = rec.Result()[14:-3]
100
+ print(DEBUG_PREFIX, "Transcripted from microphone stream (vosk):", transcript)
101
+
102
+ # ----------------------------------
103
+ # DEBUG: save recording to wav file
104
+ # ----------------------------------
105
+ output_file = convert_bytearray_to_wav_ndarray(input_bytearray=full_recording, sampling_rate=samplerate)
106
+ sf.write(file=RECORDING_FILE_PATH, data=output_file, samplerate=samplerate)
107
+ print(DEBUG_PREFIX, "Recorded message saved to", RECORDING_FILE_PATH)
108
+
109
+ # Whisper HACK
110
+ result = whisper_model.transcribe(RECORDING_FILE_PATH)
111
+ transcript = result["text"]
112
+ print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
113
+ # ----------------------------------
114
+
115
+ return jsonify({"transcript": transcript})
116
+ #else:
117
+ # print(rec.PartialResult())
118
+
119
+ except Exception as e: # No exception observed during test but we never know
120
+ print(e)
121
+ abort(500, DEBUG_PREFIX+" Exception occurs while recording")
modules/speech_recognition/vosk_module.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ Speech-to-text module based on Vosk for SillyTavern Extras
3
+ - Vosk website: https://alphacephei.com/vosk/
4
+ - Vosk api: https://github.com/alphacep/vosk-api
5
+
6
+ Authors:
7
+ - Tony Ribeiro (https://github.com/Tony-sama)
8
+
9
+ Models are saved into user cache folder, example: C:/Users/toto/.cache/vosk
10
+
11
+ References:
12
+ - Code adapted from: https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py
13
+ """
14
+ from flask import jsonify, abort, request
15
+
16
+ import wave
17
+ from vosk import Model, KaldiRecognizer, SetLogLevel
18
+ import soundfile
19
+
20
+ DEBUG_PREFIX = "<stt vosk module>"
21
+ RECORDING_FILE_PATH = "stt_test.wav"
22
+
23
+ model = None
24
+
25
+ SetLogLevel(-1)
26
+
27
+ def load_model(file_path=None):
28
+ """
29
+ Load given vosk model from file or default to en-us model.
30
+ Download model to user cache folder, example: C:/Users/toto/.cache/vosk
31
+ """
32
+
33
+ if file_path is None:
34
+ return Model(lang="en-us")
35
+ else:
36
+ return Model(file_path)
37
+
38
+ def process_audio():
39
+ """
40
+ Transcribe the request audio file to text using Vosk
41
+ """
42
+
43
+ if model is None:
44
+ print(DEBUG_PREFIX,"Vosk model not initialized yet.")
45
+ return ""
46
+
47
+ try:
48
+ file = request.files.get('AudioFile')
49
+ file.save(RECORDING_FILE_PATH)
50
+
51
+ # Read and rewrite the file with soundfile
52
+ data, samplerate = soundfile.read(RECORDING_FILE_PATH)
53
+ soundfile.write(RECORDING_FILE_PATH, data, samplerate)
54
+
55
+ wf = wave.open(RECORDING_FILE_PATH, "rb")
56
+ if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
57
+ print("Audio file must be WAV format mono PCM.")
58
+ abort(500, DEBUG_PREFIX+" Audio file must be WAV format mono PCM.")
59
+
60
+ rec = KaldiRecognizer(model, wf.getframerate())
61
+ #rec.SetWords(True)
62
+ #rec.SetPartialWords(True)
63
+
64
+ while True:
65
+ data = wf.readframes(4000)
66
+ if len(data) == 0:
67
+ break
68
+ if rec.AcceptWaveform(data):
69
+ break
70
+
71
+ transcript = rec.Result()[14:-3]
72
+ print(DEBUG_PREFIX, "Transcripted from request audio file:", transcript)
73
+ return jsonify({"transcript": transcript})
74
+
75
+ except Exception as e: # No exception observed during test but we never know
76
+ print(e)
77
+ abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")
modules/speech_recognition/whisper_module.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ Speech-to-text module based on Whisper for SillyTavern Extras
3
+ - Whisper github: https://github.com/openai/whisper
4
+
5
+ Authors:
6
+ - Tony Ribeiro (https://github.com/Tony-sama)
7
+
8
+ Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
9
+
10
+ References:
11
+ - Code adapted from:
12
+ - whisper github: https://github.com/openai/whisper
13
+ - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
14
+ """
15
+ from flask import jsonify, abort, request
16
+
17
+ import whisper
18
+
19
+ DEBUG_PREFIX = "<stt whisper module>"
20
+ RECORDING_FILE_PATH = "stt_test.wav"
21
+
22
+ model = None
23
+
24
+ def load_model(file_path=None):
25
+ """
26
+ Load given Whisper model from file or default to the "base.en" model.
27
+ Models are downloaded to the user cache folder, example: C:/Users/toto/.cache/whisper
28
+ """
29
+
30
+ if file_path is None:
31
+ return whisper.load_model("base.en")
32
+ else:
33
+ return whisper.load_model(file_path)
34
+
35
+ def process_audio():
36
+ """
37
+ Transcribe the request audio file to text using Whisper
38
+ """
39
+
40
+ if model is None:
41
+ print(DEBUG_PREFIX,"Whisper model not initialized yet.")
42
+ return ""
43
+
44
+ try:
45
+ file = request.files.get('AudioFile')
46
+ file.save(RECORDING_FILE_PATH)
47
+
48
+ result = model.transcribe(RECORDING_FILE_PATH)
49
+ transcript = result["text"]
50
+ print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
51
+
52
+ return jsonify({"transcript": transcript})
53
+
54
+ except Exception as e: # No exception observed during test but we never know
55
+ print(e)
56
+ abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")
modules/text_to_speech/coqui/coqui_module.py ADDED
@@ -0,0 +1,333 @@
1
+ """
2
+ Coqui module for SillyTavern Extras
3
+
4
+ Authors:
5
+ - Pyrater (https://github.com/pyrater)
6
+ - Tony Ribeiro (https://github.com/Tony-sama)
7
+
8
+ Models are saved into user cache folder: "C:/Users/<username>/AppData/Local/tts"
9
+
10
+ References:
11
+ - Code adapted from:
12
+ - Coqui TTS https://tts.readthedocs.io/en/latest/
13
+ - Audio-webui: https://github.com/gitmylo/audio-webui
14
+ """
15
+ import json
16
+ import os
17
+ import io
18
+ import shutil
19
+
20
+ from flask import abort, request, send_file, jsonify
21
+
22
+ from TTS.api import TTS
23
+ from TTS.utils.manage import ModelManager
24
+
25
+ from modules.utils import silence_log
26
+
27
+ DEBUG_PREFIX = "<Coqui-TTS module>"
28
+ COQUI_MODELS_PATH = "data/models/coqui/"
29
+ IGNORED_FILES = [".placeholder"]
30
+ COQUI_LOCAL_MODEL_FILE_NAME = "model.pth"
31
+ COQUI_LOCAL_CONFIG_FILE_NAME = "config.json"
32
+
33
+ gpu_mode = False
34
+ is_downloading = False
35
+
36
+ def install_model(model_id):
37
+ global gpu_mode
38
+ audio_buffer = io.BytesIO()
39
+ speaker_id = None
40
+ language_id = None
41
+
42
+ print(DEBUG_PREFIX,"Loading model",model_id)
43
+ try:
44
+ tts = TTS(model_name=model_id, progress_bar=True, gpu=gpu_mode)
45
+
46
+ if tts.is_multi_lingual:
47
+ language_id = tts.languages[0]
48
+
49
+ if tts.is_multi_speaker:
50
+ speaker_id =tts.speakers[0]
51
+
52
+ tts.tts_to_file(text="this is a test message", file_path=audio_buffer, speaker=speaker_id, language=language_id)
53
+ except Exception as e:
54
+ print(DEBUG_PREFIX,"ERROR:", e)
55
+ print("Model", model_id, "cannot be loaded, maybe wrong model name? Must be one of")
56
+ for i in TTS.list_models():
57
+ print(i)
58
+ return False
59
+
60
+ print(DEBUG_PREFIX,"Success")
61
+ return True
62
+
63
+ def coqui_check_model_state():
64
+ """
65
+ Check if the requested model is installed on the server machine
66
+ """
67
+ try:
68
+ model_state = "absent"
69
+ request_json = request.get_json()
70
+ model_id = request_json["model_id"]
71
+
72
+ print(DEBUG_PREFIX,"Search for model", model_id)
73
+
74
+ coqui_models_folder = ModelManager().output_prefix # models location
75
+
76
+ # Check if tts folder exist
77
+ if os.path.isdir(coqui_models_folder):
78
+
79
+ installed_models = os.listdir(coqui_models_folder)
80
+
81
+ model_folder_exists = False
82
+ model_folder = None
83
+
84
+ for i in installed_models:
85
+ if model_id == i.replace("--","/",3): # Error with model wrong name
86
+ model_folder_exists = True
87
+ model_folder = i
88
+ print(DEBUG_PREFIX,"Folder found:",model_folder)
89
+
90
+ # Check failed download
91
+ if model_folder_exists:
92
+ content = os.listdir(os.path.join(coqui_models_folder,model_folder))
93
+ print(DEBUG_PREFIX,"Checking content:",content)
94
+ for i in content:
95
+ if i == model_folder+".zip":
96
+ print("Corrupt installed found, model download must have failed previously")
97
+ model_state = "corrupted"
98
+ break
99
+
100
+ if model_state != "corrupted":
101
+ model_state = "installed"
102
+
103
+ response = json.dumps({"model_state":model_state})
104
+ return response
105
+
106
+ except Exception as e:
107
+ print(e)
108
+ abort(500, DEBUG_PREFIX + " Exception occurs while trying to search for installed model")
109
+
110
+ def coqui_install_model():
111
+ """
112
+ Install the requested model on the server machine
113
+ """
114
+ global gpu_mode
115
+ global is_downloading
116
+
117
+ try:
118
+ model_installed = False
119
+ request_json = request.get_json()
120
+ model_id = request_json["model_id"]
121
+ action = request_json["action"]
122
+
123
+ print(DEBUG_PREFIX,"Received request",action,"for model",model_id)
124
+
125
+ if (is_downloading):
126
+ print(DEBUG_PREFIX,"Rejected, already downloading a model")
127
+ return json.dumps({"status":"downloading"})
128
+
129
+ coqui_models_folder = ModelManager().output_prefix # models location
130
+
131
+ # Check if tts folder exist
132
+ if os.path.isdir(coqui_models_folder):
133
+ installed_models = os.listdir(coqui_models_folder)
134
+ model_path = None
135
+
136
+ print(DEBUG_PREFIX,"Found",len(installed_models),"models in",coqui_models_folder)
137
+
138
+ for i in installed_models:
139
+ if model_id == i.replace("--","/"):
140
+ model_installed = True
141
+ model_path = os.path.join(coqui_models_folder,i)
142
+
143
+ if model_installed:
144
+ print(DEBUG_PREFIX,"model found:", model_id)
145
+ else:
146
+ print(DEBUG_PREFIX,"model not found")
147
+
148
+ if action == "download":
149
+ if model_installed:
150
+ abort(500, DEBUG_PREFIX + "Bad request, model already installed.")
151
+
152
+ is_downloading = True
153
+ TTS(model_name=model_id, progress_bar=True, gpu=gpu_mode)
154
+ is_downloading = False
155
+
156
+ if action == "repare":
157
+ if not model_installed:
158
+ abort(500, DEBUG_PREFIX + " bad request: requesting repare of model not installed")
159
+
160
+
161
+ print(DEBUG_PREFIX,"Deleting corrupted model folder:",model_path)
162
+ shutil.rmtree(model_path, ignore_errors=True)
163
+
164
+ is_downloading = True
165
+ TTS(model_name=model_id, progress_bar=True, gpu=gpu_mode)
166
+ is_downloading = False
167
+
168
+ response = json.dumps({"status":"done"})
169
+ return response
170
+
171
+ except Exception as e:
172
+ is_downloading = False
173
+ print(e)
174
+ abort(500, DEBUG_PREFIX + " Exception occurs while trying to search for installed model")
175
+
176
+ def coqui_get_local_models():
177
+ """
178
+ Return the list of local model folder names found in data/models/coqui/
179
+ """
180
+ try:
181
+ print(DEBUG_PREFIX, "Received request for list of RVC models")
182
+
183
+ folder_names = os.listdir(COQUI_MODELS_PATH)
184
+
185
+ print(DEBUG_PREFIX,"Searching model in",COQUI_MODELS_PATH)
186
+
187
+ model_list = []
188
+ for folder_name in folder_names:
189
+ folder_path = COQUI_MODELS_PATH+folder_name
190
+
191
+ if folder_name in IGNORED_FILES:
192
+ continue
193
+
194
+ # Must be a folder
195
+ if not os.path.isdir(folder_path):
196
+ print("> WARNING:",folder_name,"is not a folder, it should not be there, ignored")
197
+ continue
198
+
199
+ print("> Found model folder",folder_name)
200
+
201
+ # Check pth
202
+ valid_folder = False
203
+ for file_name in os.listdir(folder_path):
204
+ if file_name.endswith(".pth"):
205
+ print(" > pth:",file_name)
206
+ valid_folder = True
207
+ if file_name.endswith(".config"):
208
+ print(" > config:",file_name)
209
+
210
+ if valid_folder:
211
+ print(" > Valid folder added to list")
212
+ model_list.append(folder_name)
213
+ else:
214
+ print(" > WARNING: Missing pth or config file, ignored folder")
215
+
216
+ # Return the list of valid folders
217
+ response = json.dumps({"models_list":model_list})
218
+ return response
219
+
220
+ except Exception as e:
221
+ print(e)
222
+ abort(500, DEBUG_PREFIX + " Exception occurs while searching for Coqui models.")
223
+
224
+
225
+
226
+ def coqui_generate_tts():
227
+ """
228
+ Process request text with the requested Coqui TTS model
229
+ - expected request: {
230
+ "text": text,
231
+ "model_id": voiceId,
232
+ "language_id": language,
233
+ "speaker_id": speaker
234
+ }
235
+
236
+ - model_id formats:
237
+ - model_type/language/dataset/model_name
238
+ - model_type/language/dataset/model_name[speaker_id]
239
+ - model_type/language/dataset/model_name[speaker_id][language_id]
240
+ - examples:
241
+ - tts_models/ja/kokoro/tacotron2-DDC
242
+ - tts_models/en/vctk/vits[0]
243
+ - tts_models/multilingual/multi-dataset/your_tts[2][1]
244
+ """
245
+ global gpu_mode
246
+ global is_downloading
247
+ audio_buffer = io.BytesIO()
248
+
249
+ try:
250
+ request_json = request.get_json()
251
+ #print(request_json)
252
+
253
+ print(DEBUG_PREFIX,"Received TTS request for ", request_json)
254
+
255
+ if (is_downloading):
256
+ print(DEBUG_PREFIX,"Rejected, currently downloading a model, cannot perform TTS")
257
+ abort(500, DEBUG_PREFIX + " Requested TTS while downloading a model")
258
+
259
+ text = request_json["text"]
260
+ model_name = request_json["model_id"]
261
+ language_id = None
262
+ speaker_id = None
263
+
264
+ # Local model
265
+ model_type = model_name.split("/")[0]
266
+ if model_type == "local":
267
+ return generate_tts_local(model_name.split("/")[1], text)
268
+
269
+
270
+ if request_json["language_id"] != "none":
271
+ language_id = request_json["language_id"]
272
+
273
+ if request_json["speaker_id"] != "none":
274
+ speaker_id = request_json["speaker_id"]
275
+
276
+ print(DEBUG_PREFIX,"Loading tts \n- model", model_name, "\n - speaker_id: ",speaker_id,"\n - language_id: ",language_id, "\n - using",("GPU" if gpu_mode else "CPU"))
277
+
278
+ is_downloading = True
279
+ tts = TTS(model_name=model_name, progress_bar=True, gpu=gpu_mode)
280
+ is_downloading = False
281
+
282
+ if tts.is_multi_lingual:
283
+ if language_id is None:
284
+ abort(400, DEBUG_PREFIX + " Requested model "+model_name+" is multi-lingual but no language id provided")
285
+ language_id = tts.languages[int(language_id)]
286
+
287
+ if tts.is_multi_speaker:
288
+ if speaker_id is None:
289
+ abort(400, DEBUG_PREFIX + " Requested model "+model_name+" is multi-speaker but no speaker id provided")
290
+ speaker_id = tts.speakers[int(speaker_id)]
291
+
292
+ tts.tts_to_file(text=text, file_path=audio_buffer, speaker=speaker_id, language=language_id)
293
+
294
+ print(DEBUG_PREFIX, "Success, saved to",audio_buffer)
295
+
296
+ # Return the output_audio_path object as a response
297
+ response = send_file(audio_buffer, mimetype="audio/x-wav")
298
+ audio_buffer = io.BytesIO()
299
+
300
+ return response
301
+
302
+ except Exception as e:
303
+ print(e)
304
+ abort(500, DEBUG_PREFIX + " Exception occurred while processing request "+str(request_json))
305
+
306
+ def generate_tts_local(model_folder, text):
307
+ """
308
+ Generate tts using local coqui model
309
+ """
310
+ audio_buffer = io.BytesIO()
311
+
312
+ print(DEBUG_PREFIX,"Request for tts from local coqui model",model_folder)
313
+
314
+ model_path = os.path.join(COQUI_MODELS_PATH,model_folder,COQUI_LOCAL_MODEL_FILE_NAME)
315
+ config_path = os.path.join(COQUI_MODELS_PATH,model_folder,COQUI_LOCAL_CONFIG_FILE_NAME)
316
+
317
+ if not os.path.exists(model_path):
318
+ raise ValueError("File does not exist: " + model_path)
319
+
320
+ if not os.path.exists(config_path):
321
+ raise ValueError("File does not exist: " + config_path)
322
+
323
+ print(DEBUG_PREFIX,"Loading local tts model", model_path,"using",("GPU" if gpu_mode else "CPU"))
324
+ tts = TTS(model_path=model_path, config_path=config_path, progress_bar=True, gpu=gpu_mode)
325
+ tts.tts_to_file(text=text, file_path=audio_buffer)
326
+
327
+ print(DEBUG_PREFIX, "Success, saved to",audio_buffer)
328
+
329
+ # Return the output_audio_path object as a response
330
+ response = send_file(audio_buffer, mimetype="audio/x-wav")
331
+ audio_buffer = io.BytesIO()
332
+
333
+ return response
modules/utils.py ADDED
@@ -0,0 +1,15 @@
1
+
2
+ from contextlib import contextmanager
3
+ import os
+ import sys
4
+
5
+ @contextmanager
6
+ def silence_log():
7
+ old_stdout = sys.stdout
8
+ old_stderr = sys.stderr
9
+ try:
10
+ with open(os.devnull, "w") as new_target:
11
+ sys.stdout = new_target
+ sys.stderr = new_target
12
+ yield new_target
13
+ finally:
14
+ sys.stdout = old_stdout
15
+ sys.stderr = old_stderr
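+
+ # Illustrative usage (placeholder function name): silence the console output of a noisy call.
+ # with silence_log():
+ #     some_noisy_function()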
modules/voice_conversion/fairseq/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Facebook, Inc. and its affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
modules/voice_conversion/fairseq/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ """isort:skip_file"""
6
+
7
+ import os
8
+ import sys
9
+
10
+ try:
11
+ from .version import __version__ # noqa
12
+ except ImportError:
13
+ version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
14
+ with open(version_txt) as f:
15
+ __version__ = f.read().strip()
16
+
17
+ __all__ = ["pdb"]
18
+
19
+ # backwards compatibility to support `from fairseq.X import Y`
20
+ from fairseq.distributed import utils as distributed_utils
21
+ from fairseq.logging import meters, metrics, progress_bar # noqa
22
+
23
+ sys.modules["fairseq.distributed_utils"] = distributed_utils
24
+ sys.modules["fairseq.meters"] = meters
25
+ sys.modules["fairseq.metrics"] = metrics
26
+ sys.modules["fairseq.progress_bar"] = progress_bar
27
+
28
+ # initialize hydra
29
+ #from fairseq.dataclass.initialize import hydra_init
30
+
31
+ #hydra_init()
32
+
33
+ #import fairseq.criterions # noqa
34
+ #import fairseq.distributed # noqa
35
+ #import fairseq.models # noqa
36
+ #import fairseq.modules # noqa
37
+ #import fairseq.optim # noqa
38
+ #import fairseq.optim.lr_scheduler # noqa
39
+ #import fairseq.pdb # noqa
40
+ #import fairseq.scoring # noqa
41
+ #import fairseq.tasks # noqa
42
+ #import fairseq.token_generation_constraints # noqa
43
+
44
+ #import fairseq.benchmark # noqa
45
+ #import fairseq.model_parallel # noqa
modules/voice_conversion/fairseq/binarizer.py ADDED
@@ -0,0 +1,381 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ import typing as tp
9
+ from abc import ABC, abstractmethod
10
+ from collections import Counter
11
+ from dataclasses import dataclass
12
+ from multiprocessing import Pool
13
+
14
+ import torch
15
+
16
+ from fairseq.data import Dictionary, indexed_dataset
17
+ from fairseq.file_chunker_utils import Chunker, find_offsets
18
+ from fairseq.file_io import PathManager
19
+ from fairseq.tokenizer import tokenize_line
20
+
21
+ logger = logging.getLogger("binarizer")
22
+
23
+
24
+ @dataclass
25
+ class BinarizeSummary:
26
+ """
27
+ Keep track of what's going on in the binarizer
28
+ """
29
+
30
+ num_seq: int = 0
31
+ replaced: tp.Optional[Counter] = None
32
+ num_tok: int = 0
33
+
34
+ @property
35
+ def num_replaced(self) -> int:
36
+ if self.replaced is None:
37
+ return 0
38
+ return sum(self.replaced.values())
39
+
40
+ @property
41
+ def replaced_percent(self) -> float:
42
+ return 100 * self.num_replaced / self.num_tok
43
+
44
+ def __str__(self) -> str:
45
+ base = f"{self.num_seq} sents, {self.num_tok} tokens"
46
+ if self.replaced is None:
47
+ return base
48
+
49
+ return f"{base}, {self.replaced_percent:.3}% replaced"
50
+
51
+ def merge(self, other: "BinarizeSummary"):
52
+ replaced = None
53
+ if self.replaced is not None:
54
+ replaced = self.replaced
55
+ if other.replaced is not None:
56
+ if replaced is None:
57
+ replaced = other.replaced
58
+ else:
59
+ replaced += other.replaced
60
+ self.replaced = replaced
61
+ self.num_seq += other.num_seq
62
+ self.num_tok += other.num_tok
63
+
64
+
65
+ class Binarizer(ABC):
66
+ """
67
+ a binarizer describes how to take a string and build a tensor out of it
68
+ """
69
+
70
+ @abstractmethod
71
+ def binarize_line(
72
+ self,
73
+ line: str,
74
+ summary: BinarizeSummary,
75
+ ) -> torch.IntTensor:
76
+ ...
77
+
78
+
79
+ def _worker_prefix(output_prefix: str, worker_id: int):
80
+ return f"{output_prefix}.pt{worker_id}"
81
+
82
+
83
+ class FileBinarizer:
84
+ """
85
+ A file binarizer can take a file, tokenize it, and binarize each line to a tensor
86
+ """
87
+
88
+ @classmethod
89
+ def multiprocess_dataset(
90
+ cls,
91
+ input_file: str,
92
+ dataset_impl: str,
93
+ binarizer: Binarizer,
94
+ output_prefix: str,
95
+ vocab_size=None,
96
+ num_workers=1,
97
+ ) -> BinarizeSummary:
98
+ final_summary = BinarizeSummary()
99
+
100
+ offsets = find_offsets(input_file, num_workers)
101
+ # find_offsets returns a list of position [pos1, pos2, pos3, pos4] but we would want pairs:
102
+ # [(pos1, pos2), (pos2, pos3), (pos3, pos4)] to process the chunks with start/end info
103
+ # we zip the list with itself shifted by one to get all the pairs.
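+ # e.g. offsets [0, 100, 250] -> chunks [(0, 100), (100, 250)] (illustrative values)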
104
+ (first_chunk, *more_chunks) = zip(offsets, offsets[1:])
105
+ pool = None
106
+ if num_workers > 1:
107
+ pool = Pool(processes=num_workers - 1)
108
+ worker_results = [
109
+ pool.apply_async(
110
+ cls._binarize_chunk_and_finalize,
111
+ args=(
112
+ binarizer,
113
+ input_file,
114
+ start_offset,
115
+ end_offset,
116
+ _worker_prefix(
117
+ output_prefix,
118
+ worker_id,
119
+ ),
120
+ dataset_impl,
121
+ ),
122
+ kwds={
123
+ "vocab_size": vocab_size,
124
+ }
125
+ if vocab_size is not None
126
+ else {},
127
+ )
128
+ for worker_id, (start_offset, end_offset) in enumerate(
129
+ more_chunks, start=1
130
+ )
131
+ ]
132
+
133
+ pool.close()
134
+ pool.join()
135
+ for r in worker_results:
136
+ summ = r.get()
137
+ final_summary.merge(summ)
138
+
139
+ # do not close the bin file as we need to merge the worker results in
140
+ final_ds, summ = cls._binarize_file_chunk(
141
+ binarizer,
142
+ input_file,
143
+ offset_start=first_chunk[0],
144
+ offset_end=first_chunk[1],
145
+ output_prefix=output_prefix,
146
+ dataset_impl=dataset_impl,
147
+ vocab_size=vocab_size if vocab_size is not None else None,
148
+ )
149
+ final_summary.merge(summ)
150
+
151
+ if num_workers > 1:
152
+ for worker_id in range(1, num_workers):
153
+ # merge the worker outputs
154
+ worker_output_prefix = _worker_prefix(
155
+ output_prefix,
156
+ worker_id,
157
+ )
158
+ final_ds.merge_file_(worker_output_prefix)
159
+ try:
160
+ os.remove(indexed_dataset.data_file_path(worker_output_prefix))
161
+ os.remove(indexed_dataset.index_file_path(worker_output_prefix))
162
+ except Exception as e:
163
+ logger.error(
164
+ f"couldn't remove {worker_output_prefix}.*", exc_info=e
165
+ )
166
+
167
+ # now we can close the file
168
+ idx_file = indexed_dataset.index_file_path(output_prefix)
169
+ final_ds.finalize(idx_file)
170
+ return final_summary
171
+
172
+ @staticmethod
173
+ def _binarize_file_chunk(
174
+ binarizer: Binarizer,
175
+ filename: str,
176
+ offset_start: int,
177
+ offset_end: int,
178
+ output_prefix: str,
179
+ dataset_impl: str,
180
+ vocab_size=None,
181
+ ) -> tp.Tuple[tp.Any, BinarizeSummary]: # (dataset builder, BinarizeSummary)
182
+ """
183
+ creates a dataset builder and append binarized items to it. This function does not
184
+ finalize the builder, this is useful if you want to do other things with your bin file
185
+ like appending/merging other files
186
+ """
187
+ bin_file = indexed_dataset.data_file_path(output_prefix)
188
+ ds = indexed_dataset.make_builder(
189
+ bin_file,
190
+ impl=dataset_impl,
191
+ vocab_size=vocab_size,
192
+ )
193
+ summary = BinarizeSummary()
194
+
195
+ with Chunker(
196
+ PathManager.get_local_path(filename), offset_start, offset_end
197
+ ) as line_iterator:
198
+ for line in line_iterator:
199
+ ds.add_item(binarizer.binarize_line(line, summary))
200
+
201
+ return ds, summary
202
+
203
+ @classmethod
204
+ def _binarize_chunk_and_finalize(
205
+ cls,
206
+ binarizer: Binarizer,
207
+ filename: str,
208
+ offset_start: int,
209
+ offset_end: int,
210
+ output_prefix: str,
211
+ dataset_impl: str,
212
+ vocab_size=None,
213
+ ):
214
+ """
215
+ same as above, but also finalizes the builder
216
+ """
217
+ ds, summ = cls._binarize_file_chunk(
218
+ binarizer,
219
+ filename,
220
+ offset_start,
221
+ offset_end,
222
+ output_prefix,
223
+ dataset_impl,
224
+ vocab_size=vocab_size,
225
+ )
226
+
227
+ idx_file = indexed_dataset.index_file_path(output_prefix)
228
+ ds.finalize(idx_file)
229
+
230
+ return summ
231
+
232
+
233
+ class VocabularyDatasetBinarizer(Binarizer):
234
+ """
235
+ Takes a Dictionary/Vocabulary and assigns ids to each
236
+ token using the dictionary encode_line function.
237
+ """
238
+
239
+ def __init__(
240
+ self,
241
+ dict: Dictionary,
242
+ tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line,
243
+ append_eos: bool = True,
244
+ reverse_order: bool = False,
245
+ already_numberized: bool = False,
246
+ ) -> None:
247
+ self.dict = dict
248
+ self.tokenize = tokenize
249
+ self.append_eos = append_eos
250
+ self.reverse_order = reverse_order
251
+ self.already_numberized = already_numberized
252
+ super().__init__()
253
+
254
+ def binarize_line(
255
+ self,
256
+ line: str,
257
+ summary: BinarizeSummary,
258
+ ):
259
+ if summary.replaced is None:
260
+ summary.replaced = Counter()
261
+
262
+ def replaced_consumer(word, idx):
263
+ if idx == self.dict.unk_index and word != self.dict.unk_word:
264
+ summary.replaced.update([word])
265
+
266
+ if self.already_numberized:
267
+ id_strings = line.strip().split()
268
+ id_list = [int(id_string) for id_string in id_strings]
269
+ if self.reverse_order:
270
+ id_list.reverse()
271
+ if self.append_eos:
272
+ id_list.append(self.dict.eos())
273
+ ids = torch.IntTensor(id_list)
274
+ else:
275
+ ids = self.dict.encode_line(
276
+ line=line,
277
+ line_tokenizer=self.tokenize,
278
+ add_if_not_exist=False,
279
+ consumer=replaced_consumer,
280
+ append_eos=self.append_eos,
281
+ reverse_order=self.reverse_order,
282
+ )
283
+
284
+ summary.num_seq += 1
285
+ summary.num_tok += len(ids)
286
+ return ids
287
+
288
+
289
+ class AlignmentDatasetBinarizer(Binarizer):
290
+ """
291
+ binarize by parsing a set of alignments and packing
292
+ them in a tensor (see utils.parse_alignment)
293
+ """
294
+
295
+ def __init__(
296
+ self,
297
+ alignment_parser: tp.Callable[[str], torch.IntTensor],
298
+ ) -> None:
299
+ super().__init__()
300
+ self.alignment_parser = alignment_parser
301
+
302
+ def binarize_line(
303
+ self,
304
+ line: str,
305
+ summary: BinarizeSummary,
306
+ ):
307
+ ids = self.alignment_parser(line)
308
+ summary.num_seq += 1
309
+ summary.num_tok += len(ids)
310
+ return ids
311
+
312
+
313
+ class LegacyBinarizer:
314
+ @classmethod
315
+ def binarize(
316
+ cls,
317
+ filename: str,
318
+ dico: Dictionary,
319
+ consumer: tp.Callable[[torch.IntTensor], None],
320
+ tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line,
321
+ append_eos: bool = True,
322
+ reverse_order: bool = False,
323
+ offset: int = 0,
324
+ end: int = -1,
325
+ already_numberized: bool = False,
326
+ ) -> tp.Dict[str, int]:
327
+ binarizer = VocabularyDatasetBinarizer(
328
+ dict=dico,
329
+ tokenize=tokenize,
330
+ append_eos=append_eos,
331
+ reverse_order=reverse_order,
332
+ already_numberized=already_numberized,
333
+ )
334
+ return cls._consume_file(
335
+ filename,
336
+ binarizer,
337
+ consumer,
338
+ offset_start=offset,
339
+ offset_end=end,
340
+ )
341
+
342
+ @classmethod
343
+ def binarize_alignments(
344
+ cls,
345
+ filename: str,
346
+ alignment_parser: tp.Callable[[str], torch.IntTensor],
347
+ consumer: tp.Callable[[torch.IntTensor], None],
348
+ offset: int = 0,
349
+ end: int = -1,
350
+ ) -> tp.Dict[str, int]:
351
+ binarizer = AlignmentDatasetBinarizer(alignment_parser)
352
+ return cls._consume_file(
353
+ filename,
354
+ binarizer,
355
+ consumer,
356
+ offset_start=offset,
357
+ offset_end=end,
358
+ )
359
+
360
+ @staticmethod
361
+ def _consume_file(
362
+ filename: str,
363
+ binarizer: Binarizer,
364
+ consumer: tp.Callable[[torch.IntTensor], None],
365
+ offset_start: int,
366
+ offset_end: int,
367
+ ) -> tp.Dict[str, int]:
368
+ summary = BinarizeSummary()
369
+
370
+ with Chunker(
371
+ PathManager.get_local_path(filename), offset_start, offset_end
372
+ ) as line_iterator:
373
+ for line in line_iterator:
374
+ consumer(binarizer.binarize_line(line, summary))
375
+
376
+ return {
377
+ "nseq": summary.num_seq,
378
+ "nunk": summary.num_replaced,
379
+ "ntok": summary.num_tok,
380
+ "replaced": summary.replaced,
381
+ }
modules/voice_conversion/fairseq/checkpoint_utils.py ADDED
@@ -0,0 +1,905 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import ast
7
+ import collections
8
+ import contextlib
9
+ import inspect
10
+ import logging
11
+ import os
12
+ import re
13
+ import time
14
+ import traceback
15
+ from collections import OrderedDict
16
+ from pathlib import Path
17
+ from typing import Any, Dict, Optional, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from fairseq.data import data_utils
22
+ from fairseq.dataclass.configs import CheckpointConfig
23
+ from fairseq.dataclass.utils import (
24
+ convert_namespace_to_omegaconf,
25
+ overwrite_args_by_name,
26
+ )
27
+ from fairseq.distributed.fully_sharded_data_parallel import FSDP, has_FSDP
28
+ from fairseq.file_io import PathManager
29
+ from fairseq.models import FairseqDecoder, FairseqEncoder
30
+ from omegaconf import DictConfig, OmegaConf, open_dict
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
36
+ from fairseq import meters
37
+
38
+ # only one worker should attempt to create the required dir
39
+ if trainer.data_parallel_rank == 0:
40
+ os.makedirs(cfg.save_dir, exist_ok=True)
41
+
42
+ prev_best = getattr(save_checkpoint, "best", val_loss)
43
+ if val_loss is not None:
44
+ best_function = max if cfg.maximize_best_checkpoint_metric else min
45
+ save_checkpoint.best = best_function(val_loss, prev_best)
46
+
47
+ if cfg.no_save:
48
+ return
49
+
50
+ trainer.consolidate_optimizer() # TODO(SS): do we need this if no_save_optimizer_state
51
+
52
+ if not trainer.should_save_checkpoint_on_current_rank:
53
+ if trainer.always_call_state_dict_during_save_checkpoint:
54
+ trainer.state_dict()
55
+ return
56
+
57
+ write_timer = meters.StopwatchMeter()
58
+ write_timer.start()
59
+
60
+ epoch = epoch_itr.epoch
61
+ end_of_epoch = epoch_itr.end_of_epoch()
62
+ updates = trainer.get_num_updates()
63
+
64
+ logger.info(f"Preparing to save checkpoint for epoch {epoch} @ {updates} updates")
65
+
66
+ def is_better(a, b):
67
+ return a >= b if cfg.maximize_best_checkpoint_metric else a <= b
68
+
69
+ suffix = trainer.checkpoint_suffix
70
+ checkpoint_conds = collections.OrderedDict()
71
+ checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
72
+ end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
73
+ )
74
+ checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
75
+ not end_of_epoch
76
+ and cfg.save_interval_updates > 0
77
+ and updates % cfg.save_interval_updates == 0
78
+ )
79
+ checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
80
+ not hasattr(save_checkpoint, "best")
81
+ or is_better(val_loss, save_checkpoint.best)
82
+ )
83
+ if val_loss is not None and cfg.keep_best_checkpoints > 0:
84
+ worst_best = getattr(save_checkpoint, "best", None)
85
+ chkpts = checkpoint_paths(
86
+ cfg.save_dir,
87
+ pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
88
+ cfg.best_checkpoint_metric, suffix
89
+ ),
90
+ )
91
+ if len(chkpts) > 0:
92
+ p = chkpts[-1] if cfg.maximize_best_checkpoint_metric else chkpts[0]
93
+ worst_best = float(p.rsplit("_")[-1].replace("{}.pt".format(suffix), ""))
94
+ # add random digits to resolve ties
95
+ with data_utils.numpy_seed(epoch, updates, val_loss):
96
+ rand_sfx = np.random.randint(0, cfg.keep_best_checkpoints)
97
+
98
+ checkpoint_conds[
99
+ "checkpoint.best_{}_{:.3f}{}{}.pt".format(
100
+ cfg.best_checkpoint_metric, val_loss, rand_sfx, suffix
101
+ )
102
+ ] = worst_best is None or is_better(val_loss, worst_best)
103
+ checkpoint_conds[
104
+ "checkpoint_last{}.pt".format(suffix)
105
+ ] = not cfg.no_last_checkpoints
106
+
107
+ extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss}
108
+ if hasattr(save_checkpoint, "best"):
109
+ extra_state.update({"best": save_checkpoint.best})
110
+
111
+ checkpoints = [
112
+ os.path.join(cfg.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond
113
+ ]
114
+ if len(checkpoints) > 0 and trainer.should_save_checkpoint_on_current_rank:
115
+ trainer.save_checkpoint(checkpoints[0], extra_state)
116
+ for cp in checkpoints[1:]:
117
+ if cfg.write_checkpoints_asynchronously:
118
+ # TODO[ioPath]: Need to implement a delayed asynchronous
119
+ # file copying/moving feature.
120
+ logger.warning(
121
+ f"ioPath is not copying {checkpoints[0]} to {cp} "
122
+ "since async write mode is on."
123
+ )
124
+ else:
125
+ assert PathManager.copy(
126
+ checkpoints[0], cp, overwrite=True
127
+ ), f"Failed to copy {checkpoints[0]} to {cp}"
128
+
129
+ write_timer.stop()
130
+ logger.info(
131
+ "Saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format(
132
+ checkpoints[0], epoch, updates, val_loss, write_timer.sum
133
+ )
134
+ )
135
+
136
+ if not end_of_epoch and cfg.keep_interval_updates > 0:
137
+ # remove old checkpoints; checkpoints are sorted in descending order
138
+ if cfg.keep_interval_updates_pattern == -1:
139
+ checkpoints = checkpoint_paths(
140
+ cfg.save_dir, pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix)
141
+ )
142
+ else:
143
+ checkpoints = checkpoint_paths(
144
+ cfg.save_dir,
145
+ pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix),
146
+ keep_match=True,
147
+ )
148
+ checkpoints = [
149
+ x[0]
150
+ for x in checkpoints
151
+ if x[1] % cfg.keep_interval_updates_pattern != 0
152
+ ]
153
+
154
+ for old_chk in checkpoints[cfg.keep_interval_updates :]:
155
+ if os.path.lexists(old_chk):
156
+ os.remove(old_chk)
157
+ elif PathManager.exists(old_chk):
158
+ PathManager.rm(old_chk)
159
+
160
+ if cfg.keep_last_epochs > 0:
161
+ # remove old epoch checkpoints; checkpoints are sorted in descending order
162
+ checkpoints = checkpoint_paths(
163
+ cfg.save_dir, pattern=r"checkpoint(\d+){}\.pt".format(suffix)
164
+ )
165
+ for old_chk in checkpoints[cfg.keep_last_epochs :]:
166
+ if os.path.lexists(old_chk):
167
+ os.remove(old_chk)
168
+ elif PathManager.exists(old_chk):
169
+ PathManager.rm(old_chk)
170
+
171
+ if cfg.keep_best_checkpoints > 0:
172
+ # only keep the best N checkpoints according to validation metric
173
+ checkpoints = checkpoint_paths(
174
+ cfg.save_dir,
175
+ pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
176
+ cfg.best_checkpoint_metric, suffix
177
+ ),
178
+ )
179
+ if not cfg.maximize_best_checkpoint_metric:
180
+ checkpoints = checkpoints[::-1]
181
+ for old_chk in checkpoints[cfg.keep_best_checkpoints :]:
182
+ if os.path.lexists(old_chk):
183
+ os.remove(old_chk)
184
+ elif PathManager.exists(old_chk):
185
+ PathManager.rm(old_chk)
186
+
187
+
188
+ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
189
+ """
190
+ Load a checkpoint and restore the training iterator.
191
+
192
+ *passthrough_args* will be passed through to
193
+ ``trainer.get_train_iterator``.
194
+ """
195
+
196
+ reset_optimizer = cfg.reset_optimizer
197
+ reset_lr_scheduler = cfg.reset_lr_scheduler
198
+ optimizer_overrides = ast.literal_eval(cfg.optimizer_overrides)
199
+ reset_meters = cfg.reset_meters
200
+ reset_dataloader = cfg.reset_dataloader
201
+
202
+ if cfg.finetune_from_model is not None and (
203
+ reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
204
+ ):
205
+ raise ValueError(
206
+ "--finetune-from-model can not be set together with either --reset-optimizer"
207
+ " or reset_lr_scheduler or reset_meters or reset_dataloader"
208
+ )
209
+
210
+ suffix = trainer.checkpoint_suffix
211
+ if (
212
+ cfg.restore_file == "checkpoint_last.pt"
213
+ ): # default value of restore_file is 'checkpoint_last.pt'
214
+ checkpoint_path = os.path.join(
215
+ cfg.save_dir, "checkpoint_last{}.pt".format(suffix)
216
+ )
217
+ first_launch = not PathManager.exists(checkpoint_path)
218
+ if first_launch and getattr(cfg, "continue_once", None) is not None:
219
+ checkpoint_path = cfg.continue_once
220
+ elif cfg.finetune_from_model is not None and first_launch:
221
+ # if there is no last checkpoint to restore, start the finetune from pretrained model
222
+ # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc.
223
+ if PathManager.exists(cfg.finetune_from_model):
224
+ checkpoint_path = cfg.finetune_from_model
225
+ reset_optimizer = True
226
+ reset_lr_scheduler = True
227
+ reset_meters = True
228
+ reset_dataloader = True
229
+ logger.info(
230
+ f"loading pretrained model from {checkpoint_path}: "
231
+ "optimizer, lr scheduler, meters, dataloader will be reset"
232
+ )
233
+ else:
234
+ raise ValueError(
235
+ f"--finetune-from-model {cfg.finetune_from_model} does not exist"
236
+ )
237
+ elif suffix is not None:
238
+ checkpoint_path = cfg.restore_file.replace(".pt", suffix + ".pt")
239
+ else:
240
+ checkpoint_path = cfg.restore_file
241
+
242
+ if cfg.restore_file != "checkpoint_last.pt" and cfg.finetune_from_model:
243
+ raise ValueError(
244
+ "--finetune-from-model and --restore-file (non-default value) "
245
+ "can not be specified together: " + str(cfg)
246
+ )
247
+
248
+ extra_state = trainer.load_checkpoint(
249
+ checkpoint_path,
250
+ reset_optimizer,
251
+ reset_lr_scheduler,
252
+ optimizer_overrides,
253
+ reset_meters=reset_meters,
254
+ )
255
+
256
+ if (
257
+ extra_state is not None
258
+ and "best" in extra_state
259
+ and not reset_optimizer
260
+ and not reset_meters
261
+ ):
262
+ save_checkpoint.best = extra_state["best"]
263
+
264
+ if extra_state is not None and not reset_dataloader:
265
+ # restore iterator from checkpoint
266
+ itr_state = extra_state["train_iterator"]
267
+ epoch_itr = trainer.get_train_iterator(
268
+ epoch=itr_state["epoch"], load_dataset=True, **passthrough_args
269
+ )
270
+ epoch_itr.load_state_dict(itr_state)
271
+ else:
272
+ epoch_itr = trainer.get_train_iterator(
273
+ epoch=1, load_dataset=True, **passthrough_args
274
+ )
275
+
276
+ trainer.lr_step(epoch_itr.epoch)
277
+
278
+ return extra_state, epoch_itr
279
+
280
+
281
+ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):
282
+ """Loads a checkpoint to CPU (with upgrading for backward compatibility).
283
+
284
+ If doing single-GPU training or if the checkpoint is only being loaded by at
285
+ most one process on each node (current default behavior is for only rank 0
286
+ to read the checkpoint from disk), load_on_all_ranks should be False to
287
+ avoid errors from torch.distributed not having been initialized or
288
+ torch.distributed.barrier() hanging.
289
+
290
+ If all processes on each node may be loading the checkpoint
291
+ simultaneously, load_on_all_ranks should be set to True to avoid I/O
292
+ conflicts.
293
+
294
+ There's currently no support for > 1 but < all processes loading the
295
+ checkpoint on each node.
296
+ """
297
+ local_path = PathManager.get_local_path(path)
298
+ # The locally cached file returned by get_local_path() may be stale for
299
+ # remote files that are periodically updated/overwritten (ex:
300
+ # checkpoint_last.pt) - so we remove the local copy, sync across processes
301
+ # (if needed), and then download a fresh copy.
302
+ if local_path != path and PathManager.path_requires_pathmanager(path):
303
+ try:
304
+ os.remove(local_path)
305
+ except FileNotFoundError:
306
+ # With potentially multiple processes removing the same file, the
307
+ # file being missing is benign (missing_ok isn't available until
308
+ # Python 3.8).
309
+ pass
310
+ if load_on_all_ranks:
311
+ torch.distributed.barrier()
312
+ local_path = PathManager.get_local_path(path)
313
+
314
+ with open(local_path, "rb") as f:
315
+ state = torch.load(f, map_location=torch.device("cpu"))
316
+
317
+ if "args" in state and state["args"] is not None and arg_overrides is not None:
318
+ args = state["args"]
319
+ for arg_name, arg_val in arg_overrides.items():
320
+ setattr(args, arg_name, arg_val)
321
+
322
+ if "cfg" in state and state["cfg"] is not None:
323
+
324
+ # hack to be able to set Namespace in dict config. this should be removed when we update to newer
325
+ # omegaconf version that supports object flags, or when we migrate all existing models
326
+ from omegaconf import __version__ as oc_version
327
+ from omegaconf import _utils
328
+
329
+ if oc_version < "2.2":
330
+ old_primitive = _utils.is_primitive_type
331
+ _utils.is_primitive_type = lambda _: True
332
+
333
+ state["cfg"] = OmegaConf.create(state["cfg"])
334
+
335
+ _utils.is_primitive_type = old_primitive
336
+ OmegaConf.set_struct(state["cfg"], True)
337
+ else:
338
+ state["cfg"] = OmegaConf.create(state["cfg"], flags={"allow_objects": True})
339
+
340
+ if arg_overrides is not None:
341
+ overwrite_args_by_name(state["cfg"], arg_overrides)
342
+
343
+ state = _upgrade_state_dict(state)
344
+ return state
345
+
346
+
347
+ def load_model_ensemble(
348
+ filenames,
349
+ arg_overrides: Optional[Dict[str, Any]] = None,
350
+ task=None,
351
+ strict=True,
352
+ suffix="",
353
+ num_shards=1,
354
+ state=None,
355
+ ):
356
+ """Loads an ensemble of models.
357
+
358
+ Args:
359
+ filenames (List[str]): checkpoint files to load
360
+ arg_overrides (Dict[str,Any], optional): override model args that
361
+ were used during model training
362
+ task (fairseq.tasks.FairseqTask, optional): task to use for loading
363
+ """
364
+ assert not (
365
+ strict and num_shards > 1
366
+ ), "Cannot load state dict with strict=True and checkpoint shards > 1"
367
+ ensemble, args, _task = load_model_ensemble_and_task(
368
+ filenames,
369
+ arg_overrides,
370
+ task,
371
+ strict,
372
+ suffix,
373
+ num_shards,
374
+ state,
375
+ )
376
+ return ensemble, args
377
+
378
+
379
+ def get_maybe_sharded_checkpoint_filename(
380
+ filename: str, suffix: str, shard_idx: int, num_shards: int
381
+ ) -> str:
382
+ orig_filename = filename
383
+ filename = filename.replace(".pt", suffix + ".pt")
384
+ fsdp_filename = filename[:-3] + f"-shard{shard_idx}.pt"
385
+ model_parallel_filename = orig_filename[:-3] + f"_part{shard_idx}.pt"
386
+ if PathManager.exists(fsdp_filename):
387
+ return fsdp_filename
388
+ elif num_shards > 1:
389
+ return model_parallel_filename
390
+ else:
391
+ return filename
392
+
393
+
394
+ def load_model_ensemble_and_task(
395
+ filenames,
396
+ arg_overrides: Optional[Dict[str, Any]] = None,
397
+ task=None,
398
+ strict=True,
399
+ suffix="",
400
+ num_shards=1,
401
+ state=None,
402
+ ):
403
+ assert state is None or len(filenames) == 1
404
+
405
+ from fairseq import tasks
406
+
407
+ assert not (
408
+ strict and num_shards > 1
409
+ ), "Cannot load state dict with strict=True and checkpoint shards > 1"
410
+ ensemble = []
411
+ cfg = None
412
+ for filename in filenames:
413
+ orig_filename = filename
414
+ model_shard_state = {"shard_weights": [], "shard_metadata": []}
415
+ assert num_shards > 0
416
+ st = time.time()
417
+ for shard_idx in range(num_shards):
418
+ filename = get_maybe_sharded_checkpoint_filename(
419
+ orig_filename, suffix, shard_idx, num_shards
420
+ )
421
+
422
+ if not PathManager.exists(filename):
423
+ raise IOError("Model file not found: {}".format(filename))
424
+ if state is None:
425
+ state = load_checkpoint_to_cpu(filename, arg_overrides)
426
+ if "args" in state and state["args"] is not None:
427
+ cfg = convert_namespace_to_omegaconf(state["args"])
428
+ elif "cfg" in state and state["cfg"] is not None:
429
+ cfg = state["cfg"]
430
+ else:
431
+ raise RuntimeError(
432
+ f"Neither args nor cfg exist in state keys = {state.keys()}"
433
+ )
434
+
435
+ if task is None:
436
+ task = tasks.setup_task(cfg.task)
437
+
438
+ if "task_state" in state:
439
+ task.load_state_dict(state["task_state"])
440
+
441
+ if "fsdp_metadata" in state and num_shards > 1:
442
+ model_shard_state["shard_weights"].append(state["model"])
443
+ model_shard_state["shard_metadata"].append(state["fsdp_metadata"])
444
+ # check FSDP import before the code goes too far
445
+ if not has_FSDP:
446
+ raise ImportError(
447
+ "Cannot find FullyShardedDataParallel. "
448
+ "Please install fairscale with: pip install fairscale"
449
+ )
450
+ if shard_idx == num_shards - 1:
451
+ consolidated_model_state = FSDP.consolidate_shard_weights(
452
+ shard_weights=model_shard_state["shard_weights"],
453
+ shard_metadata=model_shard_state["shard_metadata"],
454
+ )
455
+ model = task.build_model(cfg.model)
456
+ if (
457
+ "optimizer_history" in state
458
+ and len(state["optimizer_history"]) > 0
459
+ and "num_updates" in state["optimizer_history"][-1]
460
+ ):
461
+ model.set_num_updates(
462
+ state["optimizer_history"][-1]["num_updates"]
463
+ )
464
+ model.load_state_dict(
465
+ consolidated_model_state, strict=strict, model_cfg=cfg.model
466
+ )
467
+ else:
468
+ # model parallel checkpoint or unsharded checkpoint
469
+ # support old external tasks
470
+
471
+ argspec = inspect.getfullargspec(task.build_model)
472
+ if "from_checkpoint" in argspec.args:
473
+ model = task.build_model(cfg.model, from_checkpoint=True)
474
+ else:
475
+ model = task.build_model(cfg.model)
476
+ if (
477
+ "optimizer_history" in state
478
+ and len(state["optimizer_history"]) > 0
479
+ and "num_updates" in state["optimizer_history"][-1]
480
+ ):
481
+ model.set_num_updates(state["optimizer_history"][-1]["num_updates"])
482
+ model.load_state_dict(
483
+ state["model"], strict=strict, model_cfg=cfg.model
484
+ )
485
+
486
+ # reset state so it gets loaded for the next model in ensemble
487
+ state = None
488
+ if shard_idx % 10 == 0 and shard_idx > 0:
489
+ elapsed = time.time() - st
490
+ logger.info(
491
+ f"Loaded {shard_idx} shards in {elapsed:.2f}s, {elapsed / (shard_idx+1):.2f}s/shard"
492
+ )
493
+
494
+ # build model for ensemble
495
+ ensemble.append(model)
496
+ return ensemble, cfg, task
497
+
498
+
499
+ def load_model_ensemble_and_task_from_hf_hub(
500
+ model_id,
501
+ cache_dir: Optional[str] = None,
502
+ arg_overrides: Optional[Dict[str, Any]] = None,
503
+ **kwargs: Any,
504
+ ):
505
+ try:
506
+ from huggingface_hub import snapshot_download
507
+ except ImportError:
508
+ raise ImportError(
509
+ "You need to install huggingface_hub to use `load_from_hf_hub`. "
510
+ "See https://pypi.org/project/huggingface-hub/ for installation."
511
+ )
512
+
513
+ library_name = "fairseq"
514
+ cache_dir = cache_dir or (Path.home() / ".cache" / library_name).as_posix()
515
+ cache_dir = snapshot_download(
516
+ model_id, cache_dir=cache_dir, library_name=library_name, **kwargs
517
+ )
518
+
519
+ _arg_overrides = arg_overrides or {}
520
+ _arg_overrides["data"] = cache_dir
521
+ return load_model_ensemble_and_task(
522
+ [p.as_posix() for p in Path(cache_dir).glob("*.pt")],
523
+ arg_overrides=_arg_overrides,
524
+ )
525
+
526
+
527
+ def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt", keep_match=False):
528
+ """Retrieves all checkpoints found in `path` directory.
529
+
530
+ Checkpoints are identified by matching filename to the specified pattern. If
531
+ the pattern contains groups, the result will be sorted by the first group in
532
+ descending order.
533
+ """
534
+ pt_regexp = re.compile(pattern)
535
+ files = PathManager.ls(path)
536
+
537
+ entries = []
538
+ for i, f in enumerate(files):
539
+ m = pt_regexp.fullmatch(f)
540
+ if m is not None:
541
+ idx = float(m.group(1)) if len(m.groups()) > 0 else i
542
+ entries.append((idx, m.group(0)))
543
+ if keep_match:
544
+ return [(os.path.join(path, x[1]), x[0]) for x in sorted(entries, reverse=True)]
545
+ else:
546
+ return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)]
547
+
548
+
549
+ def torch_persistent_save(obj, filename, async_write: bool = False):
550
+ if async_write:
551
+ with PathManager.opena(filename, "wb") as f:
552
+ _torch_persistent_save(obj, f)
553
+ else:
554
+ if PathManager.supports_rename(filename):
555
+ # do atomic save
556
+ with PathManager.open(filename + ".tmp", "wb") as f:
557
+ _torch_persistent_save(obj, f)
558
+ PathManager.rename(filename + ".tmp", filename)
559
+ else:
560
+ # fallback to non-atomic save
561
+ with PathManager.open(filename, "wb") as f:
562
+ _torch_persistent_save(obj, f)
563
+
564
+
565
+ def _torch_persistent_save(obj, f):
566
+ if isinstance(f, str):
567
+ with PathManager.open(f, "wb") as h:
568
+ torch_persistent_save(obj, h)
569
+ return
570
+ for i in range(3):
571
+ try:
572
+ return torch.save(obj, f)
573
+ except Exception:
574
+ if i == 2:
575
+ logger.error(traceback.format_exc())
576
+ raise
577
+
578
+
579
+ def _upgrade_state_dict(state):
580
+ """Helper for upgrading old model checkpoints."""
581
+
582
+ # add optimizer_history
583
+ if "optimizer_history" not in state:
584
+ state["optimizer_history"] = [
585
+ {"criterion_name": "CrossEntropyCriterion", "best_loss": state["best_loss"]}
586
+ ]
587
+ state["last_optimizer_state"] = state["optimizer"]
588
+ del state["optimizer"]
589
+ del state["best_loss"]
590
+ # move extra_state into sub-dictionary
591
+ if "epoch" in state and "extra_state" not in state:
592
+ state["extra_state"] = {
593
+ "epoch": state["epoch"],
594
+ "batch_offset": state["batch_offset"],
595
+ "val_loss": state["val_loss"],
596
+ }
597
+ del state["epoch"]
598
+ del state["batch_offset"]
599
+ del state["val_loss"]
600
+ # reduce optimizer history's memory usage (only keep the last state)
601
+ if "optimizer" in state["optimizer_history"][-1]:
602
+ state["last_optimizer_state"] = state["optimizer_history"][-1]["optimizer"]
603
+ for optim_hist in state["optimizer_history"]:
604
+ del optim_hist["optimizer"]
605
+ # record the optimizer class name
606
+ if "optimizer_name" not in state["optimizer_history"][-1]:
607
+ state["optimizer_history"][-1]["optimizer_name"] = "FairseqNAG"
608
+ # move best_loss into lr_scheduler_state
609
+ if "lr_scheduler_state" not in state["optimizer_history"][-1]:
610
+ state["optimizer_history"][-1]["lr_scheduler_state"] = {
611
+ "best": state["optimizer_history"][-1]["best_loss"]
612
+ }
613
+ del state["optimizer_history"][-1]["best_loss"]
614
+ # keep track of number of updates
615
+ if "num_updates" not in state["optimizer_history"][-1]:
616
+ state["optimizer_history"][-1]["num_updates"] = 0
617
+ # use stateful training data iterator
618
+ if "train_iterator" not in state["extra_state"]:
619
+ state["extra_state"]["train_iterator"] = {
620
+ "epoch": state["extra_state"].get("epoch", 0),
621
+ "iterations_in_epoch": state["extra_state"].get("batch_offset", 0),
622
+ }
623
+
624
+ # backward compatibility, cfg updates
625
+ if "args" in state and state["args"] is not None:
626
+ # old model checkpoints may not have separate source/target positions
627
+ if hasattr(state["args"], "max_positions") and not hasattr(
628
+ state["args"], "max_source_positions"
629
+ ):
630
+ state["args"].max_source_positions = state["args"].max_positions
631
+ state["args"].max_target_positions = state["args"].max_positions
632
+ # default to translation task
633
+ if not hasattr(state["args"], "task"):
634
+ state["args"].task = "translation"
635
+ # --raw-text and --lazy-load are deprecated
636
+ if getattr(state["args"], "raw_text", False):
637
+ state["args"].dataset_impl = "raw"
638
+ elif getattr(state["args"], "lazy_load", False):
639
+ state["args"].dataset_impl = "lazy"
640
+ # epochs start at 1
641
+ if state["extra_state"]["train_iterator"] is not None:
642
+ state["extra_state"]["train_iterator"]["epoch"] = max(
643
+ state["extra_state"]["train_iterator"].get("epoch", 1), 1
644
+ )
645
+ # --remove-bpe ==> --postprocess
646
+ if hasattr(state["args"], "remove_bpe"):
647
+ state["args"].post_process = state["args"].remove_bpe
648
+ # --min-lr ==> --stop-min-lr
649
+ if hasattr(state["args"], "min_lr"):
650
+ state["args"].stop_min_lr = state["args"].min_lr
651
+ del state["args"].min_lr
652
+ # binary_cross_entropy / kd_binary_cross_entropy => wav2vec criterion
653
+ if hasattr(state["args"], "criterion") and state["args"].criterion in [
654
+ "binary_cross_entropy",
655
+ "kd_binary_cross_entropy",
656
+ ]:
657
+ state["args"].criterion = "wav2vec"
658
+ # remove log_keys if it's None (criteria will supply a default value of [])
659
+ if hasattr(state["args"], "log_keys") and state["args"].log_keys is None:
660
+ delattr(state["args"], "log_keys")
661
+ # speech_pretraining => audio pretraining
662
+ if (
663
+ hasattr(state["args"], "task")
664
+ and state["args"].task == "speech_pretraining"
665
+ ):
666
+ state["args"].task = "audio_pretraining"
667
+ # audio_cpc => wav2vec
668
+ if hasattr(state["args"], "arch") and state["args"].arch == "audio_cpc":
669
+ state["args"].arch = "wav2vec"
670
+ # convert legacy float learning rate to List[float]
671
+ if hasattr(state["args"], "lr") and isinstance(state["args"].lr, float):
672
+ state["args"].lr = [state["args"].lr]
673
+ # convert task data arg to a string instead of List[string]
674
+ if (
675
+ hasattr(state["args"], "data")
676
+ and isinstance(state["args"].data, list)
677
+ and len(state["args"].data) > 0
678
+ ):
679
+ state["args"].data = state["args"].data[0]
680
+
681
+ state["cfg"] = convert_namespace_to_omegaconf(state["args"])
682
+
683
+ if "cfg" in state and state["cfg"] is not None:
684
+ cfg = state["cfg"]
685
+ with open_dict(cfg):
686
+ # any upgrades for Hydra-based configs
687
+ if (
688
+ "task" in cfg
689
+ and "eval_wer_config" in cfg.task
690
+ and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
691
+ ):
692
+ cfg.task.eval_wer_config.print_alignment = "hard"
693
+ if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool):
694
+ cfg.generation.print_alignment = (
695
+ "hard" if cfg.generation.print_alignment else None
696
+ )
697
+ if (
698
+ "model" in cfg
699
+ and "w2v_args" in cfg.model
700
+ and cfg.model.w2v_args is not None
701
+ and (
702
+ hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args
703
+ )
704
+ and hasattr(cfg.model.w2v_args.task, "eval_wer_config")
705
+ and cfg.model.w2v_args.task.eval_wer_config is not None
706
+ and isinstance(
707
+ cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
708
+ )
709
+ ):
710
+ cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard"
711
+
712
+ return state
713
+
714
+
715
+ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
716
+ """Prune the given state_dict if desired for LayerDrop
717
+ (https://arxiv.org/abs/1909.11556).
718
+
719
+ Training with LayerDrop allows models to be robust to pruning at inference
720
+ time. This function prunes state_dict to allow smaller models to be loaded
721
+ from a larger model and re-maps the existing state_dict for this to occur.
722
+
723
+ It's called by functions that load models from checkpoints and does not
724
+ need to be called directly.
725
+ """
726
+ arch = None
727
+ if model_cfg is not None:
728
+ arch = (
729
+ model_cfg._name
730
+ if isinstance(model_cfg, DictConfig)
731
+ else getattr(model_cfg, "arch", None)
732
+ )
733
+
734
+ if not model_cfg or arch is None or arch == "ptt_transformer":
735
+ # args should not be none, but don't crash if it is.
736
+ return state_dict
737
+
738
+ encoder_layers_to_keep = getattr(model_cfg, "encoder_layers_to_keep", None)
739
+ decoder_layers_to_keep = getattr(model_cfg, "decoder_layers_to_keep", None)
740
+
741
+ if not encoder_layers_to_keep and not decoder_layers_to_keep:
742
+ return state_dict
743
+
744
+ # apply pruning
745
+ logger.info(
746
+ "Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop"
747
+ )
748
+
749
+ def create_pruning_pass(layers_to_keep, layer_name):
750
+ keep_layers = sorted(
751
+ int(layer_string) for layer_string in layers_to_keep.split(",")
752
+ )
753
+ mapping_dict = {}
754
+ for i in range(len(keep_layers)):
755
+ mapping_dict[str(keep_layers[i])] = str(i)
756
+
757
+ regex = re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name))
758
+ return {"substitution_regex": regex, "mapping_dict": mapping_dict}
759
+
760
+ pruning_passes = []
761
+ if encoder_layers_to_keep:
762
+ pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder"))
763
+ if decoder_layers_to_keep:
764
+ pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder"))
765
+
766
+ new_state_dict = {}
767
+ for layer_name in state_dict.keys():
768
+ match = re.search(r"\.layers\.(\d+)\.", layer_name)
769
+ # if layer has no number in it, it is a supporting layer, such as an
770
+ # embedding
771
+ if not match:
772
+ new_state_dict[layer_name] = state_dict[layer_name]
773
+ continue
774
+
775
+ # otherwise, layer should be pruned.
776
+ original_layer_number = match.group(1)
777
+ # figure out which mapping dict to replace from
778
+ for pruning_pass in pruning_passes:
779
+ if original_layer_number in pruning_pass["mapping_dict"] and pruning_pass[
780
+ "substitution_regex"
781
+ ].search(layer_name):
782
+ new_layer_number = pruning_pass["mapping_dict"][original_layer_number]
783
+ substitution_match = pruning_pass["substitution_regex"].search(
784
+ layer_name
785
+ )
786
+ new_state_key = (
787
+ layer_name[: substitution_match.start(1)]
788
+ + new_layer_number
789
+ + layer_name[substitution_match.end(1) :]
790
+ )
791
+ new_state_dict[new_state_key] = state_dict[layer_name]
792
+
793
+ # Since layers are now pruned, *_layers_to_keep are no longer needed.
794
+ # This is more of "It would make it work fix" rather than a proper fix.
795
+ if isinstance(model_cfg, DictConfig):
796
+ context = open_dict(model_cfg)
797
+ else:
798
+ context = contextlib.ExitStack()
799
+ with context:
800
+ if hasattr(model_cfg, "encoder_layers_to_keep"):
801
+ model_cfg.encoder_layers_to_keep = None
802
+ if hasattr(model_cfg, "decoder_layers_to_keep"):
803
+ model_cfg.decoder_layers_to_keep = None
804
+
805
+ return new_state_dict
806
+
807
+
808
+ def load_pretrained_component_from_model(
809
+ component: Union[FairseqEncoder, FairseqDecoder],
810
+ checkpoint: str,
811
+ strict: bool = True,
812
+ ):
813
+ """
814
+ Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the
815
+ provided `component` object. If state_dict fails to load, there may be a
816
+ mismatch in the architecture of the corresponding `component` found in the
817
+ `checkpoint` file.
818
+ """
819
+ if not PathManager.exists(checkpoint):
820
+ raise IOError("Model file not found: {}".format(checkpoint))
821
+ state = load_checkpoint_to_cpu(checkpoint)
822
+ if isinstance(component, FairseqEncoder):
823
+ component_type = "encoder"
824
+ elif isinstance(component, FairseqDecoder):
825
+ component_type = "decoder"
826
+ else:
827
+ raise ValueError(
828
+ "component to load must be either a FairseqEncoder or "
829
+ "FairseqDecoder. Loading other component types are not supported."
830
+ )
831
+ component_state_dict = OrderedDict()
832
+ for key in state["model"].keys():
833
+ if key.startswith(component_type):
834
+ # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
835
+ component_subkey = key[len(component_type) + 1 :]
836
+ component_state_dict[component_subkey] = state["model"][key]
837
+ component.load_state_dict(component_state_dict, strict=strict)
838
+ return component
839
+
840
+
841
+ def verify_checkpoint_directory(save_dir: str) -> None:
842
+ if not os.path.exists(save_dir):
843
+ os.makedirs(save_dir, exist_ok=True)
844
+ temp_file_path = os.path.join(save_dir, "dummy")
845
+ try:
846
+ with open(temp_file_path, "w"):
847
+ pass
848
+ except OSError as e:
849
+ logger.warning(
850
+ "Unable to access checkpoint save directory: {}".format(save_dir)
851
+ )
852
+ raise e
853
+ else:
854
+ os.remove(temp_file_path)
855
+
856
+
857
+ def save_ema_as_checkpoint(src_path, dst_path):
858
+ state = load_ema_from_checkpoint(src_path)
859
+ torch_persistent_save(state, dst_path)
860
+
861
+
862
+ def load_ema_from_checkpoint(fpath):
863
+ """Loads exponential moving averaged (EMA) checkpoint from input and
864
+ returns a model with ema weights.
865
+
866
+ Args:
867
+ fpath: A string path of checkpoint to load from.
868
+
869
+ Returns:
870
+ A dict of string keys mapping to various values. The 'model' key
871
+ from the returned dict should correspond to an OrderedDict mapping
872
+ string parameter names to torch Tensors.
873
+ """
874
+ params_dict = collections.OrderedDict()
875
+ new_state = None
876
+
877
+ with PathManager.open(fpath, "rb") as f:
878
+ new_state = torch.load(
879
+ f,
880
+ map_location=(
881
+ lambda s, _: torch.serialization.default_restore_location(s, "cpu")
882
+ ),
883
+ )
884
+
885
+ # EMA model is stored in a separate "extra state"
886
+ model_params = new_state["extra_state"]["ema"]
887
+
888
+ for key in list(model_params.keys()):
889
+ p = model_params[key]
890
+ if isinstance(p, torch.HalfTensor):
891
+ p = p.float()
892
+ if key not in params_dict:
893
+ params_dict[key] = p.clone()
894
+ # NOTE: clone() is needed in case of p is a shared parameter
895
+ else:
896
+ raise ValueError("Key {} is repeated in EMA model params.".format(key))
897
+
898
+ if len(params_dict) == 0:
899
+ raise ValueError(
900
+ f"Input checkpoint path '{fpath}' does not contain "
901
+ "ema model weights, is this model trained with EMA?"
902
+ )
903
+
904
+ new_state["model"] = params_dict
905
+ return new_state
modules/voice_conversion/fairseq/data/__init__.py ADDED
@@ -0,0 +1,130 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ """isort:skip_file"""
6
+
7
+ from .dictionary import Dictionary, TruncatedDictionary
8
+
9
+ from .fairseq_dataset import FairseqDataset, FairseqIterableDataset
10
+
11
+ from .base_wrapper_dataset import BaseWrapperDataset
12
+
13
+ from .add_target_dataset import AddTargetDataset
14
+ from .append_token_dataset import AppendTokenDataset
15
+ from .audio.raw_audio_dataset import BinarizedAudioDataset, FileAudioDataset
16
+ from .audio.hubert_dataset import HubertDataset
17
+ from .backtranslation_dataset import BacktranslationDataset
18
+ from .bucket_pad_length_dataset import BucketPadLengthDataset
19
+ from .colorize_dataset import ColorizeDataset
20
+ from .concat_dataset import ConcatDataset
21
+ from .concat_sentences_dataset import ConcatSentencesDataset
22
+ from .denoising_dataset import DenoisingDataset
23
+ from .id_dataset import IdDataset
24
+ from .indexed_dataset import (
25
+ IndexedCachedDataset,
26
+ IndexedDataset,
27
+ IndexedRawTextDataset,
28
+ MMapIndexedDataset,
29
+ )
30
+ from .language_pair_dataset import LanguagePairDataset
31
+ from .list_dataset import ListDataset
32
+ from .lm_context_window_dataset import LMContextWindowDataset
33
+ from .lru_cache_dataset import LRUCacheDataset
34
+ from .mask_tokens_dataset import MaskTokensDataset
35
+ from .monolingual_dataset import MonolingualDataset
36
+ from .multi_corpus_sampled_dataset import MultiCorpusSampledDataset
37
+ from .nested_dictionary_dataset import NestedDictionaryDataset
38
+ from .noising import NoisingDataset
39
+ from .numel_dataset import NumelDataset
40
+ from .num_samples_dataset import NumSamplesDataset
41
+ from .offset_tokens_dataset import OffsetTokensDataset
42
+ from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset
43
+ from .prepend_dataset import PrependDataset
44
+ from .prepend_token_dataset import PrependTokenDataset
45
+ from .raw_label_dataset import RawLabelDataset
46
+ from .replace_dataset import ReplaceDataset
47
+ from .resampling_dataset import ResamplingDataset
48
+ from .roll_dataset import RollDataset
49
+ from .round_robin_zip_datasets import RoundRobinZipDatasets
50
+ from .sort_dataset import SortDataset
51
+ from .strip_token_dataset import StripTokenDataset
52
+ from .subsample_dataset import SubsampleDataset
53
+ from .token_block_dataset import TokenBlockDataset
54
+ from .transform_eos_dataset import TransformEosDataset
55
+ from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset
56
+ from .shorten_dataset import TruncateDataset, RandomCropDataset
57
+ from .multilingual.sampled_multi_dataset import SampledMultiDataset
58
+ from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset
59
+ from .fasta_dataset import FastaDataset, EncodedFastaDataset
60
+ from .transform_eos_concat_langpair_dataset import TransformEosConcatLangPairDataset
61
+
62
+ from .iterators import (
63
+ CountingIterator,
64
+ EpochBatchIterator,
65
+ GroupedIterator,
66
+ ShardedIterator,
67
+ )
68
+
69
+ __all__ = [
70
+ "AddTargetDataset",
71
+ "AppendTokenDataset",
72
+ "BacktranslationDataset",
73
+ "BaseWrapperDataset",
74
+ "BinarizedAudioDataset",
75
+ "BucketPadLengthDataset",
76
+ "ColorizeDataset",
77
+ "ConcatDataset",
78
+ "ConcatSentencesDataset",
79
+ "CountingIterator",
80
+ "DenoisingDataset",
81
+ "Dictionary",
82
+ "EncodedFastaDataset",
83
+ "EpochBatchIterator",
84
+ "FairseqDataset",
85
+ "FairseqIterableDataset",
86
+ "FastaDataset",
87
+ "FileAudioDataset",
88
+ "GroupedIterator",
89
+ "HubertDataset",
90
+ "IdDataset",
91
+ "IndexedCachedDataset",
92
+ "IndexedDataset",
93
+ "IndexedRawTextDataset",
94
+ "LanguagePairDataset",
95
+ "LeftPadDataset",
96
+ "ListDataset",
97
+ "LMContextWindowDataset",
98
+ "LRUCacheDataset",
99
+ "MaskTokensDataset",
100
+ "MMapIndexedDataset",
101
+ "MonolingualDataset",
102
+ "MultiCorpusSampledDataset",
103
+ "NestedDictionaryDataset",
104
+ "NoisingDataset",
105
+ "NumelDataset",
106
+ "NumSamplesDataset",
107
+ "OffsetTokensDataset",
108
+ "PadDataset",
109
+ "PrependDataset",
110
+ "PrependTokenDataset",
111
+ "RandomCropDataset",
112
+ "RawLabelDataset",
113
+ "ResamplingDataset",
114
+ "ReplaceDataset",
115
+ "RightPadDataset",
116
+ "RollDataset",
117
+ "RoundRobinZipDatasets",
118
+ "SampledMultiDataset",
119
+ "SampledMultiEpochDataset",
120
+ "ShardedIterator",
121
+ "SortDataset",
122
+ "StripTokenDataset",
123
+ "SubsampleDataset",
124
+ "TokenBlockDataset",
125
+ "TransformEosDataset",
126
+ "TransformEosLangPairDataset",
127
+ "TransformEosConcatLangPairDataset",
128
+ "TruncateDataset",
129
+ "TruncatedDictionary",
130
+ ]
modules/voice_conversion/fairseq/data/add_target_dataset.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ from . import BaseWrapperDataset, data_utils
9
+ from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
10
+
11
+
12
+ class AddTargetDataset(BaseWrapperDataset):
13
+ def __init__(
14
+ self,
15
+ dataset,
16
+ labels,
17
+ pad,
18
+ eos,
19
+ batch_targets,
20
+ process_label=None,
21
+ label_len_fn=None,
22
+ add_to_input=False,
23
+ text_compression_level=TextCompressionLevel.none,
24
+ ):
25
+ super().__init__(dataset)
26
+ self.labels = labels
27
+ self.batch_targets = batch_targets
28
+ self.pad = pad
29
+ self.eos = eos
30
+ self.process_label = process_label
31
+ self.label_len_fn = label_len_fn
32
+ self.add_to_input = add_to_input
33
+ self.text_compressor = TextCompressor(level=text_compression_level)
34
+
35
+ def get_label(self, index, process_fn=None):
36
+ lbl = self.labels[index]
37
+ lbl = self.text_compressor.decompress(lbl)
38
+ return lbl if process_fn is None else process_fn(lbl)
39
+
40
+ def __getitem__(self, index):
41
+ item = self.dataset[index]
42
+ item["label"] = self.get_label(index, process_fn=self.process_label)
43
+ return item
44
+
45
+ def size(self, index):
46
+ sz = self.dataset.size(index)
47
+ own_sz = self.label_len_fn(self.get_label(index))
48
+ return sz, own_sz
49
+
50
+ def collater(self, samples):
51
+ collated = self.dataset.collater(samples)
52
+ if len(collated) == 0:
53
+ return collated
54
+ indices = set(collated["id"].tolist())
55
+ target = [s["label"] for s in samples if s["id"] in indices]
56
+
57
+ if self.add_to_input:
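+ # teacher forcing inputs: EOS is prepended to prev_output_tokens and appended to the target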
58
+ eos = torch.LongTensor([self.eos])
59
+ prev_output_tokens = [torch.cat([eos, t], axis=-1) for t in target]
60
+ target = [torch.cat([t, eos], axis=-1) for t in target]
61
+ collated["net_input"]["prev_output_tokens"] = prev_output_tokens
62
+
63
+ if self.batch_targets:
64
+ collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
65
+ target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
66
+ collated["ntokens"] = collated["target_lengths"].sum().item()
67
+ if collated["net_input"].get("prev_output_tokens", None) is not None:
68
+ collated["net_input"]["prev_output_tokens"] = data_utils.collate_tokens(
69
+ collated["net_input"]["prev_output_tokens"],
70
+ pad_idx=self.pad,
71
+ left_pad=False,
72
+ )
73
+ else:
74
+ collated["ntokens"] = sum([len(t) for t in target])
75
+
76
+ collated["target"] = target
77
+ return collated
78
+
79
+ def filter_indices_by_size(self, indices, max_sizes):
80
+ indices, ignored = data_utils._filter_by_size_dynamic(
81
+ indices, self.size, max_sizes
82
+ )
83
+ return indices, ignored
modules/voice_conversion/fairseq/data/append_token_dataset.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ from . import BaseWrapperDataset
10
+
11
+
12
+ class AppendTokenDataset(BaseWrapperDataset):
13
+ def __init__(self, dataset, token=None):
14
+ super().__init__(dataset)
15
+ self.token = token
16
+ if token is not None:
17
+ self._sizes = np.array(dataset.sizes) + 1
18
+ else:
19
+ self._sizes = dataset.sizes
20
+
21
+ def __getitem__(self, idx):
22
+ item = self.dataset[idx]
23
+ if self.token is not None:
24
+ item = torch.cat([item, item.new([self.token])])
25
+ return item
26
+
27
+ @property
28
+ def sizes(self):
29
+ return self._sizes
30
+
31
+ def num_tokens(self, index):
32
+ n = self.dataset.num_tokens(index)
33
+ if self.token is not None:
34
+ n += 1
35
+ return n
36
+
37
+ def size(self, index):
38
+ n = self.dataset.size(index)
39
+ if self.token is not None:
40
+ n += 1
41
+ return n
modules/voice_conversion/fairseq/data/audio/__init__.py ADDED
@@ -0,0 +1,93 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Dict, Optional
3
+ import importlib
4
+ import os
5
+ import numpy as np
6
+
7
+
8
+ class AudioTransform(ABC):
9
+ @classmethod
10
+ @abstractmethod
11
+ def from_config_dict(cls, config: Optional[Dict] = None):
12
+ pass
13
+
14
+
15
+ class CompositeAudioTransform(AudioTransform):
16
+ def _from_config_dict(
17
+ cls,
18
+ transform_type,
19
+ get_audio_transform,
20
+ composite_cls,
21
+ config=None,
22
+ return_empty=False,
23
+ ):
24
+ _config = {} if config is None else config
25
+ _transforms = _config.get(f"{transform_type}_transforms")
26
+
27
+ if _transforms is None:
28
+ if return_empty:
29
+ _transforms = []
30
+ else:
31
+ return None
32
+
33
+ transforms = [
34
+ get_audio_transform(_t).from_config_dict(_config.get(_t))
35
+ for _t in _transforms
36
+ ]
37
+ return composite_cls(transforms)
38
+
39
+ def __init__(self, transforms):
40
+ self.transforms = [t for t in transforms if t is not None]
41
+
42
+ def __call__(self, x):
43
+ for t in self.transforms:
44
+ x = t(x)
45
+ return x
46
+
47
+ def __repr__(self):
48
+ format_string = (
49
+ [self.__class__.__name__ + "("]
50
+ + [f" {t.__repr__()}" for t in self.transforms]
51
+ + [")"]
52
+ )
53
+ return "\n".join(format_string)
54
+
55
+
56
+ def register_audio_transform(name, cls_type, registry, class_names):
57
+ def register_audio_transform_cls(cls):
58
+ if name in registry:
59
+ raise ValueError(f"Cannot register duplicate transform ({name})")
60
+ if not issubclass(cls, cls_type):
61
+ raise ValueError(
62
+ f"Transform ({name}: {cls.__name__}) must extend "
63
+ f"{cls_type.__name__}"
64
+ )
65
+ if cls.__name__ in class_names:
66
+ raise ValueError(
67
+ f"Cannot register audio transform with duplicate "
68
+ f"class name ({cls.__name__})"
69
+ )
70
+ registry[name] = cls
71
+ class_names.add(cls.__name__)
72
+ return cls
73
+
74
+ return register_audio_transform_cls
75
+
76
+
77
+ def import_transforms(transforms_dir, transform_type):
78
+ for file in os.listdir(transforms_dir):
79
+ path = os.path.join(transforms_dir, file)
80
+ if (
81
+ not file.startswith("_")
82
+ and not file.startswith(".")
83
+ and (file.endswith(".py") or os.path.isdir(path))
84
+ ):
85
+ name = file[: file.find(".py")] if file.endswith(".py") else file
86
+ importlib.import_module(
87
+ f"fairseq.data.audio.{transform_type}_transforms." + name
88
+ )
89
+
90
+
91
+ # Utility fn for uniform numbers in transforms
92
+ def rand_uniform(a, b):
93
+ return np.random.uniform() * (b - a) + a
modules/voice_conversion/fairseq/data/audio/audio_utils.py ADDED
@@ -0,0 +1,389 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import mmap
8
+ from pathlib import Path
9
+ import io
10
+ from typing import BinaryIO, List, Optional, Tuple, Union
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn.functional as F
15
+
16
+ from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform
17
+
18
+ SF_AUDIO_FILE_EXTENSIONS = {".wav", ".flac", ".ogg"}
19
+ FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS = {".npy", ".wav", ".flac", ".ogg"}
20
+
21
+
22
+ def convert_waveform(
23
+ waveform: Union[np.ndarray, torch.Tensor],
24
+ sample_rate: int,
25
+ normalize_volume: bool = False,
26
+ to_mono: bool = False,
27
+ to_sample_rate: Optional[int] = None,
28
+ ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
29
+ """convert a waveform:
30
+ - to a target sample rate
31
+ - from multi-channel to mono channel
32
+ - volume normalization
33
+
34
+ Args:
35
+ waveform (numpy.ndarray or torch.Tensor): 2D original waveform
36
+ (channels x length)
37
+ sample_rate (int): original sample rate
38
+ normalize_volume (bool): perform volume normalization
39
+ to_mono (bool): convert to mono channel if having multiple channels
40
+ to_sample_rate (Optional[int]): target sample rate
41
+ Returns:
42
+ waveform (numpy.ndarray): converted 2D waveform (channels x length)
43
+ sample_rate (float): target sample rate
44
+ """
45
+ try:
46
+ import torchaudio.sox_effects as ta_sox
47
+ except ImportError:
48
+ raise ImportError("Please install torchaudio: pip install torchaudio")
49
+
50
+ effects = []
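+ # build a sox effect chain: volume normalization, resampling, downmix to mono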
51
+ if normalize_volume:
52
+ effects.append(["gain", "-n"])
53
+ if to_sample_rate is not None and to_sample_rate != sample_rate:
54
+ effects.append(["rate", f"{to_sample_rate}"])
55
+ if to_mono and waveform.shape[0] > 1:
56
+ effects.append(["channels", "1"])
57
+ if len(effects) > 0:
58
+ is_np_input = isinstance(waveform, np.ndarray)
59
+ _waveform = torch.from_numpy(waveform) if is_np_input else waveform
60
+ converted, converted_sample_rate = ta_sox.apply_effects_tensor(
61
+ _waveform, sample_rate, effects
62
+ )
63
+ if is_np_input:
64
+ converted = converted.numpy()
65
+ return converted, converted_sample_rate
66
+ return waveform, sample_rate
67
+
68
+
69
+ def get_waveform(
70
+ path_or_fp: Union[str, BinaryIO],
71
+ normalization: bool = True,
72
+ mono: bool = True,
73
+ frames: int = -1,
74
+ start: int = 0,
75
+ always_2d: bool = True,
76
+ output_sample_rate: Optional[int] = None,
77
+ normalize_volume: bool = False,
78
+ waveform_transforms: Optional[CompositeAudioWaveformTransform] = None,
79
+ ) -> Tuple[np.ndarray, int]:
80
+ """Get the waveform and sample rate of a 16-bit WAV/FLAC/OGG Vorbis audio.
81
+
82
+ Args:
83
+ path_or_fp (str or BinaryIO): the path or file-like object
84
+ normalization (bool): normalize values to [-1, 1] (Default: True)
85
+ mono (bool): convert multi-channel audio to mono-channel one
86
+ frames (int): the number of frames to read. (-1 for reading all)
87
+ start (int): Where to start reading. A negative value counts from the end.
88
+ always_2d (bool): always return 2D array even for mono-channel audios
89
+ output_sample_rate (Optional[int]): output sample rate
90
+ normalize_volume (bool): normalize volume
91
+ Returns:
92
+ waveform (numpy.ndarray): 1D or 2D waveform (channels x length)
93
+ sample_rate (float): sample rate
94
+ """
95
+ if isinstance(path_or_fp, str):
96
+ ext = Path(path_or_fp).suffix
97
+ if ext not in SF_AUDIO_FILE_EXTENSIONS:
98
+ raise ValueError(f"Unsupported audio format: {ext}")
99
+
100
+ try:
101
+ import soundfile as sf
102
+ except ImportError:
103
+ raise ImportError("Please install soundfile: pip install soundfile")
104
+
105
+ waveform, sample_rate = sf.read(
106
+ path_or_fp, dtype="float32", always_2d=True, frames=frames, start=start
107
+ )
108
+ waveform = waveform.T # T x C -> C x T
109
+ waveform, sample_rate = convert_waveform(
110
+ waveform,
111
+ sample_rate,
112
+ normalize_volume=normalize_volume,
113
+ to_mono=mono,
114
+ to_sample_rate=output_sample_rate,
115
+ )
116
+
117
+ if not normalization:
118
+ waveform *= 2**15 # denormalized to 16-bit signed integers
119
+
120
+ if waveform_transforms is not None:
121
+ waveform, sample_rate = waveform_transforms(waveform, sample_rate)
122
+
123
+ if not always_2d:
124
+ waveform = waveform.squeeze(axis=0)
125
+
126
+ return waveform, sample_rate
127
+
128
+
129
+ def get_features_from_npy_or_audio(path, waveform_transforms=None):
130
+ ext = Path(path).suffix
131
+ if ext not in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
132
+ raise ValueError(f'Unsupported file format for "{path}"')
133
+ return (
134
+ np.load(path)
135
+ if ext == ".npy"
136
+ else get_fbank(path, waveform_transforms=waveform_transforms)
137
+ )
138
+
139
+
140
+ def get_features_or_waveform_from_stored_zip(
141
+ path,
142
+ byte_offset,
143
+ byte_size,
144
+ need_waveform=False,
145
+ use_sample_rate=None,
146
+ waveform_transforms=None,
147
+ ):
148
+ assert path.endswith(".zip")
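+ # the ZIP entry must be stored uncompressed so its raw bytes can be sliced directly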
149
+ data = read_from_stored_zip(path, byte_offset, byte_size)
150
+ f = io.BytesIO(data)
151
+ if is_npy_data(data):
152
+ features_or_waveform = np.load(f)
153
+ elif is_sf_audio_data(data):
154
+ features_or_waveform = (
155
+ get_waveform(
156
+ f,
157
+ always_2d=False,
158
+ output_sample_rate=use_sample_rate,
159
+ waveform_transforms=waveform_transforms,
160
+ )[0]
161
+ if need_waveform
162
+ else get_fbank(f, waveform_transforms=waveform_transforms)
163
+ )
164
+ else:
165
+ raise ValueError(f'Unknown file format for "{path}"')
166
+ return features_or_waveform
167
+
168
+
169
+ def get_features_or_waveform(
170
+ path: str, need_waveform=False, use_sample_rate=None, waveform_transforms=None
171
+ ):
172
+ """Get speech features from .npy file or waveform from .wav/.flac file.
173
+ The file may be inside an uncompressed ZIP file and is accessed via byte
174
+ offset and length.
175
+
176
+ Args:
177
+ path (str): File path in the format of "<.npy/.wav/.flac path>" or
178
+ "<zip path>:<byte offset>:<byte length>".
179
+ need_waveform (bool): return waveform instead of features.
180
+ use_sample_rate (int): change sample rate for the input wave file
181
+
182
+ Returns:
183
+ features_or_waveform (numpy.ndarray): speech features or waveform.
184
+ """
185
+ _path, slice_ptr = parse_path(path)
186
+ if len(slice_ptr) == 0:
187
+ if need_waveform:
188
+ return get_waveform(
189
+ _path,
190
+ always_2d=False,
191
+ output_sample_rate=use_sample_rate,
192
+ waveform_transforms=waveform_transforms,
193
+ )[0]
194
+ return get_features_from_npy_or_audio(
195
+ _path, waveform_transforms=waveform_transforms
196
+ )
197
+ elif len(slice_ptr) == 2:
198
+ features_or_waveform = get_features_or_waveform_from_stored_zip(
199
+ _path,
200
+ slice_ptr[0],
201
+ slice_ptr[1],
202
+ need_waveform=need_waveform,
203
+ use_sample_rate=use_sample_rate,
204
+ waveform_transforms=waveform_transforms,
205
+ )
206
+ else:
207
+ raise ValueError(f"Invalid path: {path}")
208
+
209
+ return features_or_waveform
210
+
211
+
212
+ def _get_kaldi_fbank(
213
+ waveform: np.ndarray, sample_rate: int, n_bins=80
214
+ ) -> Optional[np.ndarray]:
215
+ """Get mel-filter bank features via PyKaldi."""
216
+ try:
217
+ from kaldi.feat.fbank import Fbank, FbankOptions
218
+ from kaldi.feat.mel import MelBanksOptions
219
+ from kaldi.feat.window import FrameExtractionOptions
220
+ from kaldi.matrix import Vector
221
+
222
+ mel_opts = MelBanksOptions()
223
+ mel_opts.num_bins = n_bins
224
+ frame_opts = FrameExtractionOptions()
225
+ frame_opts.samp_freq = sample_rate
226
+ opts = FbankOptions()
227
+ opts.mel_opts = mel_opts
228
+ opts.frame_opts = frame_opts
229
+ fbank = Fbank(opts=opts)
230
+ features = fbank.compute(Vector(waveform.squeeze()), 1.0).numpy()
231
+ return features
232
+ except ImportError:
233
+ return None
234
+
235
+
236
+ def _get_torchaudio_fbank(
237
+ waveform: np.ndarray, sample_rate, n_bins=80
238
+ ) -> Optional[np.ndarray]:
239
+ """Get mel-filter bank features via TorchAudio."""
240
+ try:
241
+ import torchaudio.compliance.kaldi as ta_kaldi
242
+
243
+ waveform = torch.from_numpy(waveform)
244
+ features = ta_kaldi.fbank(
245
+ waveform, num_mel_bins=n_bins, sample_frequency=sample_rate
246
+ )
247
+ return features.numpy()
248
+ except ImportError:
249
+ return None
250
+
251
+
252
+ def get_fbank(
253
+ path_or_fp: Union[str, BinaryIO], n_bins=80, waveform_transforms=None
254
+ ) -> np.ndarray:
255
+ """Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi
256
+ (faster CPP implementation) to TorchAudio (Python implementation). Note that
257
+ Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the
258
+ waveform should not be normalized."""
259
+ waveform, sample_rate = get_waveform(
260
+ path_or_fp, normalization=False, waveform_transforms=waveform_transforms
261
+ )
262
+
263
+ features = _get_kaldi_fbank(waveform, sample_rate, n_bins)
264
+ if features is None:
265
+ features = _get_torchaudio_fbank(waveform, sample_rate, n_bins)
266
+ if features is None:
267
+ raise ImportError(
268
+ "Please install pyKaldi or torchaudio to enable "
269
+ "online filterbank feature extraction"
270
+ )
271
+
272
+ return features
273
+
274
+
275
+ def is_npy_data(data: bytes) -> bool:
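+ # .npy files start with the magic string b"\x93NUMPY" (0x93 == 147, ord("N") == 78)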
276
+ return data[0] == 147 and data[1] == 78
277
+
278
+
279
+ def is_sf_audio_data(data: bytes) -> bool:
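+ # check container magic bytes: "RIF" (WAV/RIFF), "fLa" (fLaC) or "Ogg" (OggS)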
280
+ is_wav = data[0] == 82 and data[1] == 73 and data[2] == 70
281
+ is_flac = data[0] == 102 and data[1] == 76 and data[2] == 97
282
+ is_ogg = data[0] == 79 and data[1] == 103 and data[2] == 103
283
+ return is_wav or is_flac or is_ogg
284
+
285
+
286
+ def mmap_read(path: str, offset: int, length: int) -> bytes:
287
+ with open(path, "rb") as f:
288
+ with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_o:
289
+ data = mmap_o[offset : offset + length]
290
+ return data
291
+
292
+
293
+ def read_from_stored_zip(zip_path: str, offset: int, length: int) -> bytes:
294
+ return mmap_read(zip_path, offset, length)
295
+
296
+
297
+ def parse_path(path: str) -> Tuple[str, List[int]]:
298
+ """Parse data path which is either a path to
299
+ 1. a .npy/.wav/.flac/.ogg file
300
+ 2. a stored ZIP file with slicing info: "[zip_path]:[offset]:[length]"
301
+
302
+ Args:
303
+ path (str): the data path to parse
304
+
305
+ Returns:
306
+ file_path (str): the file path
307
+ slice_ptr (list of int): empty in case 1;
308
+ byte offset and length for the slice in case 2
309
+ """
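+ # e.g. "feat.npy" -> ("feat.npy", []) and "data.zip:1024:2048" -> ("data.zip", [1024, 2048])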
310
+
311
+ if Path(path).suffix in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
312
+ _path, slice_ptr = path, []
313
+ else:
314
+ _path, *slice_ptr = path.split(":")
315
+ if not Path(_path).is_file():
316
+ raise FileNotFoundError(f"File not found: {_path}")
317
+ assert len(slice_ptr) in {0, 2}, f"Invalid path: {path}"
318
+ slice_ptr = [int(i) for i in slice_ptr]
319
+ return _path, slice_ptr
320
+
321
+
322
+ def get_window(window_fn: callable, n_fft: int, win_length: int) -> torch.Tensor:
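+ # center the analysis window inside the FFT frame by zero-padding it symmetrically to n_fft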
323
+ padding = n_fft - win_length
324
+ assert padding >= 0
325
+ return F.pad(window_fn(win_length), (padding // 2, padding - padding // 2))
326
+
327
+
328
+ def get_fourier_basis(n_fft: int) -> torch.Tensor:
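+ # DFT matrix rows up to the Nyquist bin, real part stacked on top of the imaginary part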
329
+ basis = np.fft.fft(np.eye(n_fft))
330
+ basis = np.vstack(
331
+ [np.real(basis[: n_fft // 2 + 1, :]), np.imag(basis[: n_fft // 2 + 1, :])]
332
+ )
333
+ return torch.from_numpy(basis).float()
334
+
335
+
336
+ def get_mel_filters(
337
+ sample_rate: int, n_fft: int, n_mels: int, f_min: float, f_max: float
338
+ ) -> torch.Tensor:
339
+ try:
340
+ import librosa
341
+ except ImportError:
342
+ raise ImportError("Please install librosa: pip install librosa")
343
+ basis = librosa.filters.mel(sample_rate, n_fft, n_mels, f_min, f_max)
344
+ return torch.from_numpy(basis).float()
345
+
346
+
347
+ class TTSSpectrogram(torch.nn.Module):
348
+ def __init__(
349
+ self,
350
+ n_fft: int,
351
+ win_length: int,
352
+ hop_length: int,
353
+ window_fn: callable = torch.hann_window,
354
+ return_phase: bool = False,
355
+ ) -> None:
356
+ super(TTSSpectrogram, self).__init__()
357
+ self.n_fft = n_fft
358
+ self.hop_length = hop_length
359
+ self.return_phase = return_phase
360
+
361
+ basis = get_fourier_basis(n_fft).unsqueeze(1)
362
+ basis *= get_window(window_fn, n_fft, win_length)
363
+ self.register_buffer("basis", basis)
364
+
365
+ def forward(
366
+ self, waveform: torch.Tensor
367
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
368
+ padding = (self.n_fft // 2, self.n_fft // 2)
369
+ x = F.pad(waveform.unsqueeze(1), padding, mode="reflect")
370
+ x = F.conv1d(x, self.basis, stride=self.hop_length)
371
+ real_part = x[:, : self.n_fft // 2 + 1, :]
372
+ imag_part = x[:, self.n_fft // 2 + 1 :, :]
373
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
374
+ if self.return_phase:
375
+ phase = torch.atan2(imag_part, real_part)
376
+ return magnitude, phase
377
+ return magnitude
378
+
379
+
380
+ class TTSMelScale(torch.nn.Module):
381
+ def __init__(
382
+ self, n_mels: int, sample_rate: int, f_min: float, f_max: float, n_stft: int
383
+ ) -> None:
384
+ super(TTSMelScale, self).__init__()
385
+ basis = get_mel_filters(sample_rate, (n_stft - 1) * 2, n_mels, f_min, f_max)
386
+ self.register_buffer("basis", basis)
387
+
388
+ def forward(self, specgram: torch.Tensor) -> torch.Tensor:
389
+ return torch.matmul(self.basis, specgram)
modules/voice_conversion/fairseq/data/audio/data_cfg.py ADDED
@@ -0,0 +1,387 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from argparse import Namespace
8
+ from copy import deepcopy
9
+ from pathlib import Path
10
+ from typing import Dict, Optional
11
+
12
+ from fairseq.data import Dictionary
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def get_config_from_yaml(yaml_path: Path):
18
+ try:
19
+ import yaml
20
+ except ImportError:
21
+ raise ImportError("Please install PyYAML: pip install PyYAML")
22
+ config = {}
23
+ if yaml_path.is_file():
24
+ try:
25
+ with open(yaml_path) as f:
26
+ config = yaml.load(f, Loader=yaml.FullLoader)
27
+ except Exception as e:
28
+ raise Exception(f"Failed to load config from {yaml_path.as_posix()}: {e}")
29
+ else:
30
+ raise FileNotFoundError(f"{yaml_path.as_posix()} not found")
31
+
32
+ return config
33
+
34
+
35
+ class S2TDataConfig(object):
36
+ """Wrapper class for data config YAML"""
37
+
38
+ def __init__(self, yaml_path: Path):
39
+ self.config = get_config_from_yaml(yaml_path)
40
+ self.root = yaml_path.parent
41
+
42
+ def _auto_convert_to_abs_path(self, x):
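+ # resolve paths relative to the directory containing the config YAML when they are not found as given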
43
+ if isinstance(x, str):
44
+ if not Path(x).exists() and (self.root / x).exists():
45
+ return (self.root / x).as_posix()
46
+ elif isinstance(x, dict):
47
+ return {k: self._auto_convert_to_abs_path(v) for k, v in x.items()}
48
+ return x
49
+
50
+ @property
51
+ def vocab_filename(self):
52
+ """fairseq vocabulary file under data root"""
53
+ return self.config.get("vocab_filename", "dict.txt")
54
+
55
+ @property
56
+ def speaker_set_filename(self):
57
+ """speaker set file under data root"""
58
+ return self.config.get("speaker_set_filename", None)
59
+
60
+ @property
61
+ def shuffle(self) -> bool:
62
+ """Shuffle dataset samples before batching"""
63
+ return self.config.get("shuffle", False)
64
+
65
+ @property
66
+ def pre_tokenizer(self) -> Dict:
67
+ """Pre-tokenizer to apply before subword tokenization. Returning
68
+ a dictionary with `tokenizer` providing the tokenizer name and
69
+ the other items providing the tokenizer-specific arguments.
70
+ Tokenizers are defined in `fairseq.data.encoders.*`"""
71
+ tokenizer = self.config.get("pre_tokenizer", {"tokenizer": None})
72
+ return self._auto_convert_to_abs_path(tokenizer)
73
+
74
+ @property
75
+ def bpe_tokenizer(self) -> Dict:
76
+ """Subword tokenizer to apply after pre-tokenization. Returning
77
+ a dictionary with `bpe` providing the tokenizer name and
78
+ the other items providing the tokenizer-specific arguments.
79
+ Tokenizers are defined in `fairseq.data.encoders.*`"""
80
+ tokenizer = self.config.get("bpe_tokenizer", {"bpe": None})
81
+ return self._auto_convert_to_abs_path(tokenizer)
82
+
83
+ @property
84
+ def prepend_tgt_lang_tag(self) -> bool:
85
+ """Prepend target lang ID token as the target BOS (e.g. for to-many
86
+ multilingual setting). During inference, this requires `--prefix-size 1`
87
+ to force BOS to be lang ID token."""
88
+ return self.config.get("prepend_tgt_lang_tag", False)
89
+
90
+ @property
91
+ def prepend_bos_and_append_tgt_lang_tag(self) -> bool:
92
+ """Prepend BOS and append target lang ID token to the target (e.g. mBART with language token pretraining)."""
93
+ return self.config.get("prepend_bos_and_append_tgt_lang_tag", False)
94
+
95
+ @property
96
+ def input_feat_per_channel(self):
97
+ """The dimension of input features (per audio channel)"""
98
+ return self.config.get("input_feat_per_channel", 80)
99
+
100
+ @property
101
+ def input_channels(self):
102
+ """The number of channels in the input audio"""
103
+ return self.config.get("input_channels", 1)
104
+
105
+ @property
106
+ def sample_rate(self):
107
+ return self.config.get("sample_rate", 16_000)
108
+
109
+ @property
110
+ def sampling_alpha(self):
111
+ """Hyper-parameter alpha = 1/T for temperature-based resampling.
112
+ (alpha = 1 for no resampling)"""
113
+ return self.config.get("sampling_alpha", 1.0)
114
+
115
+ @property
116
+ def use_audio_input(self):
117
+ """Needed by the dataset loader to see if the model requires
118
+ raw audio as inputs."""
119
+ return self.config.get("use_audio_input", False)
120
+
121
+ def standardize_audio(self) -> bool:
122
+ return self.use_audio_input and self.config.get("standardize_audio", False)
123
+
124
+ @property
125
+ def use_sample_rate(self):
126
+ """Needed by the dataset loader to see if the model requires
127
+ raw audio with specific sample rate as inputs."""
128
+ return self.config.get("use_sample_rate", 16000)
129
+
130
+ @property
131
+ def audio_root(self):
132
+ """Audio paths in the manifest TSV can be relative and this provides
133
+ the root path. Set this to empty string when using absolute paths."""
134
+ return self.config.get("audio_root", "")
135
+
136
+ def get_transforms(self, transform_type, split, is_train):
137
+ """Split-specific feature transforms. Allowing train set
138
+ wildcard `_train`, evaluation set wildcard `_eval` and general
139
+ wildcard `*` for matching."""
140
+ from copy import deepcopy
141
+
142
+ cfg = deepcopy(self.config)
143
+ _cur = cfg.get(f"{transform_type}transforms", {})
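+ # lookup order: exact split name, then the _train/_eval wildcard, then the general "*" wildcard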
144
+ cur = _cur.get(split)
145
+ cur = _cur.get("_train") if cur is None and is_train else cur
146
+ cur = _cur.get("_eval") if cur is None and not is_train else cur
147
+ cur = _cur.get("*") if cur is None else cur
148
+ return cur
149
+
150
+ def get_feature_transforms(self, split, is_train):
151
+ cfg = deepcopy(self.config)
152
+ # TODO: deprecate transforms
153
+ cur = self.get_transforms("", split, is_train)
154
+ if cur is not None:
155
+ logger.warning(
156
+ "Auto converting transforms into feature_transforms, "
157
+ "but transforms will be deprecated in the future. Please "
158
+ "update this in the config."
159
+ )
160
+ ft_transforms = self.get_transforms("feature_", split, is_train)
161
+ if ft_transforms:
162
+ cur.extend(ft_transforms)
163
+ else:
164
+ cur = self.get_transforms("feature_", split, is_train)
165
+ cfg["feature_transforms"] = cur
166
+ return cfg
167
+
168
+ def get_waveform_transforms(self, split, is_train):
169
+ cfg = deepcopy(self.config)
170
+ cfg["waveform_transforms"] = self.get_transforms("waveform_", split, is_train)
171
+ return cfg
172
+
173
+ def get_dataset_transforms(self, split, is_train):
174
+ cfg = deepcopy(self.config)
175
+ cfg["dataset_transforms"] = self.get_transforms("dataset_", split, is_train)
176
+ return cfg
177
+
178
+ @property
179
+ def global_cmvn_stats_npz(self) -> Optional[str]:
180
+ path = self.config.get("global_cmvn", {}).get("stats_npz_path", None)
181
+ return self._auto_convert_to_abs_path(path)
182
+
183
+ @property
184
+ def vocoder(self) -> Dict[str, str]:
185
+ vocoder = self.config.get("vocoder", {"type": "griffin_lim"})
186
+ return self._auto_convert_to_abs_path(vocoder)
187
+
188
+ @property
189
+ def hub(self) -> Dict[str, str]:
190
+ return self.config.get("hub", {})
191
+
192
+
193
+ class S2SDataConfig(S2TDataConfig):
194
+ """Wrapper class for data config YAML"""
195
+
196
+ @property
197
+ def vocab_filename(self):
198
+ """fairseq vocabulary file under data root"""
199
+ return self.config.get("vocab_filename", None)
200
+
201
+ @property
202
+ def pre_tokenizer(self) -> Dict:
203
+ return None
204
+
205
+ @property
206
+ def bpe_tokenizer(self) -> Dict:
207
+ return None
208
+
209
+ @property
210
+ def input_transformed_channels(self):
211
+ """The number of channels in the audio after feature transforms"""
212
+ # TODO: move this into individual transforms
213
+ # TODO: deprecate transforms
214
+ _cur = self.config.get("transforms", {})
215
+ ft_transforms = self.config.get("feature_transforms", {})
216
+ if _cur and ft_transforms:
217
+ _cur.update(ft_transforms)
218
+ else:
219
+ _cur = self.config.get("feature_transforms", {})
220
+ cur = _cur.get("_train", [])
221
+
222
+ _channels = self.input_channels
223
+ if "delta_deltas" in cur:
224
+ _channels *= 3
225
+
226
+ return _channels
227
+
228
+ @property
229
+ def output_sample_rate(self):
230
+ """The audio sample rate of output target speech"""
231
+ return self.config.get("output_sample_rate", 22050)
232
+
233
+ @property
234
+ def target_speaker_embed(self):
235
+ """Target speaker embedding file (one line per target audio sample)"""
236
+ return self.config.get("target_speaker_embed", None)
237
+
238
+ @property
239
+ def prepend_tgt_lang_tag_as_bos(self) -> bool:
240
+ """Prepend target lang ID token as the target BOS."""
241
+ return self.config.get("prepend_tgt_lang_tag_as_bos", False)
242
+
243
+
244
+ class MultitaskConfig(object):
245
+ """Wrapper class for data config YAML"""
246
+
247
+ def __init__(self, yaml_path: Path):
248
+ config = get_config_from_yaml(yaml_path)
249
+ self.config = {}
250
+ for k, v in config.items():
251
+ self.config[k] = SingleTaskConfig(k, v)
252
+
253
+ def get_all_tasks(self):
254
+ return self.config
255
+
256
+ def get_single_task(self, name):
257
+ assert name in self.config, f"multitask '{name}' does not exist!"
258
+ return self.config[name]
259
+
260
+ @property
261
+ def first_pass_decoder_task_index(self):
262
+ """Return the task index of the first-pass text decoder.
263
+ If there are multiple 'is_first_pass_decoder: True' in the config file,
264
+ the last task is used for the first-pass decoder.
265
+ If there is no 'is_first_pass_decoder: True' in the config file,
266
+ the last task whose task_name includes 'target' and decoder_type is not ctc.
267
+ """
268
+ idx = -1
269
+ for i, (k, v) in enumerate(self.config.items()):
270
+ if v.is_first_pass_decoder:
271
+ idx = i
272
+ if idx < 0:
273
+ for i, (k, v) in enumerate(self.config.items()):
274
+ if k.startswith("target") and v.decoder_type == "transformer":
275
+ idx = i
276
+ return idx
277
+
278
+
279
+ class SingleTaskConfig(object):
280
+ def __init__(self, name, config):
281
+ self.task_name = name
282
+ self.config = config
283
+ dict_path = config.get("dict", "")
284
+ self.tgt_dict = Dictionary.load(dict_path) if Path(dict_path).exists() else None
285
+
286
+ @property
287
+ def data(self):
288
+ return self.config.get("data", "")
289
+
290
+ @property
291
+ def decoder_type(self):
292
+ return self.config.get("decoder_type", "transformer")
293
+
294
+ @property
295
+ def decoder_args(self):
296
+ """Decoder arch related args"""
297
+ args = self.config.get("decoder_args", {})
298
+ return Namespace(**args)
299
+
300
+ @property
301
+ def criterion_cfg(self):
302
+ """cfg for the multitask criterion"""
303
+ if self.decoder_type == "ctc":
304
+ from fairseq.criterions.ctc import CtcCriterionConfig
305
+
306
+ cfg = CtcCriterionConfig
307
+ cfg.zero_infinity = self.config.get("zero_infinity", True)
308
+ else:
309
+ from fairseq.criterions.label_smoothed_cross_entropy import (
310
+ LabelSmoothedCrossEntropyCriterionConfig,
311
+ )
312
+
313
+ cfg = LabelSmoothedCrossEntropyCriterionConfig
314
+ cfg.label_smoothing = self.config.get("label_smoothing", 0.2)
315
+ return cfg
316
+
317
+ @property
318
+ def input_from(self):
319
+ """Condition on encoder/decoder of the main model"""
320
+ return "decoder" if "decoder_layer" in self.config else "encoder"
321
+
322
+ @property
323
+ def input_layer(self):
324
+ if self.input_from == "decoder":
325
+ return self.config["decoder_layer"] - 1
326
+ else:
327
+ # default using the output from the last encoder layer (-1)
328
+ return self.config.get("encoder_layer", 0) - 1
329
+
330
+ @property
331
+ def loss_weight_schedule(self):
332
+ return (
333
+ "decay"
334
+ if "loss_weight_max" in self.config
335
+ and "loss_weight_decay_steps" in self.config
336
+ else "fixed"
337
+ )
338
+
339
+ def get_loss_weight(self, num_updates):
340
+ if self.loss_weight_schedule == "fixed":
341
+ weight = self.config.get("loss_weight", 1.0)
342
+ else: # "decay"
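+ # linear decay from loss_weight_max toward loss_weight_min over loss_weight_decay_steps updates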
343
+ assert (
344
+ self.config.get("loss_weight_decay_steps", 0) > 0
345
+ ), "loss_weight_decay_steps must be greater than 0 for a decay schedule"
346
+ loss_weight_min = self.config.get("loss_weight_min", 0.0001)
347
+ loss_weight_decay_stepsize = (
348
+ self.config["loss_weight_max"] - loss_weight_min
349
+ ) / self.config["loss_weight_decay_steps"]
350
+ weight = max(
351
+ self.config["loss_weight_max"]
352
+ - loss_weight_decay_stepsize * num_updates,
353
+ loss_weight_min,
354
+ )
355
+ return weight
356
+
357
+ @property
358
+ def prepend_bos_and_append_tgt_lang_tag(self) -> bool:
359
+ """Prepend BOS and append target lang ID token to the target (e.g. mBART with language token pretraining)."""
360
+ return self.config.get("prepend_bos_and_append_tgt_lang_tag", False)
361
+
362
+ @property
363
+ def eos_token(self):
364
+ """EOS token during generation"""
365
+ return self.config.get("eos_token", "<eos>")
366
+
367
+ @property
368
+ def rdrop_alpha(self):
369
+ return self.config.get("rdrop_alpha", 0.0)
370
+
371
+ @property
372
+ def is_first_pass_decoder(self):
373
+ flag = self.config.get("is_first_pass_decoder", False)
374
+ if flag:
375
+ if self.decoder_type == "ctc":
376
+ raise ValueError(
377
+ "First-pass decoder in the multi-decoder model must not be CTC."
378
+ )
379
+ if "target" not in self.task_name:
380
+ raise Warning(
381
+ 'The name of the first-pass decoder does not include "target".'
382
+ )
383
+ return flag
384
+
385
+ @property
386
+ def get_lang_tag_mapping(self):
387
+ return self.config.get("lang_tag_mapping", {})
modules/voice_conversion/fairseq/data/audio/dataset_transforms/__init__.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ from fairseq.data.audio import (
3
+ AudioTransform,
4
+ CompositeAudioTransform,
5
+ import_transforms,
6
+ register_audio_transform,
7
+ )
8
+
9
+
10
+ class AudioDatasetTransform(AudioTransform):
11
+ pass
12
+
13
+
14
+ AUDIO_DATASET_TRANSFORM_REGISTRY = {}
15
+ AUDIO_DATASET_TRANSFORM_CLASS_NAMES = set()
16
+
17
+
18
+ def get_audio_dataset_transform(name):
19
+ return AUDIO_DATASET_TRANSFORM_REGISTRY[name]
20
+
21
+
22
+ def register_audio_dataset_transform(name):
23
+ return register_audio_transform(
24
+ name,
25
+ AudioDatasetTransform,
26
+ AUDIO_DATASET_TRANSFORM_REGISTRY,
27
+ AUDIO_DATASET_TRANSFORM_CLASS_NAMES,
28
+ )
29
+
30
+
31
+ import_transforms(os.path.dirname(__file__), "dataset")
32
+
33
+
34
+ class CompositeAudioDatasetTransform(CompositeAudioTransform):
35
+ @classmethod
36
+ def from_config_dict(cls, config=None):
37
+ return super()._from_config_dict(
38
+ cls,
39
+ "dataset",
40
+ get_audio_dataset_transform,
41
+ CompositeAudioDatasetTransform,
42
+ config,
43
+ return_empty=True,
44
+ )
45
+
46
+ def get_transform(self, cls):
47
+ for t in self.transforms:
48
+ if isinstance(t, cls):
49
+ return t
50
+ return None
51
+
52
+ def has_transform(self, cls):
53
+ return self.get_transform(cls) is not None
modules/voice_conversion/fairseq/data/audio/dataset_transforms/concataugment.py ADDED
@@ -0,0 +1,61 @@
1
+ from typing import List
2
+ import numpy as np
3
+
4
+ from fairseq.data.audio.dataset_transforms import (
5
+ AudioDatasetTransform,
6
+ register_audio_dataset_transform,
7
+ )
8
+
9
+ _DEFAULTS = {"rate": 0.25, "max_tokens": 3000, "attempts": 5}
10
+
11
+
12
+ @register_audio_dataset_transform("concataugment")
13
+ class ConcatAugment(AudioDatasetTransform):
14
+ @classmethod
15
+ def from_config_dict(cls, config=None):
16
+ _config = {} if config is None else config
17
+ return ConcatAugment(
18
+ _config.get("rate", _DEFAULTS["rate"]),
19
+ _config.get("max_tokens", _DEFAULTS["max_tokens"]),
20
+ _config.get("attempts", _DEFAULTS["attempts"]),
21
+ )
22
+
23
+ def __init__(
24
+ self,
25
+ rate=_DEFAULTS["rate"],
26
+ max_tokens=_DEFAULTS["max_tokens"],
27
+ attempts=_DEFAULTS["attempts"],
28
+ ):
29
+ self.rate, self.max_tokens, self.attempts = rate, max_tokens, attempts
30
+
31
+ def __repr__(self):
32
+ return (
33
+ self.__class__.__name__
34
+ + "("
35
+ + ", ".join(
36
+ [
37
+ f"rate={self.rate}",
38
+ f"max_tokens={self.max_tokens}",
39
+ f"attempts={self.attempts}",
40
+ ]
41
+ )
42
+ + ")"
43
+ )
44
+
45
+ def find_indices(self, index: int, n_frames: List[int], n_samples: int):
46
+ # skip conditions: application rate, max_tokens limit exceeded
47
+ if np.random.random() > self.rate:
48
+ return [index]
49
+ if self.max_tokens and n_frames[index] > self.max_tokens:
50
+ return [index]
51
+
52
+ # pick second sample to concatenate
53
+ for _ in range(self.attempts):
54
+ index2 = np.random.randint(0, n_samples)
55
+ if index2 != index and (
56
+ not self.max_tokens
57
+ or n_frames[index] + n_frames[index2] < self.max_tokens
58
+ ):
59
+ return [index, index2]
60
+
61
+ return [index]
modules/voice_conversion/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py ADDED
@@ -0,0 +1,105 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+ from fairseq.data.audio import rand_uniform
5
+ from fairseq.data.audio.dataset_transforms import (
6
+ AudioDatasetTransform,
7
+ register_audio_dataset_transform,
8
+ )
9
+ from fairseq.data.audio.waveform_transforms.noiseaugment import (
10
+ NoiseAugmentTransform,
11
+ )
12
+
13
+ _DEFAULTS = {
14
+ "rate": 0.25,
15
+ "mixing_noise_rate": 0.1,
16
+ "noise_path": "",
17
+ "noise_snr_min": -5,
18
+ "noise_snr_max": 5,
19
+ "utterance_snr_min": -5,
20
+ "utterance_snr_max": 5,
21
+ }
22
+
23
+
24
+ @register_audio_dataset_transform("noisyoverlapaugment")
25
+ class NoisyOverlapAugment(AudioDatasetTransform):
26
+ @classmethod
27
+ def from_config_dict(cls, config=None):
28
+ _config = {} if config is None else config
29
+ return NoisyOverlapAugment(
30
+ _config.get("rate", _DEFAULTS["rate"]),
31
+ _config.get("mixing_noise_rate", _DEFAULTS["mixing_noise_rate"]),
32
+ _config.get("noise_path", _DEFAULTS["noise_path"]),
33
+ _config.get("noise_snr_min", _DEFAULTS["noise_snr_min"]),
34
+ _config.get("noise_snr_max", _DEFAULTS["noise_snr_max"]),
35
+ _config.get("utterance_snr_min", _DEFAULTS["utterance_snr_min"]),
36
+ _config.get("utterance_snr_max", _DEFAULTS["utterance_snr_max"]),
37
+ )
38
+
39
+ def __init__(
40
+ self,
41
+ rate=_DEFAULTS["rate"],
42
+ mixing_noise_rate=_DEFAULTS["mixing_noise_rate"],
43
+ noise_path=_DEFAULTS["noise_path"],
44
+ noise_snr_min=_DEFAULTS["noise_snr_min"],
45
+ noise_snr_max=_DEFAULTS["noise_snr_max"],
46
+ utterance_snr_min=_DEFAULTS["utterance_snr_min"],
47
+ utterance_snr_max=_DEFAULTS["utterance_snr_max"],
48
+ ):
49
+ self.rate = rate
50
+ self.mixing_noise_rate = mixing_noise_rate
51
+ self.noise_shaper = NoiseAugmentTransform(noise_path)
52
+ self.noise_snr_min = noise_snr_min
53
+ self.noise_snr_max = noise_snr_max
54
+ self.utterance_snr_min = utterance_snr_min
55
+ self.utterance_snr_max = utterance_snr_max
56
+
57
+ def __repr__(self):
58
+ return (
59
+ self.__class__.__name__
60
+ + "("
61
+ + ", ".join(
62
+ [
63
+ f"rate={self.rate}",
64
+ f"mixing_noise_rate={self.mixing_noise_rate}",
65
+ f"noise_snr_min={self.noise_snr_min}",
66
+ f"noise_snr_max={self.noise_snr_max}",
67
+ f"utterance_snr_min={self.utterance_snr_min}",
68
+ f"utterance_snr_max={self.utterance_snr_max}",
69
+ ]
70
+ )
71
+ + ")"
72
+ )
73
+
74
+ def __call__(self, sources):
75
+ for i, source in enumerate(sources):
76
+ if np.random.random() > self.rate:
77
+ continue
78
+
79
+ pri = source.numpy()
80
+
81
+ if np.random.random() > self.mixing_noise_rate:
82
+ sec = sources[np.random.randint(0, len(sources))].numpy()
83
+ snr = rand_uniform(self.utterance_snr_min, self.utterance_snr_max)
84
+ else:
85
+ sec = self.noise_shaper.pick_sample(source.shape)
86
+ snr = rand_uniform(self.noise_snr_min, self.noise_snr_max)
87
+
88
+ L1 = pri.shape[-1]
89
+ L2 = sec.shape[-1]
90
+ l = np.random.randint(0, min(round(L1 / 2), L2)) # mix len
91
+ s_source = np.random.randint(0, L1 - l)
92
+ s_sec = np.random.randint(0, L2 - l)
93
+
94
+ get_power = lambda x: np.mean(x**2)
95
+ if get_power(sec) == 0:
96
+ continue
97
+
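+ # scale the secondary signal so that the primary-to-secondary power ratio matches the sampled SNR (dB)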
98
+ scl = np.sqrt(get_power(pri) / (np.power(10, snr / 10) * get_power(sec)))
99
+
100
+ pri[s_source : s_source + l] = np.add(
101
+ pri[s_source : s_source + l], np.multiply(scl, sec[s_sec : s_sec + l])
102
+ )
103
+ sources[i] = torch.from_numpy(pri).float()
104
+
105
+ return sources
modules/voice_conversion/fairseq/data/audio/feature_transforms/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ import os
2
+ from fairseq.data.audio import (
3
+ AudioTransform,
4
+ CompositeAudioTransform,
5
+ import_transforms,
6
+ register_audio_transform,
7
+ )
8
+
9
+
10
+ class AudioFeatureTransform(AudioTransform):
11
+ pass
12
+
13
+
14
+ AUDIO_FEATURE_TRANSFORM_REGISTRY = {}
15
+ AUDIO_FEATURE_TRANSFORM_CLASS_NAMES = set()
16
+
17
+
18
+ def get_audio_feature_transform(name):
19
+ return AUDIO_FEATURE_TRANSFORM_REGISTRY[name]
20
+
21
+
22
+ def register_audio_feature_transform(name):
23
+ return register_audio_transform(
24
+ name,
25
+ AudioFeatureTransform,
26
+ AUDIO_FEATURE_TRANSFORM_REGISTRY,
27
+ AUDIO_FEATURE_TRANSFORM_CLASS_NAMES,
28
+ )
29
+
30
+
31
+ import_transforms(os.path.dirname(__file__), "feature")
32
+
33
+
34
+ class CompositeAudioFeatureTransform(CompositeAudioTransform):
35
+ @classmethod
36
+ def from_config_dict(cls, config=None):
37
+ return super()._from_config_dict(
38
+ cls,
39
+ "feature",
40
+ get_audio_feature_transform,
41
+ CompositeAudioFeatureTransform,
42
+ config,
43
+ )
modules/voice_conversion/fairseq/data/audio/feature_transforms/delta_deltas.py ADDED
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+ import torch
3
+ from fairseq.data.audio.feature_transforms import (
4
+ AudioFeatureTransform,
5
+ register_audio_feature_transform,
6
+ )
7
+
8
+
9
+ @register_audio_feature_transform("delta_deltas")
10
+ class DeltaDeltas(AudioFeatureTransform):
11
+ """Expand delta-deltas features from spectrum."""
12
+
13
+ @classmethod
14
+ def from_config_dict(cls, config=None):
15
+ _config = {} if config is None else config
16
+ return DeltaDeltas(_config.get("win_length", 5))
17
+
18
+ def __init__(self, win_length=5):
19
+ self.win_length = win_length
20
+
21
+ def __repr__(self):
22
+ return self.__class__.__name__
23
+
24
+ def __call__(self, spectrogram):
25
+ from torchaudio.functional import compute_deltas
26
+
27
+ assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor."
28
+ # spectrogram is T x F, while compute_deltas takes (…, F, T)
29
+ spectrogram = torch.from_numpy(spectrogram).transpose(0, 1)
30
+ delta = compute_deltas(spectrogram)
31
+ delta_delta = compute_deltas(delta)
32
+
33
+ out_feat = np.concatenate(
34
+ [spectrogram, delta.numpy(), delta_delta.numpy()], axis=0
35
+ )
36
+ out_feat = np.transpose(out_feat)
37
+ return out_feat
modules/voice_conversion/fairseq/data/audio/feature_transforms/global_cmvn.py ADDED
@@ -0,0 +1,29 @@
1
+ import numpy as np
2
+ from fairseq.data.audio.feature_transforms import (
3
+ AudioFeatureTransform,
4
+ register_audio_feature_transform,
5
+ )
6
+
7
+
8
+ @register_audio_feature_transform("global_cmvn")
9
+ class GlobalCMVN(AudioFeatureTransform):
10
+ """Global CMVN (cepstral mean and variance normalization). The global mean
11
+ and variance need to be pre-computed and stored in NumPy format (.npz)."""
12
+
13
+ @classmethod
14
+ def from_config_dict(cls, config=None):
15
+ _config = {} if config is None else config
16
+ return GlobalCMVN(_config.get("stats_npz_path"))
17
+
18
+ def __init__(self, stats_npz_path):
19
+ self.stats_npz_path = stats_npz_path
20
+ stats = np.load(stats_npz_path)
21
+ self.mean, self.std = stats["mean"], stats["std"]
22
+
23
+ def __repr__(self):
24
+ return self.__class__.__name__ + f'(stats_npz_path="{self.stats_npz_path}")'
25
+
26
+ def __call__(self, x):
27
+ x = np.subtract(x, self.mean)
28
+ x = np.divide(x, self.std)
29
+ return x
modules/voice_conversion/fairseq/data/audio/feature_transforms/specaugment.py ADDED
@@ -0,0 +1,131 @@
1
+ import math
2
+ import numbers
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ from fairseq.data.audio.feature_transforms import (
7
+ AudioFeatureTransform,
8
+ register_audio_feature_transform,
9
+ )
10
+
11
+
12
+ @register_audio_feature_transform("specaugment")
13
+ class SpecAugmentTransform(AudioFeatureTransform):
14
+ """SpecAugment (https://arxiv.org/abs/1904.08779)"""
15
+
16
+ @classmethod
17
+ def from_config_dict(cls, config=None):
18
+ _config = {} if config is None else config
19
+ return SpecAugmentTransform(
20
+ _config.get("time_warp_W", 0),
21
+ _config.get("freq_mask_N", 0),
22
+ _config.get("freq_mask_F", 0),
23
+ _config.get("time_mask_N", 0),
24
+ _config.get("time_mask_T", 0),
25
+ _config.get("time_mask_p", 0.0),
26
+ _config.get("mask_value", None),
27
+ )
28
+
29
+ def __init__(
30
+ self,
31
+ time_warp_w: int = 0,
32
+ freq_mask_n: int = 0,
33
+ freq_mask_f: int = 0,
34
+ time_mask_n: int = 0,
35
+ time_mask_t: int = 0,
36
+ time_mask_p: float = 0.0,
37
+ mask_value: Optional[float] = 0.0,
38
+ ):
39
+ # Sanity checks
40
+ assert mask_value is None or isinstance(
41
+ mask_value, numbers.Number
42
+ ), f"mask_value (type: {type(mask_value)}) must be None or a number"
43
+ if freq_mask_n > 0:
44
+ assert freq_mask_f > 0, (
45
+ f"freq_mask_F ({freq_mask_f}) "
46
+ f"must be larger than 0 when doing freq masking."
47
+ )
48
+ if time_mask_n > 0:
49
+ assert time_mask_t > 0, (
50
+ f"time_mask_T ({time_mask_t}) must be larger than 0 when "
51
+ f"doing time masking."
52
+ )
53
+
54
+ self.time_warp_w = time_warp_w
55
+ self.freq_mask_n = freq_mask_n
56
+ self.freq_mask_f = freq_mask_f
57
+ self.time_mask_n = time_mask_n
58
+ self.time_mask_t = time_mask_t
59
+ self.time_mask_p = time_mask_p
60
+ self.mask_value = mask_value
61
+
62
+ def __repr__(self):
63
+ return (
64
+ self.__class__.__name__
65
+ + "("
66
+ + ", ".join(
67
+ [
68
+ f"time_warp_w={self.time_warp_w}",
69
+ f"freq_mask_n={self.freq_mask_n}",
70
+ f"freq_mask_f={self.freq_mask_f}",
71
+ f"time_mask_n={self.time_mask_n}",
72
+ f"time_mask_t={self.time_mask_t}",
73
+ f"time_mask_p={self.time_mask_p}",
74
+ ]
75
+ )
76
+ + ")"
77
+ )
78
+
79
+ def __call__(self, spectrogram):
80
+ assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor."
81
+
82
+ distorted = spectrogram.copy() # make a copy of input spectrogram.
83
+ num_frames = spectrogram.shape[0] # or 'tau' in the paper.
84
+ num_freqs = spectrogram.shape[1] # or 'nu' in the paper.
85
+ mask_value = self.mask_value
86
+
87
+ if mask_value is None: # if no value was specified, use local mean.
88
+ mask_value = spectrogram.mean()
89
+
90
+ if num_frames == 0:
91
+ return spectrogram
92
+
93
+ if num_freqs < self.freq_mask_f:
94
+ return spectrogram
95
+
96
+ if self.time_warp_w > 0:
97
+ if 2 * self.time_warp_w < num_frames:
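+ # time warping: split at a random frame w0 and stretch/compress the two pieces by w frames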
98
+ import cv2
99
+
100
+ w0 = np.random.randint(self.time_warp_w, num_frames - self.time_warp_w)
101
+ w = np.random.randint(-self.time_warp_w + 1, self.time_warp_w)
102
+ upper, lower = distorted[:w0, :], distorted[w0:, :]
103
+ upper = cv2.resize(
104
+ upper, dsize=(num_freqs, w0 + w), interpolation=cv2.INTER_LINEAR
105
+ )
106
+ lower = cv2.resize(
107
+ lower,
108
+ dsize=(num_freqs, num_frames - w0 - w),
109
+ interpolation=cv2.INTER_LINEAR,
110
+ )
111
+ distorted = np.concatenate((upper, lower), axis=0)
112
+
113
+ for _i in range(self.freq_mask_n):
114
+ f = np.random.randint(0, self.freq_mask_f)
115
+ f0 = np.random.randint(0, num_freqs - f)
116
+ if f != 0:
117
+ distorted[:, f0 : f0 + f] = mask_value
118
+
119
+ max_time_mask_t = min(
120
+ self.time_mask_t, math.floor(num_frames * self.time_mask_p)
121
+ )
122
+ if max_time_mask_t < 1:
123
+ return distorted
124
+
125
+ for _i in range(self.time_mask_n):
126
+ t = np.random.randint(0, max_time_mask_t)
127
+ t0 = np.random.randint(0, num_frames - t)
128
+ if t != 0:
129
+ distorted[t0 : t0 + t, :] = mask_value
130
+
131
+ return distorted
modules/voice_conversion/fairseq/data/audio/feature_transforms/utterance_cmvn.py ADDED
@@ -0,0 +1,41 @@
1
+ import numpy as np
2
+
3
+ from fairseq.data.audio.feature_transforms import (
4
+ AudioFeatureTransform,
5
+ register_audio_feature_transform,
6
+ )
7
+
8
+
9
+ @register_audio_feature_transform("utterance_cmvn")
10
+ class UtteranceCMVN(AudioFeatureTransform):
11
+ """Utterance-level CMVN (cepstral mean and variance normalization)"""
12
+
13
+ @classmethod
14
+ def from_config_dict(cls, config=None):
15
+ _config = {} if config is None else config
16
+ return UtteranceCMVN(
17
+ _config.get("norm_means", True),
18
+ _config.get("norm_vars", True),
19
+ )
20
+
21
+ def __init__(self, norm_means=True, norm_vars=True):
22
+ self.norm_means, self.norm_vars = norm_means, norm_vars
23
+
24
+ def __repr__(self):
25
+ return (
26
+ self.__class__.__name__
27
+ + f"(norm_means={self.norm_means}, norm_vars={self.norm_vars})"
28
+ )
29
+
30
+ def __call__(self, x):
31
+ mean = x.mean(axis=0)
32
+ square_sums = (x**2).sum(axis=0)
33
+
34
+ if self.norm_means:
35
+ x = np.subtract(x, mean)
36
+ if self.norm_vars:
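+ # variance via E[x^2] - E[x]^2, floored at 1e-10 before taking the square root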
37
+ var = square_sums / x.shape[0] - mean**2
38
+ std = np.sqrt(np.maximum(var, 1e-10))
39
+ x = np.divide(x, std)
40
+
41
+ return x
modules/voice_conversion/fairseq/data/audio/frm_text_to_speech_dataset.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) 2017-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the LICENSE file in
5
+ # the root directory of this source tree. An additional grant of patent rights
6
+ # can be found in the PATENTS file in the same directory.
7
+
8
+ import csv
9
+ import logging
10
+ import os.path as op
11
+ from typing import List, Optional
12
+
13
+ import numpy as np
14
+ import torch
15
+ from fairseq.data import Dictionary
16
+ from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig
17
+ from fairseq.data.audio.text_to_speech_dataset import (
18
+ TextToSpeechDataset,
19
+ TextToSpeechDatasetCreator,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class FrmTextToSpeechDataset(TextToSpeechDataset):
26
+ def __init__(
27
+ self,
28
+ split: str,
29
+ is_train_split: bool,
30
+ data_cfg: S2TDataConfig,
31
+ audio_paths: List[str],
32
+ n_frames: List[int],
33
+ src_texts: Optional[List[str]] = None,
34
+ tgt_texts: Optional[List[str]] = None,
35
+ speakers: Optional[List[str]] = None,
36
+ src_langs: Optional[List[str]] = None,
37
+ tgt_langs: Optional[List[str]] = None,
38
+ ids: Optional[List[str]] = None,
39
+ tgt_dict: Optional[Dictionary] = None,
40
+ pre_tokenizer=None,
41
+ bpe_tokenizer=None,
42
+ n_frames_per_step=1,
43
+ speaker_to_id=None,
44
+ do_chunk=False,
45
+ chunk_bound=-1,
46
+ chunk_init=50,
47
+ chunk_incr=5,
48
+ add_eos=True,
49
+ dedup=True,
50
+ ref_fpu=-1,
51
+ ):
52
+ # It assumes texts are encoded at a fixed frame-rate
53
+ super().__init__(
54
+ split=split,
55
+ is_train_split=is_train_split,
56
+ data_cfg=data_cfg,
57
+ audio_paths=audio_paths,
58
+ n_frames=n_frames,
59
+ src_texts=src_texts,
60
+ tgt_texts=tgt_texts,
61
+ speakers=speakers,
62
+ src_langs=src_langs,
63
+ tgt_langs=tgt_langs,
64
+ ids=ids,
65
+ tgt_dict=tgt_dict,
66
+ pre_tokenizer=pre_tokenizer,
67
+ bpe_tokenizer=bpe_tokenizer,
68
+ n_frames_per_step=n_frames_per_step,
69
+ speaker_to_id=speaker_to_id,
70
+ )
71
+
72
+ self.do_chunk = do_chunk
73
+ self.chunk_bound = chunk_bound
74
+ self.chunk_init = chunk_init
75
+ self.chunk_incr = chunk_incr
76
+ self.add_eos = add_eos
77
+ self.dedup = dedup
78
+ self.ref_fpu = ref_fpu
79
+
80
+ self.chunk_size = -1
81
+
82
+ if do_chunk:
83
+ assert self.chunk_incr >= 0
84
+ assert self.pre_tokenizer is None
85
+
86
+ def __getitem__(self, index):
87
+ index, source, target, speaker_id, _, _, _ = super().__getitem__(index)
88
+ if target[-1].item() == self.tgt_dict.eos_index:
89
+ target = target[:-1]
90
+
91
+ fpu = source.size(0) / target.size(0) # frame-per-unit
92
+ fps = self.n_frames_per_step
93
+ assert (
94
+ self.ref_fpu == -1 or abs((fpu * fps - self.ref_fpu) / self.ref_fpu) < 0.1
95
+ ), f"{fpu*fps} != {self.ref_fpu}"
96
+
97
+ # only chunk training split
98
+ if self.is_train_split and self.do_chunk and self.chunk_size > 0:
99
+ lang = target[: int(self.data_cfg.prepend_tgt_lang_tag)]
100
+ text = target[int(self.data_cfg.prepend_tgt_lang_tag) :]
101
+ size = len(text)
102
+ chunk_size = min(self.chunk_size, size)
103
+ chunk_start = np.random.randint(size - chunk_size + 1)
104
+ text = text[chunk_start : chunk_start + chunk_size]
105
+ target = torch.cat((lang, text), 0)
106
+
107
+ f_size = int(np.floor(chunk_size * fpu))
108
+ f_start = int(np.floor(chunk_start * fpu))
109
+ assert f_size > 0
110
+ source = source[f_start : f_start + f_size, :]
111
+
112
+ if self.dedup:
113
+ target = torch.unique_consecutive(target)
114
+
115
+ if self.add_eos:
116
+ eos_idx = self.tgt_dict.eos_index
117
+ target = torch.cat((target, torch.LongTensor([eos_idx])), 0)
118
+
119
+ return index, source, target, speaker_id
120
+
121
+ def set_epoch(self, epoch):
122
+ if self.is_train_split and self.do_chunk:
123
+ old = self.chunk_size
124
+ self.chunk_size = self.chunk_init + epoch * self.chunk_incr
125
+ if self.chunk_bound > 0:
126
+ self.chunk_size = min(self.chunk_size, self.chunk_bound)
127
+ logger.info(
128
+ (
129
+ f"{self.split}: setting chunk size "
130
+ f"from {old} to {self.chunk_size}"
131
+ )
132
+ )
133
+
134
+
135
+ class FrmTextToSpeechDatasetCreator(TextToSpeechDatasetCreator):
136
+ # inherit for key names
137
+ @classmethod
138
+ def from_tsv(
139
+ cls,
140
+ root: str,
141
+ data_cfg: S2TDataConfig,
142
+ split: str,
143
+ tgt_dict,
144
+ pre_tokenizer,
145
+ bpe_tokenizer,
146
+ is_train_split: bool,
147
+ n_frames_per_step: int,
148
+ speaker_to_id,
149
+ do_chunk: bool = False,
150
+ chunk_bound: int = -1,
151
+ chunk_init: int = 50,
152
+ chunk_incr: int = 5,
153
+ add_eos: bool = True,
154
+ dedup: bool = True,
155
+ ref_fpu: float = -1,
156
+ ) -> FrmTextToSpeechDataset:
157
+ tsv_path = op.join(root, f"{split}.tsv")
158
+ if not op.isfile(tsv_path):
159
+ raise FileNotFoundError(f"Dataset not found: {tsv_path}")
160
+ with open(tsv_path) as f:
161
+ reader = csv.DictReader(
162
+ f,
163
+ delimiter="\t",
164
+ quotechar=None,
165
+ doublequote=False,
166
+ lineterminator="\n",
167
+ quoting=csv.QUOTE_NONE,
168
+ )
169
+ s = [dict(e) for e in reader]
170
+ assert len(s) > 0
171
+
172
+ ids = [ss[cls.KEY_ID] for ss in s]
173
+ audio_paths = [op.join(data_cfg.audio_root, ss[cls.KEY_AUDIO]) for ss in s]
174
+ n_frames = [int(ss[cls.KEY_N_FRAMES]) for ss in s]
175
+ tgt_texts = [ss[cls.KEY_TGT_TEXT] for ss in s]
176
+ src_texts = [ss.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for ss in s]
177
+ speakers = [ss.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for ss in s]
178
+ src_langs = [ss.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for ss in s]
179
+ tgt_langs = [ss.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for ss in s]
180
+
181
+ return FrmTextToSpeechDataset(
182
+ split=split,
183
+ is_train_split=is_train_split,
184
+ data_cfg=data_cfg,
185
+ audio_paths=audio_paths,
186
+ n_frames=n_frames,
187
+ src_texts=src_texts,
188
+ tgt_texts=tgt_texts,
189
+ speakers=speakers,
190
+ src_langs=src_langs,
191
+ tgt_langs=tgt_langs,
192
+ ids=ids,
193
+ tgt_dict=tgt_dict,
194
+ pre_tokenizer=pre_tokenizer,
195
+ bpe_tokenizer=bpe_tokenizer,
196
+ n_frames_per_step=n_frames_per_step,
197
+ speaker_to_id=speaker_to_id,
198
+ do_chunk=do_chunk,
199
+ chunk_bound=chunk_bound,
200
+ chunk_init=chunk_init,
201
+ chunk_incr=chunk_incr,
202
+ add_eos=add_eos,
203
+ dedup=dedup,
204
+ ref_fpu=ref_fpu,
205
+ )
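The chunking curriculum applied by set_epoch above grows the target chunk size linearly with the epoch and clips it at chunk_bound; a tiny standalone sketch of that schedule (the numbers below are arbitrary example values, not defaults read from any config in this repo):

def chunk_size_for_epoch(epoch, chunk_init=50, chunk_incr=5, chunk_bound=120):
    # mirrors FrmTextToSpeechDataset.set_epoch: linear growth, optionally clipped
    size = chunk_init + epoch * chunk_incr
    if chunk_bound > 0:
        size = min(size, chunk_bound)
    return size

print([chunk_size_for_epoch(e) for e in range(0, 20, 4)])  # [50, 70, 90, 110, 120]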
modules/voice_conversion/fairseq/data/audio/hubert_dataset.py ADDED
@@ -0,0 +1,356 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import logging
8
+ import os
9
+ import sys
10
+ from typing import Any, List, Optional, Union
11
+
12
+ import numpy as np
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from fairseq.data import data_utils
17
+ from fairseq.data.fairseq_dataset import FairseqDataset
18
+ from fairseq.data.audio.audio_utils import (
19
+ parse_path,
20
+ read_from_stored_zip,
21
+ )
22
+ import io
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ def load_audio(manifest_path, max_keep, min_keep):
28
+ n_long, n_short = 0, 0
29
+ names, inds, sizes = [], [], []
30
+ with open(manifest_path) as f:
31
+ root = f.readline().strip()
32
+ for ind, line in enumerate(f):
33
+ items = line.strip().split("\t")
34
+ assert len(items) == 2, line
35
+ sz = int(items[1])
36
+ if min_keep is not None and sz < min_keep:
37
+ n_short += 1
38
+ elif max_keep is not None and sz > max_keep:
39
+ n_long += 1
40
+ else:
41
+ names.append(items[0])
42
+ inds.append(ind)
43
+ sizes.append(sz)
44
+ tot = ind + 1
45
+ logger.info(
46
+ (
47
+ f"max_keep={max_keep}, min_keep={min_keep}, "
48
+ f"loaded {len(names)}, skipped {n_short} short and {n_long} long, "
49
+ f"longest-loaded={max(sizes)}, shortest-loaded={min(sizes)}"
50
+ )
51
+ )
52
+ return root, names, inds, tot, sizes
53
+
54
+
55
+ def load_label(label_path, inds, tot):
56
+ with open(label_path) as f:
57
+ labels = [line.rstrip() for line in f]
58
+ assert (
59
+ len(labels) == tot
60
+ ), f"number of labels does not match ({len(labels)} != {tot})"
61
+ labels = [labels[i] for i in inds]
62
+ return labels
63
+
64
+
65
+ def load_label_offset(label_path, inds, tot):
66
+ with open(label_path) as f:
67
+ code_lengths = [len(line.encode("utf-8")) for line in f]
68
+ assert (
69
+ len(code_lengths) == tot
70
+ ), f"number of labels does not match ({len(code_lengths)} != {tot})"
71
+ offsets = list(itertools.accumulate([0] + code_lengths))
72
+ offsets = [(offsets[i], offsets[i + 1]) for i in inds]
73
+ return offsets
74
+
75
+
76
+ def verify_label_lengths(
77
+ audio_sizes,
78
+ audio_rate,
79
+ label_path,
80
+ label_rate,
81
+ inds,
82
+ tot,
83
+ tol=0.1, # tolerance in seconds
84
+ ):
85
+ if label_rate < 0:
86
+ logger.info(f"{label_path} is sequence label. skipped")
87
+ return
88
+
89
+ with open(label_path) as f:
90
+ lengths = [len(line.rstrip().split()) for line in f]
91
+ assert len(lengths) == tot
92
+ lengths = [lengths[i] for i in inds]
93
+ num_invalid = 0
94
+ for i, ind in enumerate(inds):
95
+ dur_from_audio = audio_sizes[i] / audio_rate
96
+ dur_from_label = lengths[i] / label_rate
97
+ if abs(dur_from_audio - dur_from_label) > tol:
98
+ logger.warning(
99
+ (
100
+ f"audio and label duration differ too much "
101
+ f"(|{dur_from_audio} - {dur_from_label}| > {tol}) "
102
+ f"in line {ind+1} of {label_path}. Check if `label_rate` "
103
+ f"is correctly set (currently {label_rate}). "
104
+ f"num. of samples = {audio_sizes[i]}; "
105
+ f"label length = {lengths[i]}"
106
+ )
107
+ )
108
+ num_invalid += 1
109
+ if num_invalid > 0:
110
+ logger.warning(
111
+ f"total {num_invalid} (audio, label) pairs with mismatched lengths"
112
+ )
113
+
114
+
115
+ class HubertDataset(FairseqDataset):
116
+ def __init__(
117
+ self,
118
+ manifest_path: str,
119
+ sample_rate: float,
120
+ label_paths: List[str],
121
+ label_rates: Union[List[float], float], # -1 for sequence labels
122
+ pad_list: List[str],
123
+ eos_list: List[str],
124
+ label_processors: Optional[List[Any]] = None,
125
+ max_keep_sample_size: Optional[int] = None,
126
+ min_keep_sample_size: Optional[int] = None,
127
+ max_sample_size: Optional[int] = None,
128
+ shuffle: bool = True,
129
+ pad_audio: bool = False,
130
+ normalize: bool = False,
131
+ store_labels: bool = True,
132
+ random_crop: bool = False,
133
+ single_target: bool = False,
134
+ ):
135
+ self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio(
136
+ manifest_path, max_keep_sample_size, min_keep_sample_size
137
+ )
138
+ self.sample_rate = sample_rate
139
+ self.shuffle = shuffle
140
+ self.random_crop = random_crop
141
+
142
+ self.num_labels = len(label_paths)
143
+ self.pad_list = pad_list
144
+ self.eos_list = eos_list
145
+ self.label_processors = label_processors
146
+ self.single_target = single_target
147
+ self.label_rates = (
148
+ [label_rates for _ in range(len(label_paths))]
149
+ if isinstance(label_rates, float)
150
+ else label_rates
151
+ )
152
+ self.store_labels = store_labels
153
+ if store_labels:
154
+ self.label_list = [load_label(p, inds, tot) for p in label_paths]
155
+ else:
156
+ self.label_paths = label_paths
157
+ self.label_offsets_list = [
158
+ load_label_offset(p, inds, tot) for p in label_paths
159
+ ]
160
+ assert label_processors is None or len(label_processors) == self.num_labels
161
+ for label_path, label_rate in zip(label_paths, self.label_rates):
162
+ verify_label_lengths(
163
+ self.sizes, sample_rate, label_path, label_rate, inds, tot
164
+ )
165
+
166
+ self.max_sample_size = (
167
+ max_sample_size if max_sample_size is not None else sys.maxsize
168
+ )
169
+ self.pad_audio = pad_audio
170
+ self.normalize = normalize
171
+ logger.info(
172
+ f"pad_audio={pad_audio}, random_crop={random_crop}, "
173
+ f"normalize={normalize}, max_sample_size={self.max_sample_size}"
174
+ )
175
+
176
+ def get_audio(self, index):
177
+ import soundfile as sf
178
+
179
+ wav_path = os.path.join(self.audio_root, self.audio_names[index])
180
+ _path, slice_ptr = parse_path(wav_path)
181
+ if len(slice_ptr) == 0:
182
+ wav, cur_sample_rate = sf.read(_path)
183
+ else:
184
+ assert _path.endswith(".zip")
185
+ data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
186
+ f = io.BytesIO(data)
187
+ wav, cur_sample_rate = sf.read(f)
188
+ wav = torch.from_numpy(wav).float()
189
+ wav = self.postprocess(wav, cur_sample_rate)
190
+ return wav
191
+
192
+ def get_label(self, index, label_idx):
193
+ if self.store_labels:
194
+ label = self.label_list[label_idx][index]
195
+ else:
196
+ with open(self.label_paths[label_idx]) as f:
197
+ offset_s, offset_e = self.label_offsets_list[label_idx][index]
198
+ f.seek(offset_s)
199
+ label = f.read(offset_e - offset_s)
200
+
201
+ if self.label_processors is not None:
202
+ label = self.label_processors[label_idx](label)
203
+ return label
204
+
205
+ def get_labels(self, index):
206
+ return [self.get_label(index, i) for i in range(self.num_labels)]
207
+
208
+ def __getitem__(self, index):
209
+ wav = self.get_audio(index)
210
+ labels = self.get_labels(index)
211
+ return {"id": index, "source": wav, "label_list": labels}
212
+
213
+ def __len__(self):
214
+ return len(self.sizes)
215
+
216
+ def crop_to_max_size(self, wav, target_size):
217
+ size = len(wav)
218
+ diff = size - target_size
219
+ if diff <= 0:
220
+ return wav, 0
221
+
222
+ start, end = 0, target_size
223
+ if self.random_crop:
224
+ start = np.random.randint(0, diff + 1)
225
+ end = size - diff + start
226
+ return wav[start:end], start
227
+
228
+ def collater(self, samples):
229
+ # target = max(sizes) -> random_crop not used
230
+ # target = max_sample_size -> random_crop used for long
231
+ samples = [s for s in samples if s["source"] is not None]
232
+ if len(samples) == 0:
233
+ return {}
234
+
235
+ audios = [s["source"] for s in samples]
236
+ audio_sizes = [len(s) for s in audios]
237
+ if self.pad_audio:
238
+ audio_size = min(max(audio_sizes), self.max_sample_size)
239
+ else:
240
+ audio_size = min(min(audio_sizes), self.max_sample_size)
241
+ collated_audios, padding_mask, audio_starts = self.collater_audio(
242
+ audios, audio_size
243
+ )
244
+
245
+ targets_by_label = [
246
+ [s["label_list"][i] for s in samples] for i in range(self.num_labels)
247
+ ]
248
+ targets_list, lengths_list, ntokens_list = self.collater_label(
249
+ targets_by_label, audio_size, audio_starts
250
+ )
251
+
252
+ net_input = {"source": collated_audios, "padding_mask": padding_mask}
253
+ batch = {
254
+ "id": torch.LongTensor([s["id"] for s in samples]),
255
+ "net_input": net_input,
256
+ }
257
+
258
+ if self.single_target:
259
+ batch["target_lengths"] = lengths_list[0]
260
+ batch["ntokens"] = ntokens_list[0]
261
+ batch["target"] = targets_list[0]
262
+ else:
263
+ batch["target_lengths_list"] = lengths_list
264
+ batch["ntokens_list"] = ntokens_list
265
+ batch["target_list"] = targets_list
266
+ return batch
267
+
268
+ def collater_audio(self, audios, audio_size):
269
+ collated_audios = audios[0].new_zeros(len(audios), audio_size)
270
+ padding_mask = (
271
+ torch.BoolTensor(collated_audios.shape).fill_(False)
272
+ # if self.pad_audio else None
273
+ )
274
+ audio_starts = [0 for _ in audios]
275
+ for i, audio in enumerate(audios):
276
+ diff = len(audio) - audio_size
277
+ if diff == 0:
278
+ collated_audios[i] = audio
279
+ elif diff < 0:
280
+ assert self.pad_audio
281
+ collated_audios[i] = torch.cat([audio, audio.new_full((-diff,), 0.0)])
282
+ padding_mask[i, diff:] = True
283
+ else:
284
+ collated_audios[i], audio_starts[i] = self.crop_to_max_size(
285
+ audio, audio_size
286
+ )
287
+ return collated_audios, padding_mask, audio_starts
288
+
289
+ def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad):
290
+ assert label_rate > 0
291
+ s2f = label_rate / self.sample_rate
292
+ frm_starts = [int(round(s * s2f)) for s in audio_starts]
293
+ frm_size = int(round(audio_size * s2f))
294
+ if not self.pad_audio:
295
+ rem_size = [len(t) - s for t, s in zip(targets, frm_starts)]
296
+ frm_size = min(frm_size, *rem_size)
297
+ targets = [t[s : s + frm_size] for t, s in zip(targets, frm_starts)]
298
+ logger.debug(f"audio_starts={audio_starts}")
299
+ logger.debug(f"frame_starts={frm_starts}")
300
+ logger.debug(f"frame_size={frm_size}")
301
+
302
+ lengths = torch.LongTensor([len(t) for t in targets])
303
+ ntokens = lengths.sum().item()
304
+ targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False)
305
+ return targets, lengths, ntokens
306
+
307
+ def collater_seq_label(self, targets, pad):
308
+ lengths = torch.LongTensor([len(t) for t in targets])
309
+ ntokens = lengths.sum().item()
310
+ targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False)
311
+ return targets, lengths, ntokens
312
+
313
+ def collater_label(self, targets_by_label, audio_size, audio_starts):
314
+ targets_list, lengths_list, ntokens_list = [], [], []
315
+ itr = zip(targets_by_label, self.label_rates, self.pad_list)
316
+ for targets, label_rate, pad in itr:
317
+ if label_rate == -1.0:
318
+ targets, lengths, ntokens = self.collater_seq_label(targets, pad)
319
+ else:
320
+ targets, lengths, ntokens = self.collater_frm_label(
321
+ targets, audio_size, audio_starts, label_rate, pad
322
+ )
323
+ targets_list.append(targets)
324
+ lengths_list.append(lengths)
325
+ ntokens_list.append(ntokens)
326
+ return targets_list, lengths_list, ntokens_list
327
+
328
+ def num_tokens(self, index):
329
+ return self.size(index)
330
+
331
+ def size(self, index):
332
+ if self.pad_audio:
333
+ return self.sizes[index]
334
+ return min(self.sizes[index], self.max_sample_size)
335
+
336
+ def ordered_indices(self):
337
+ if self.shuffle:
338
+ order = [np.random.permutation(len(self))]
339
+ else:
340
+ order = [np.arange(len(self))]
341
+
342
+ order.append(self.sizes)
343
+ return np.lexsort(order)[::-1]
344
+
345
+ def postprocess(self, wav, cur_sample_rate):
346
+ if wav.dim() == 2:
347
+ wav = wav.mean(-1)
348
+ assert wav.dim() == 1, wav.dim()
349
+
350
+ if cur_sample_rate != self.sample_rate:
351
+ raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")
352
+
353
+ if self.normalize:
354
+ with torch.no_grad():
355
+ wav = F.layer_norm(wav, wav.shape)
356
+ return wav
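collater_frm_label above maps a crop expressed in audio samples onto the matching slice of frame-level labels through the sample-to-frame ratio s2f = label_rate / sample_rate; a small numeric sketch, assuming a typical 16 kHz waveform with 50 Hz labels (the rates are illustrative):

def audio_crop_to_label_slice(audio_start, audio_size, sample_rate=16000, label_rate=50):
    # mirrors the s2f mapping in HubertDataset.collater_frm_label
    s2f = label_rate / sample_rate            # label frames per audio sample
    frm_start = int(round(audio_start * s2f))
    frm_size = int(round(audio_size * s2f))
    return frm_start, frm_size

print(audio_crop_to_label_slice(8000, 40000))  # (25, 125): a 2.5 s crop starting at 0.5 s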
modules/voice_conversion/fairseq/data/audio/multi_modality_dataset.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright (c) 2021-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the LICENSE file in
5
+ # the root directory of this source tree. An additional grant of patent rights
6
+ # can be found in the PATENTS file in the same directory.
7
+
8
+ import logging
9
+ import math
10
+ from typing import List, Optional, NamedTuple
11
+
12
+ import numpy as np
13
+ from fairseq.data.resampling_dataset import ResamplingDataset
14
+ import torch
15
+ from fairseq.data import (
16
+ ConcatDataset,
17
+ LanguagePairDataset,
18
+ FileAudioDataset,
19
+ data_utils,
20
+ )
21
+ from fairseq.data import FairseqDataset
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class ModalityDatasetItem(NamedTuple):
27
+ datasetname: str
28
+ dataset: any
29
+ max_positions: List[int]
30
+ max_tokens: Optional[int] = None
31
+ max_sentences: Optional[int] = None
32
+
33
+
34
+ def resampling_dataset_present(ds):
35
+ if isinstance(ds, ResamplingDataset):
36
+ return True
37
+ if isinstance(ds, ConcatDataset):
38
+ return any(resampling_dataset_present(d) for d in ds.datasets)
39
+ if hasattr(ds, "dataset"):
40
+ return resampling_dataset_present(ds.dataset)
41
+ return False
42
+
43
+
44
+ # MultiModalityDataset: concatenates multiple datasets with different modalities.
45
+ # Compared with ConcatDataset it can 1) sample data according to per-dataset ratios and
46
+ # 2) add a mode field indicating which type of dataset each sample comes from.
47
+ # It is used together with GroupedEpochBatchIterator to generate mini-batches whose samples
48
+ # all come from the same type of dataset.
49
+ # If only one dataset is used, it behaves like the original dataset, with mode added.
50
+ class MultiModalityDataset(ConcatDataset):
51
+ def __init__(self, datasets: List[ModalityDatasetItem]):
52
+ id_to_mode = []
53
+ dsets = []
54
+ max_tokens = []
55
+ max_sentences = []
56
+ max_positions = []
57
+ for dset in datasets:
58
+ id_to_mode.append(dset.datasetname)
59
+ dsets.append(dset.dataset)
60
+ max_tokens.append(dset.max_tokens)
61
+ max_positions.append(dset.max_positions)
62
+ max_sentences.append(dset.max_sentences)
63
+ weights = [1.0 for s in dsets]
64
+ super().__init__(dsets, weights)
65
+ self.max_tokens = max_tokens
66
+ self.max_positions = max_positions
67
+ self.max_sentences = max_sentences
68
+ self.id_to_mode = id_to_mode
69
+ self.raw_sub_batch_samplers = []
70
+ self._cur_epoch = 0
71
+
72
+ def set_epoch(self, epoch):
73
+ super().set_epoch(epoch)
74
+ self._cur_epoch = epoch
75
+
76
+ def __getitem__(self, idx):
77
+ dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
78
+ sample = self.datasets[dataset_idx][sample_idx]
79
+ return (dataset_idx, sample)
80
+
81
+ def collater(self, samples):
82
+ if len(samples) == 0:
83
+ return {}
84
+ dataset_idx = samples[0][0]
85
+ # make sure all samples in samples are from same dataset
86
+ assert sum([0 if dataset_idx == s[0] else 1 for s in samples]) == 0
87
+ samples = self.datasets[dataset_idx].collater([x[1] for x in samples])
88
+ # add mode
89
+ samples["net_input"]["mode"] = self.id_to_mode[dataset_idx]
90
+
91
+ return samples
92
+
93
+ def size(self, index: int):
94
+ if len(self.datasets) == 1:
95
+ return self.datasets[0].size(index)
96
+ return super().size(index)
97
+
98
+ @property
99
+ def sizes(self):
100
+ if len(self.datasets) == 1:
101
+ return self.datasets[0].sizes
102
+ return super().sizes
103
+
104
+ def ordered_indices(self):
105
+ """
106
+ Returns indices sorted by length. So less padding is needed.
107
+ """
108
+ if len(self.datasets) == 1:
109
+ return self.datasets[0].ordered_indices()
110
+ indices_group = []
111
+ for d_idx, ds in enumerate(self.datasets):
112
+ sample_num = self.cumulative_sizes[d_idx]
113
+ if d_idx > 0:
114
+ sample_num = sample_num - self.cumulative_sizes[d_idx - 1]
115
+ assert sample_num == len(ds)
116
+ indices_group.append(ds.ordered_indices())
117
+ return indices_group
118
+
119
+ def get_raw_batch_samplers(self, required_batch_size_multiple, seed):
120
+ with data_utils.numpy_seed(seed):
121
+ indices = self.ordered_indices()
122
+ for i, ds in enumerate(self.datasets):
123
+ # If we have ResamplingDataset, the same id can correspond to a different
124
+ # sample in the next epoch, so we need to rebuild this at every epoch
125
+ if i < len(self.raw_sub_batch_samplers) and not resampling_dataset_present(
126
+ ds
127
+ ):
128
+ logger.info(f"dataset {i} is valid and it is not re-sampled")
129
+ continue
130
+ indices[i] = ds.filter_indices_by_size(
131
+ indices[i],
132
+ self.max_positions[i],
133
+ )[0]
134
+ sub_batch_sampler = ds.batch_by_size(
135
+ indices[i],
136
+ max_tokens=self.max_tokens[i],
137
+ max_sentences=self.max_sentences[i],
138
+ required_batch_size_multiple=required_batch_size_multiple,
139
+ )
140
+ if i < len(self.raw_sub_batch_samplers):
141
+ self.raw_sub_batch_samplers[i] = sub_batch_sampler
142
+ else:
143
+ self.raw_sub_batch_samplers.append(sub_batch_sampler)
144
+
145
+ def get_batch_samplers(self, mult_ratios, required_batch_size_multiple, seed):
146
+ self.get_raw_batch_samplers(required_batch_size_multiple, seed)
147
+ batch_samplers = []
148
+ for i, _ in enumerate(self.datasets):
149
+ if i > 0:
150
+ sub_batch_sampler = [
151
+ [y + self.cumulative_sizes[i - 1] for y in x]
152
+ for x in self.raw_sub_batch_samplers[i]
153
+ ]
154
+ else:
155
+ sub_batch_sampler = list(self.raw_sub_batch_samplers[i])
156
+ smp_r = mult_ratios[i]
157
+ if smp_r != 1:
158
+ is_increase = "increased" if smp_r > 1 else "decreased"
159
+ logger.info(
160
+ "number of batch for the dataset {} is {} from {} to {}".format(
161
+ self.id_to_mode[i],
162
+ is_increase,
163
+ len(sub_batch_sampler),
164
+ int(len(sub_batch_sampler) * smp_r),
165
+ )
166
+ )
167
+ mul_samplers = []
168
+ for _ in range(math.floor(smp_r)):
169
+ mul_samplers = mul_samplers + sub_batch_sampler
170
+ if math.floor(smp_r) != smp_r:
171
+ with data_utils.numpy_seed(seed + self._cur_epoch):
172
+ np.random.shuffle(sub_batch_sampler)
173
+ smp_num = int(
174
+ (smp_r - math.floor(smp_r)) * len(sub_batch_sampler)
175
+ )
176
+ mul_samplers = mul_samplers + sub_batch_sampler[:smp_num]
177
+ sub_batch_sampler = mul_samplers
178
+ else:
179
+ logger.info(
180
+ "dataset {} batch number is {} ".format(
181
+ self.id_to_mode[i], len(sub_batch_sampler)
182
+ )
183
+ )
184
+ batch_samplers.append(sub_batch_sampler)
185
+
186
+ return batch_samplers
187
+
188
+
189
+ class LangPairMaskDataset(FairseqDataset):
190
+ def __init__(
191
+ self,
192
+ dataset: LanguagePairDataset,
193
+ src_eos: int,
194
+ src_bos: Optional[int] = None,
195
+ noise_id: Optional[int] = -1,
196
+ mask_ratio: Optional[float] = 0,
197
+ mask_type: Optional[str] = "random",
198
+ ):
199
+ self.dataset = dataset
200
+ self.src_eos = src_eos
201
+ self.src_bos = src_bos
202
+ self.noise_id = noise_id
203
+ self.mask_ratio = mask_ratio
204
+ self.mask_type = mask_type
205
+ assert mask_type in ("random", "tail")
206
+
207
+ @property
208
+ def src_sizes(self):
209
+ return self.dataset.src_sizes
210
+
211
+ @property
212
+ def tgt_sizes(self):
213
+ return self.dataset.tgt_sizes
214
+
215
+ @property
216
+ def sizes(self):
217
+ # dataset.sizes can be a dynamically computed sizes:
218
+ return self.dataset.sizes
219
+
220
+ def get_batch_shapes(self):
221
+ if hasattr(self.dataset, "get_batch_shapes"):
222
+ return self.dataset.get_batch_shapes()
223
+ return self.dataset.buckets
224
+
225
+ def num_tokens_vec(self, indices):
226
+ return self.dataset.num_tokens_vec(indices)
227
+
228
+ def __len__(self):
229
+ return len(self.dataset)
230
+
231
+ def num_tokens(self, index):
232
+ return self.dataset.num_tokens(index)
233
+
234
+ def size(self, index):
235
+ return self.dataset.size(index)
236
+
237
+ def ordered_indices(self):
238
+ return self.dataset.ordered_indices()
239
+
240
+ @property
241
+ def supports_prefetch(self):
242
+ return getattr(self.dataset, "supports_prefetch", False)
243
+
244
+ def prefetch(self, indices):
245
+ return self.dataset.prefetch(indices)
246
+
247
+ def mask_src_tokens(self, sample):
248
+ src_item = sample["source"]
249
+ mask = None
250
+ if self.mask_type == "random":
251
+ mask = torch.rand(len(src_item)).le(self.mask_ratio)
252
+ else:
253
+ mask = torch.ones(len(src_item))
254
+ mask[: int(len(src_item) * (1 - self.mask_ratio))] = 0
255
+ mask = mask.eq(1)
256
+ if src_item[0] == self.src_bos:
257
+ mask[0] = False
258
+ if src_item[-1] == self.src_eos:
259
+ mask[-1] = False
260
+ mask_src_item = src_item.masked_fill(mask, self.noise_id)
261
+ smp = {"id": sample["id"], "source": mask_src_item, "target": sample["target"]}
262
+ return smp
263
+
264
+ def __getitem__(self, index):
265
+ sample = self.dataset[index]
266
+ if self.mask_ratio > 0:
267
+ sample = self.mask_src_tokens(sample)
268
+ return sample
269
+
270
+ def collater(self, samples, pad_to_length=None):
271
+ return self.dataset.collater(samples, pad_to_length)
272
+
273
+
274
+ class FileAudioDatasetWrapper(FileAudioDataset):
275
+ def collater(self, samples):
276
+ samples = super().collater(samples)
277
+ if len(samples) == 0:
278
+ return {}
279
+ samples["net_input"]["src_tokens"] = samples["net_input"]["source"]
280
+ samples["net_input"]["prev_output_tokens"] = None
281
+ del samples["net_input"]["source"]
282
+ samples["net_input"]["src_lengths"] = None
283
+ samples["net_input"]["alignment"] = None
284
+ return samples
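The ratio handling in get_batch_samplers above repeats a dataset's batch list floor(ratio) times and then appends a randomly chosen fraction of it for the remainder; a standalone sketch of just that resampling step, using dummy batch lists and Python's random module in place of the numpy-seeded shuffle used above:

import math
import random

def resample_batches(batches, ratio, seed=0):
    # repeat the whole list floor(ratio) times, then add a shuffled fractional slice
    out = []
    for _ in range(math.floor(ratio)):
        out += batches
    frac = ratio - math.floor(ratio)
    if frac > 0:
        shuffled = list(batches)
        random.Random(seed).shuffle(shuffled)
        out += shuffled[: int(frac * len(batches))]
    return out

dummy = [[0, 1], [2, 3], [4, 5], [6, 7]]   # four dummy mini-batches
print(len(resample_batches(dummy, 2.5)))   # 10 = 2 * 4 + 0.5 * 4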
modules/voice_conversion/fairseq/data/audio/raw_audio_dataset.py ADDED
@@ -0,0 +1,393 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import logging
8
+ import os
9
+ import sys
10
+ import io
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn.functional as F
15
+
16
+ from .. import FairseqDataset
17
+ from ..data_utils import compute_mask_indices, get_buckets, get_bucketed_sizes
18
+ from fairseq.data.audio.audio_utils import (
19
+ parse_path,
20
+ read_from_stored_zip,
21
+ is_sf_audio_data,
22
+ )
23
+ from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
24
+
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class RawAudioDataset(FairseqDataset):
30
+ def __init__(
31
+ self,
32
+ sample_rate,
33
+ max_sample_size=None,
34
+ min_sample_size=0,
35
+ shuffle=True,
36
+ pad=False,
37
+ normalize=False,
38
+ compute_mask_indices=False,
39
+ **mask_compute_kwargs,
40
+ ):
41
+ super().__init__()
42
+
43
+ self.sample_rate = sample_rate
44
+ self.sizes = []
45
+ self.max_sample_size = (
46
+ max_sample_size if max_sample_size is not None else sys.maxsize
47
+ )
48
+ self.min_sample_size = min_sample_size
49
+ self.pad = pad
50
+ self.shuffle = shuffle
51
+ self.normalize = normalize
52
+ self.compute_mask_indices = compute_mask_indices
53
+ if self.compute_mask_indices:
54
+ self.mask_compute_kwargs = mask_compute_kwargs
55
+ self._features_size_map = {}
56
+ self._C = mask_compute_kwargs["encoder_embed_dim"]
57
+ self._conv_feature_layers = eval(mask_compute_kwargs["conv_feature_layers"])
58
+
59
+ def __getitem__(self, index):
60
+ raise NotImplementedError()
61
+
62
+ def __len__(self):
63
+ return len(self.sizes)
64
+
65
+ def postprocess(self, feats, curr_sample_rate):
66
+ if feats.dim() == 2:
67
+ feats = feats.mean(-1)
68
+
69
+ if curr_sample_rate != self.sample_rate:
70
+ raise Exception(f"sample rate: {curr_sample_rate}, need {self.sample_rate}")
71
+
72
+ assert feats.dim() == 1, feats.dim()
73
+
74
+ if self.normalize:
75
+ with torch.no_grad():
76
+ feats = F.layer_norm(feats, feats.shape)
77
+ return feats
78
+
79
+ def crop_to_max_size(self, wav, target_size):
80
+ size = len(wav)
81
+ diff = size - target_size
82
+ if diff <= 0:
83
+ return wav
84
+
85
+ start = np.random.randint(0, diff + 1)
86
+ end = size - diff + start
87
+ return wav[start:end]
88
+
89
+ def _compute_mask_indices(self, dims, padding_mask):
90
+ B, T, C = dims
91
+ mask_indices, mask_channel_indices = None, None
92
+ if self.mask_compute_kwargs["mask_prob"] > 0:
93
+ mask_indices = compute_mask_indices(
94
+ (B, T),
95
+ padding_mask,
96
+ self.mask_compute_kwargs["mask_prob"],
97
+ self.mask_compute_kwargs["mask_length"],
98
+ self.mask_compute_kwargs["mask_selection"],
99
+ self.mask_compute_kwargs["mask_other"],
100
+ min_masks=2,
101
+ no_overlap=self.mask_compute_kwargs["no_mask_overlap"],
102
+ min_space=self.mask_compute_kwargs["mask_min_space"],
103
+ )
104
+ mask_indices = torch.from_numpy(mask_indices)
105
+ if self.mask_compute_kwargs["mask_channel_prob"] > 0:
106
+ mask_channel_indices = compute_mask_indices(
107
+ (B, C),
108
+ None,
109
+ self.mask_compute_kwargs["mask_channel_prob"],
110
+ self.mask_compute_kwargs["mask_channel_length"],
111
+ self.mask_compute_kwargs["mask_channel_selection"],
112
+ self.mask_compute_kwargs["mask_channel_other"],
113
+ no_overlap=self.mask_compute_kwargs["no_mask_channel_overlap"],
114
+ min_space=self.mask_compute_kwargs["mask_channel_min_space"],
115
+ )
116
+ mask_channel_indices = (
117
+ torch.from_numpy(mask_channel_indices).unsqueeze(1).expand(-1, T, -1)
118
+ )
119
+
120
+ return mask_indices, mask_channel_indices
121
+
122
+ @staticmethod
123
+ def _bucket_tensor(tensor, num_pad, value):
124
+ return F.pad(tensor, (0, num_pad), value=value)
125
+
126
+ def collater(self, samples):
127
+ samples = [s for s in samples if s["source"] is not None]
128
+ if len(samples) == 0:
129
+ return {}
130
+
131
+ sources = [s["source"] for s in samples]
132
+ sizes = [len(s) for s in sources]
133
+
134
+ if self.pad:
135
+ target_size = min(max(sizes), self.max_sample_size)
136
+ else:
137
+ target_size = min(min(sizes), self.max_sample_size)
138
+
139
+ collated_sources = sources[0].new_zeros(len(sources), target_size)
140
+ padding_mask = (
141
+ torch.BoolTensor(collated_sources.shape).fill_(False) if self.pad else None
142
+ )
143
+ for i, (source, size) in enumerate(zip(sources, sizes)):
144
+ diff = size - target_size
145
+ if diff == 0:
146
+ collated_sources[i] = source
147
+ elif diff < 0:
148
+ assert self.pad
149
+ collated_sources[i] = torch.cat(
150
+ [source, source.new_full((-diff,), 0.0)]
151
+ )
152
+ padding_mask[i, diff:] = True
153
+ else:
154
+ collated_sources[i] = self.crop_to_max_size(source, target_size)
155
+
156
+ input = {"source": collated_sources}
157
+ out = {"id": torch.LongTensor([s["id"] for s in samples])}
158
+ if self.pad:
159
+ input["padding_mask"] = padding_mask
160
+
161
+ if hasattr(self, "num_buckets") and self.num_buckets > 0:
162
+ assert self.pad, "Cannot bucket without padding first."
163
+ bucket = max(self._bucketed_sizes[s["id"]] for s in samples)
164
+ num_pad = bucket - collated_sources.size(-1)
165
+ if num_pad:
166
+ input["source"] = self._bucket_tensor(collated_sources, num_pad, 0)
167
+ input["padding_mask"] = self._bucket_tensor(padding_mask, num_pad, True)
168
+
169
+ if self.compute_mask_indices:
170
+ B = input["source"].size(0)
171
+ T = self._get_mask_indices_dims(input["source"].size(-1))
172
+ padding_mask_reshaped = input["padding_mask"].clone()
173
+ extra = padding_mask_reshaped.size(1) % T
174
+ if extra > 0:
175
+ padding_mask_reshaped = padding_mask_reshaped[:, :-extra]
176
+ padding_mask_reshaped = padding_mask_reshaped.view(
177
+ padding_mask_reshaped.size(0), T, -1
178
+ )
179
+ padding_mask_reshaped = padding_mask_reshaped.all(-1)
180
+ input["padding_count"] = padding_mask_reshaped.sum(-1).max().item()
181
+ mask_indices, mask_channel_indices = self._compute_mask_indices(
182
+ (B, T, self._C),
183
+ padding_mask_reshaped,
184
+ )
185
+ input["mask_indices"] = mask_indices
186
+ input["mask_channel_indices"] = mask_channel_indices
187
+ out["sample_size"] = mask_indices.sum().item()
188
+
189
+ out["net_input"] = input
190
+ return out
191
+
192
+ def _get_mask_indices_dims(self, size, padding=0, dilation=1):
193
+ if size not in self._features_size_map:
194
+ L_in = size
195
+ for (_, kernel_size, stride) in self._conv_feature_layers:
196
+ L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
197
+ L_out = 1 + L_out // stride
198
+ L_in = L_out
199
+ self._features_size_map[size] = L_out
200
+ return self._features_size_map[size]
201
+
202
+ def num_tokens(self, index):
203
+ return self.size(index)
204
+
205
+ def size(self, index):
206
+ """Return an example's size as a float or tuple. This value is used when
207
+ filtering a dataset with ``--max-positions``."""
208
+ if self.pad:
209
+ return self.sizes[index]
210
+ return min(self.sizes[index], self.max_sample_size)
211
+
212
+ def ordered_indices(self):
213
+ """Return an ordered list of indices. Batches will be constructed based
214
+ on this order."""
215
+
216
+ if self.shuffle:
217
+ order = [np.random.permutation(len(self))]
218
+ order.append(
219
+ np.minimum(
220
+ np.array(self.sizes),
221
+ self.max_sample_size,
222
+ )
223
+ )
224
+ return np.lexsort(order)[::-1]
225
+ else:
226
+ return np.arange(len(self))
227
+
228
+ def set_bucket_info(self, num_buckets):
229
+ self.num_buckets = num_buckets
230
+ if self.num_buckets > 0:
231
+ self._collated_sizes = np.minimum(
232
+ np.array(self.sizes),
233
+ self.max_sample_size,
234
+ )
235
+ self.buckets = get_buckets(
236
+ self._collated_sizes,
237
+ self.num_buckets,
238
+ )
239
+ self._bucketed_sizes = get_bucketed_sizes(
240
+ self._collated_sizes, self.buckets
241
+ )
242
+ logger.info(
243
+ f"{len(self.buckets)} bucket(s) for the audio dataset: "
244
+ f"{self.buckets}"
245
+ )
246
+
247
+
248
+ class FileAudioDataset(RawAudioDataset):
249
+ def __init__(
250
+ self,
251
+ manifest_path,
252
+ sample_rate,
253
+ max_sample_size=None,
254
+ min_sample_size=0,
255
+ shuffle=True,
256
+ pad=False,
257
+ normalize=False,
258
+ num_buckets=0,
259
+ compute_mask_indices=False,
260
+ text_compression_level=TextCompressionLevel.none,
261
+ **mask_compute_kwargs,
262
+ ):
263
+ super().__init__(
264
+ sample_rate=sample_rate,
265
+ max_sample_size=max_sample_size,
266
+ min_sample_size=min_sample_size,
267
+ shuffle=shuffle,
268
+ pad=pad,
269
+ normalize=normalize,
270
+ compute_mask_indices=compute_mask_indices,
271
+ **mask_compute_kwargs,
272
+ )
273
+
274
+ self.text_compressor = TextCompressor(level=text_compression_level)
275
+
276
+ skipped = 0
277
+ self.fnames = []
278
+ sizes = []
279
+ self.skipped_indices = set()
280
+
281
+ with open(manifest_path, "r") as f:
282
+ self.root_dir = f.readline().strip()
283
+ for i, line in enumerate(f):
284
+ items = line.strip().split("\t")
285
+ assert len(items) == 2, line
286
+ sz = int(items[1])
287
+ if min_sample_size is not None and sz < min_sample_size:
288
+ skipped += 1
289
+ self.skipped_indices.add(i)
290
+ continue
291
+ self.fnames.append(self.text_compressor.compress(items[0]))
292
+ sizes.append(sz)
293
+ logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples")
294
+
295
+ self.sizes = np.array(sizes, dtype=np.int64)
296
+
297
+ try:
298
+ import pyarrow
299
+
300
+ self.fnames = pyarrow.array(self.fnames)
301
+ except:
302
+ logger.debug(
303
+ "Could not create a pyarrow array. Please install pyarrow for better performance"
304
+ )
305
+ pass
306
+
307
+ self.set_bucket_info(num_buckets)
308
+
309
+ def __getitem__(self, index):
310
+ import soundfile as sf
311
+
312
+ fn = self.fnames[index]
313
+ fn = fn if isinstance(self.fnames, list) else fn.as_py()
314
+ fn = self.text_compressor.decompress(fn)
315
+ path_or_fp = os.path.join(self.root_dir, fn)
316
+ _path, slice_ptr = parse_path(path_or_fp)
317
+ if len(slice_ptr) == 2:
318
+ byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
319
+ assert is_sf_audio_data(byte_data)
320
+ path_or_fp = io.BytesIO(byte_data)
321
+
322
+ wav, curr_sample_rate = sf.read(path_or_fp, dtype="float32")
323
+
324
+ feats = torch.from_numpy(wav).float()
325
+ feats = self.postprocess(feats, curr_sample_rate)
326
+ return {"id": index, "source": feats}
327
+
328
+
329
+ class BinarizedAudioDataset(RawAudioDataset):
330
+ def __init__(
331
+ self,
332
+ data_dir,
333
+ split,
334
+ sample_rate,
335
+ max_sample_size=None,
336
+ min_sample_size=0,
337
+ shuffle=True,
338
+ pad=False,
339
+ normalize=False,
340
+ num_buckets=0,
341
+ compute_mask_indices=False,
342
+ **mask_compute_kwargs,
343
+ ):
344
+ super().__init__(
345
+ sample_rate=sample_rate,
346
+ max_sample_size=max_sample_size,
347
+ min_sample_size=min_sample_size,
348
+ shuffle=shuffle,
349
+ pad=pad,
350
+ normalize=normalize,
351
+ compute_mask_indices=compute_mask_indices,
352
+ **mask_compute_kwargs,
353
+ )
354
+
355
+ from fairseq.data import data_utils, Dictionary
356
+
357
+ self.fnames_dict = Dictionary.load(os.path.join(data_dir, "dict.txt"))
358
+
359
+ root_path = os.path.join(data_dir, f"{split}.root")
360
+ if os.path.exists(root_path):
361
+ with open(root_path, "r") as f:
362
+ self.root_dir = next(f).strip()
363
+ else:
364
+ self.root_dir = None
365
+
366
+ fnames_path = os.path.join(data_dir, split)
367
+ self.fnames = data_utils.load_indexed_dataset(fnames_path, self.fnames_dict)
368
+ lengths_path = os.path.join(data_dir, f"{split}.lengths")
369
+
370
+ with open(lengths_path, "r") as f:
371
+ for line in f:
372
+ sz = int(line.rstrip())
373
+ assert (
374
+ sz >= min_sample_size
375
+ ), f"Min sample size is not supported for binarized dataset, but found a sample with size {sz}"
376
+ self.sizes.append(sz)
377
+
378
+ self.sizes = np.array(self.sizes, dtype=np.int64)
379
+
380
+ self.set_bucket_info(num_buckets)
381
+ logger.info(f"loaded {len(self.fnames)} samples")
382
+
383
+ def __getitem__(self, index):
384
+ import soundfile as sf
385
+
386
+ fname = self.fnames_dict.string(self.fnames[index], separator="")
387
+ if self.root_dir:
388
+ fname = os.path.join(self.root_dir, fname)
389
+
390
+ wav, curr_sample_rate = sf.read(fname)
391
+ feats = torch.from_numpy(wav).float()
392
+ feats = self.postprocess(feats, curr_sample_rate)
393
+ return {"id": index, "source": feats}
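_get_mask_indices_dims above applies the standard 1-D convolution output-length formula, L_out = floor((L_in + 2p - d(k - 1) - 1) / s) + 1, once per configured conv layer; a sketch of that computation with the usual wav2vec 2.0 feature-extractor spec (the layer list is an assumption, not something read from this repo's configs):

def conv_output_length(n_samples, conv_layers, padding=0, dilation=1):
    # L_out = floor((L_in + 2p - d*(k - 1) - 1) / s) + 1, applied layer by layer
    length = n_samples
    for _dim, kernel, stride in conv_layers:
        length = (length + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1
    return length

wav2vec2_layers = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
print(conv_output_length(16000, wav2vec2_layers))  # 49 frames for one second at 16 kHz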
modules/voice_conversion/fairseq/data/audio/speech_to_speech_dataset.py ADDED
@@ -0,0 +1,379 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ import torch
12
+
13
+ from fairseq.data import ConcatDataset, Dictionary
14
+ from fairseq.data import data_utils as fairseq_data_utils
15
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
16
+ from fairseq.data.audio.data_cfg import S2SDataConfig
17
+ from fairseq.data.audio.speech_to_text_dataset import (
18
+ SpeechToTextDataset,
19
+ SpeechToTextDatasetCreator,
20
+ TextTargetMultitaskData,
21
+ _collate_frames,
22
+ )
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ @dataclass
28
+ class SpeechToSpeechDatasetItem(object):
29
+ index: int
30
+ source: torch.Tensor
31
+ target: Optional[torch.Tensor] = None
32
+ target_speaker: Optional[torch.Tensor] = None
33
+ tgt_lang_tag: Optional[int] = None
34
+
35
+
36
+ class SpeechToSpeechDataset(SpeechToTextDataset):
37
+ def __init__(
38
+ self,
39
+ split: str,
40
+ is_train_split: bool,
41
+ data_cfg: S2SDataConfig,
42
+ src_audio_paths: List[str],
43
+ src_n_frames: List[int],
44
+ tgt_audio_paths: List[str],
45
+ tgt_n_frames: List[int],
46
+ src_langs: Optional[List[str]] = None,
47
+ tgt_langs: Optional[List[str]] = None,
48
+ ids: Optional[List[str]] = None,
49
+ target_is_code: bool = False,
50
+ tgt_dict: Dictionary = None,
51
+ n_frames_per_step: int = 1,
52
+ ):
53
+ tgt_texts = tgt_audio_paths if target_is_code else None
54
+ super().__init__(
55
+ split=split,
56
+ is_train_split=is_train_split,
57
+ cfg=data_cfg,
58
+ audio_paths=src_audio_paths,
59
+ n_frames=src_n_frames,
60
+ ids=ids,
61
+ tgt_dict=tgt_dict,
62
+ tgt_texts=tgt_texts,
63
+ src_langs=src_langs,
64
+ tgt_langs=tgt_langs,
65
+ n_frames_per_step=n_frames_per_step,
66
+ )
67
+
68
+ self.tgt_audio_paths = tgt_audio_paths
69
+ self.tgt_lens = [t // self.n_frames_per_step for t in tgt_n_frames]
70
+
71
+ assert not target_is_code or tgt_dict is not None
72
+ self.target_is_code = target_is_code
73
+
74
+ assert len(tgt_audio_paths) == self.n_samples
75
+ assert len(tgt_n_frames) == self.n_samples
76
+
77
+ self.tgt_speakers = None
78
+ if self.cfg.target_speaker_embed:
79
+ samples = SpeechToTextDatasetCreator._load_samples_from_tsv(
80
+ self.cfg.target_speaker_embed, split
81
+ )
82
+ spk_emb_dict = {s["id"]: s["speaker_embed"] for s in samples}
83
+ self.tgt_speakers = [spk_emb_dict[id] for id in self.ids]
84
+ assert len(self.tgt_speakers) == self.n_samples
85
+
86
+ logger.info(self.__repr__())
87
+
88
+ def pack_units(self, input: torch.Tensor) -> torch.Tensor:
89
+ if self.n_frames_per_step <= 1:
90
+ return input
91
+
92
+ offset = 4
93
+ vocab_size = (
94
+ len(self.tgt_dict) - offset
95
+ ) # remove offset from <bos>, <pad>, <eos>, <unk>, which is specific to fairseq dictionary
96
+
97
+ assert input.dim() == 1
98
+ stacked_input = (
99
+ input[:-1].view(-1, self.n_frames_per_step) - offset
100
+ ) # remove <eos>
101
+ scale = [
102
+ pow(vocab_size, self.n_frames_per_step - 1 - i)
103
+ for i in range(self.n_frames_per_step)
104
+ ]
105
+ scale = torch.LongTensor(scale).squeeze(0)
106
+ res = input.new((len(input) - 1) // self.n_frames_per_step + 1).fill_(input[-1])
107
+ res[:-1] = (stacked_input * scale).sum(dim=1) + offset
108
+
109
+ return res
110
+
111
+ def __getitem__(self, index: int) -> SpeechToSpeechDatasetItem:
112
+ source = self._get_source_audio(index)
113
+
114
+ tgt_lang_tag = None
115
+ if self.cfg.prepend_tgt_lang_tag_as_bos:
116
+ # prepend_tgt_lang_tag_as_bos: put tgt_lang_tag as bos of target
117
+ tgt_lang_tag = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
118
+
119
+ if not self.target_is_code:
120
+ target = get_features_or_waveform(self.tgt_audio_paths[index])
121
+ target = torch.from_numpy(target).float()
122
+ target = self.pack_frames(target)
123
+ else:
124
+ target = self.tgt_dict.encode_line(
125
+ self.tgt_audio_paths[index],
126
+ add_if_not_exist=False,
127
+ append_eos=True,
128
+ ).long()
129
+ if self.n_frames_per_step > 1:
130
+ n_tgt_frame = target.size(0) - 1 # exclude <eos>
131
+ keep_n_tgt_frame = n_tgt_frame - n_tgt_frame % self.n_frames_per_step
132
+ target = torch.cat(
133
+ (
134
+ target[:keep_n_tgt_frame],
135
+ target.new_full((1,), self.tgt_dict.eos()),
136
+ ),
137
+ dim=0,
138
+ )
139
+
140
+ if self.tgt_speakers:
141
+ tgt_spk = get_features_or_waveform(self.tgt_speakers[index])
142
+ tgt_spk = torch.from_numpy(tgt_spk).float()
143
+ else:
144
+ tgt_spk = torch.FloatTensor([])
145
+
146
+ return SpeechToSpeechDatasetItem(
147
+ index=index,
148
+ source=source,
149
+ target=target,
150
+ target_speaker=tgt_spk,
151
+ tgt_lang_tag=tgt_lang_tag,
152
+ )
153
+
154
+ def _collate_target(self, samples: List[SpeechToSpeechDatasetItem]) -> torch.Tensor:
155
+ if self.target_is_code:
156
+ target = fairseq_data_utils.collate_tokens(
157
+ [x.target for x in samples],
158
+ self.tgt_dict.pad(),
159
+ self.tgt_dict.eos(),
160
+ left_pad=False,
161
+ move_eos_to_beginning=False,
162
+ )
163
+ # convert stacked units to a single id
164
+ pack_targets = [self.pack_units(x.target) for x in samples]
165
+ prev_output_tokens = fairseq_data_utils.collate_tokens(
166
+ pack_targets,
167
+ self.tgt_dict.pad(),
168
+ self.tgt_dict.eos(),
169
+ left_pad=False,
170
+ move_eos_to_beginning=True,
171
+ )
172
+ target_lengths = torch.tensor(
173
+ [x.size(0) for x in pack_targets], dtype=torch.long
174
+ )
175
+ else:
176
+ target = _collate_frames([x.target for x in samples], is_audio_input=False)
177
+ bsz, _, d = target.size()
178
+ prev_output_tokens = torch.cat(
179
+ (target.new_full((bsz, 1, d), 0.0), target[:, :-1, :]), dim=1
180
+ )
181
+ target_lengths = torch.tensor(
182
+ [x.target.size(0) for x in samples], dtype=torch.long
183
+ )
184
+
185
+ return target, prev_output_tokens, target_lengths
186
+
187
+ def collater(
188
+ self, samples: List[SpeechToSpeechDatasetItem], return_order: bool = False
189
+ ) -> Dict:
190
+ if len(samples) == 0:
191
+ return {}
192
+ indices = torch.tensor([x.index for x in samples], dtype=torch.long)
193
+ frames = _collate_frames([x.source for x in samples], self.cfg.use_audio_input)
194
+ # sort samples by descending number of frames
195
+ n_frames = torch.tensor([x.source.size(0) for x in samples], dtype=torch.long)
196
+ n_frames, order = n_frames.sort(descending=True)
197
+ indices = indices.index_select(0, order)
198
+ frames = frames.index_select(0, order)
199
+
200
+ target, prev_output_tokens, target_lengths = self._collate_target(samples)
201
+ target = target.index_select(0, order)
202
+ target_lengths = target_lengths.index_select(0, order)
203
+ prev_output_tokens = prev_output_tokens.index_select(0, order)
204
+ ntokens = sum(x.target.size(0) for x in samples)
205
+
206
+ tgt_speakers = None
207
+ if self.cfg.target_speaker_embed:
208
+ tgt_speakers = _collate_frames(
209
+ [x.target_speaker for x in samples], is_audio_input=True
210
+ ).index_select(0, order)
211
+
212
+ net_input = {
213
+ "src_tokens": frames,
214
+ "src_lengths": n_frames,
215
+ "prev_output_tokens": prev_output_tokens,
216
+ "tgt_speaker": tgt_speakers, # TODO: unify "speaker" and "tgt_speaker"
217
+ }
218
+ if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
219
+ for i in range(len(samples)):
220
+ net_input["prev_output_tokens"][i][0] = samples[order[i]].tgt_lang_tag
221
+ out = {
222
+ "id": indices,
223
+ "net_input": net_input,
224
+ "speaker": tgt_speakers, # to support Tacotron2 loss for speech-to-spectrogram model
225
+ "target": target,
226
+ "target_lengths": target_lengths,
227
+ "ntokens": ntokens,
228
+ "nsentences": len(samples),
229
+ }
230
+ if return_order:
231
+ out["order"] = order
232
+ return out
233
+
234
+
235
+ class SpeechToSpeechMultitaskDataset(SpeechToSpeechDataset):
236
+ def __init__(self, **kwargs):
237
+ super().__init__(**kwargs)
238
+ self.multitask_data = {}
239
+
240
+ def add_multitask_dataset(self, task_name, task_data):
241
+ self.multitask_data[task_name] = task_data
242
+
243
+ def __getitem__(
244
+ self, index: int
245
+ ) -> Tuple[SpeechToSpeechDatasetItem, Dict[str, torch.Tensor]]:
246
+ s2s_data = super().__getitem__(index)
247
+
248
+ multitask_target = {}
249
+ sample_id = self.ids[index]
250
+ tgt_lang = self.tgt_langs[index]
251
+ for task_name, task_dataset in self.multitask_data.items():
252
+ multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang)
253
+
254
+ return s2s_data, multitask_target
255
+
256
+ def collater(
257
+ self, samples: List[Tuple[SpeechToSpeechDatasetItem, Dict[str, torch.Tensor]]]
258
+ ) -> Dict:
259
+ if len(samples) == 0:
260
+ return {}
261
+
262
+ out = super().collater([s for s, _ in samples], return_order=True)
263
+ order = out["order"]
264
+ del out["order"]
265
+
266
+ for task_name, task_dataset in self.multitask_data.items():
267
+ if "multitask" not in out:
268
+ out["multitask"] = {}
269
+ d = [s[task_name] for _, s in samples]
270
+ task_target = task_dataset.collater(d)
271
+ out["multitask"][task_name] = {
272
+ "target": task_target["target"].index_select(0, order),
273
+ "target_lengths": task_target["target_lengths"].index_select(0, order),
274
+ "ntokens": task_target["ntokens"],
275
+ }
276
+ out["multitask"][task_name]["net_input"] = {
277
+ "prev_output_tokens": task_target["prev_output_tokens"].index_select(
278
+ 0, order
279
+ ),
280
+ }
281
+
282
+ return out
283
+
284
+
285
+ class SpeechToSpeechDatasetCreator(object):
286
+ # mandatory columns
287
+ KEY_ID, KEY_SRC_AUDIO, KEY_SRC_N_FRAMES = "id", "src_audio", "src_n_frames"
288
+ KEY_TGT_AUDIO, KEY_TGT_N_FRAMES = "tgt_audio", "tgt_n_frames"
289
+ # optional columns
290
+ KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang"
291
+ # default values
292
+ DEFAULT_LANG = ""
293
+
294
+ @classmethod
295
+ def _from_list(
296
+ cls,
297
+ split_name: str,
298
+ is_train_split,
299
+ samples: List[Dict],
300
+ data_cfg: S2SDataConfig,
301
+ target_is_code: bool = False,
302
+ tgt_dict: Dictionary = None,
303
+ n_frames_per_step: int = 1,
304
+ multitask: Optional[Dict] = None,
305
+ ) -> SpeechToSpeechDataset:
306
+ audio_root = Path(data_cfg.audio_root)
307
+ ids = [s[cls.KEY_ID] for s in samples]
308
+ src_audio_paths = [
309
+ (audio_root / s[cls.KEY_SRC_AUDIO]).as_posix() for s in samples
310
+ ]
311
+ tgt_audio_paths = [
312
+ s[cls.KEY_TGT_AUDIO]
313
+ if target_is_code
314
+ else (audio_root / s[cls.KEY_TGT_AUDIO]).as_posix()
315
+ for s in samples
316
+ ]
317
+ src_n_frames = [int(s[cls.KEY_SRC_N_FRAMES]) for s in samples]
318
+ tgt_n_frames = [int(s[cls.KEY_TGT_N_FRAMES]) for s in samples]
319
+ src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
320
+ tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
321
+
322
+ has_multitask = multitask is not None and len(multitask.keys()) > 0
323
+ dataset_cls = (
324
+ SpeechToSpeechMultitaskDataset if has_multitask else SpeechToSpeechDataset
325
+ )
326
+
327
+ ds = dataset_cls(
328
+ split=split_name,
329
+ is_train_split=is_train_split,
330
+ data_cfg=data_cfg,
331
+ src_audio_paths=src_audio_paths,
332
+ src_n_frames=src_n_frames,
333
+ tgt_audio_paths=tgt_audio_paths,
334
+ tgt_n_frames=tgt_n_frames,
335
+ src_langs=src_langs,
336
+ tgt_langs=tgt_langs,
337
+ ids=ids,
338
+ target_is_code=target_is_code,
339
+ tgt_dict=tgt_dict,
340
+ n_frames_per_step=n_frames_per_step,
341
+ )
342
+
343
+ if has_multitask:
344
+ for task_name, task_obj in multitask.items():
345
+ task_data = TextTargetMultitaskData(
346
+ task_obj.args, split_name, task_obj.target_dictionary
347
+ )
348
+ ds.add_multitask_dataset(task_name, task_data)
349
+ return ds
350
+
351
+ @classmethod
352
+ def from_tsv(
353
+ cls,
354
+ root: str,
355
+ data_cfg: S2SDataConfig,
356
+ splits: str,
357
+ is_train_split: bool,
358
+ epoch: int,
359
+ seed: int,
360
+ target_is_code: bool = False,
361
+ tgt_dict: Dictionary = None,
362
+ n_frames_per_step: int = 1,
363
+ multitask: Optional[Dict] = None,
364
+ ) -> SpeechToSpeechDataset:
365
+ datasets = []
366
+ for split in splits.split(","):
367
+ samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split)
368
+ ds = cls._from_list(
369
+ split_name=split,
370
+ is_train_split=is_train_split,
371
+ samples=samples,
372
+ data_cfg=data_cfg,
373
+ target_is_code=target_is_code,
374
+ tgt_dict=tgt_dict,
375
+ n_frames_per_step=n_frames_per_step,
376
+ multitask=multitask,
377
+ )
378
+ datasets.append(ds)
379
+ return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
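pack_units above folds every n_frames_per_step target units into a single id by positional (base vocab_size) encoding after subtracting the 4-symbol special-token offset; a small sketch of the packing arithmetic for one stack (the vocab size and unit ids are made up):

def pack_stack(units, vocab_size, offset=4):
    # units: one stack of n_frames_per_step raw dictionary indices (specials already excluded)
    packed = 0
    for u in units:
        packed = packed * vocab_size + (u - offset)   # base-vocab_size positional encoding
    return packed + offset

# e.g. 100 discrete units (plus 4 specials) packed two per step
print(pack_stack([10, 27], vocab_size=100))  # (10-4)*100 + (27-4) + 4 = 627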
modules/voice_conversion/fairseq/data/audio/speech_to_text_dataset.py ADDED
@@ -0,0 +1,733 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import csv
7
+ import logging
8
+ import re
9
+ from argparse import Namespace
10
+ from collections import defaultdict
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Tuple, Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+
19
+ from fairseq.data import ConcatDataset, Dictionary, FairseqDataset, ResamplingDataset
20
+ from fairseq.data import data_utils as fairseq_data_utils
21
+ from fairseq.data import encoders
22
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
23
+ from fairseq.data.audio.data_cfg import S2TDataConfig
24
+ from fairseq.data.audio.dataset_transforms import CompositeAudioDatasetTransform
25
+ from fairseq.data.audio.dataset_transforms.concataugment import ConcatAugment
26
+ from fairseq.data.audio.dataset_transforms.noisyoverlapaugment import (
27
+ NoisyOverlapAugment,
28
+ )
29
+ from fairseq.data.audio.feature_transforms import CompositeAudioFeatureTransform
30
+ from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _collate_frames(
36
+ frames: List[torch.Tensor], is_audio_input: bool = False
37
+ ) -> torch.Tensor:
38
+ """
39
+ Convert a list of 2D frames into a padded 3D tensor
40
+ Args:
41
+ frames (list): list of 2D frames of size L[i]*f_dim. Where L[i] is
42
+ length of i-th frame and f_dim is static dimension of features
43
+ Returns:
44
+ 3D tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
45
+ """
46
+ max_len = max(frame.size(0) for frame in frames)
47
+ if is_audio_input:
48
+ out = frames[0].new_zeros((len(frames), max_len))
49
+ else:
50
+ out = frames[0].new_zeros((len(frames), max_len, frames[0].size(1)))
51
+ for i, v in enumerate(frames):
52
+ out[i, : v.size(0)] = v
53
+ return out
54
+
55
+
56
+ def _is_int_or_np_int(n):
57
+ return isinstance(n, int) or (
58
+ isinstance(n, np.generic) and isinstance(n.item(), int)
59
+ )
60
+
61
+
62
+ @dataclass
63
+ class SpeechToTextDatasetItem(object):
64
+ index: int
65
+ source: torch.Tensor
66
+ target: Optional[torch.Tensor] = None
67
+ speaker_id: Optional[int] = None
68
+
69
+
70
+ class SpeechToTextDataset(FairseqDataset):
71
+ LANG_TAG_TEMPLATE = "<lang:{}>"
72
+
73
+ def __init__(
74
+ self,
75
+ split: str,
76
+ is_train_split: bool,
77
+ cfg: S2TDataConfig,
78
+ audio_paths: List[str],
79
+ n_frames: List[int],
80
+ src_texts: Optional[List[str]] = None,
81
+ tgt_texts: Optional[List[str]] = None,
82
+ speakers: Optional[List[str]] = None,
83
+ src_langs: Optional[List[str]] = None,
84
+ tgt_langs: Optional[List[str]] = None,
85
+ ids: Optional[List[str]] = None,
86
+ tgt_dict: Optional[Dictionary] = None,
87
+ pre_tokenizer=None,
88
+ bpe_tokenizer=None,
89
+ n_frames_per_step=1,
90
+ speaker_to_id=None,
91
+ append_eos=True,
92
+ ):
93
+ self.split, self.is_train_split = split, is_train_split
94
+ self.cfg = cfg
95
+ self.audio_paths, self.n_frames = audio_paths, n_frames
96
+ self.n_samples = len(audio_paths)
97
+ assert len(n_frames) == self.n_samples > 0
98
+ assert src_texts is None or len(src_texts) == self.n_samples
99
+ assert tgt_texts is None or len(tgt_texts) == self.n_samples
100
+ assert speakers is None or len(speakers) == self.n_samples
101
+ assert src_langs is None or len(src_langs) == self.n_samples
102
+ assert tgt_langs is None or len(tgt_langs) == self.n_samples
103
+ assert ids is None or len(ids) == self.n_samples
104
+ assert (tgt_dict is None and tgt_texts is None) or (
105
+ tgt_dict is not None and tgt_texts is not None
106
+ )
107
+ self.src_texts, self.tgt_texts = src_texts, tgt_texts
108
+ self.src_langs, self.tgt_langs = src_langs, tgt_langs
109
+ self.speakers = speakers
110
+ self.tgt_dict = tgt_dict
111
+ self.check_tgt_lang_tag()
112
+ self.ids = ids
113
+ self.shuffle = cfg.shuffle if is_train_split else False
114
+
115
+ self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
116
+ self.cfg.get_feature_transforms(split, is_train_split)
117
+ )
118
+ self.waveform_transforms = CompositeAudioWaveformTransform.from_config_dict(
119
+ self.cfg.get_waveform_transforms(split, is_train_split)
120
+ )
121
+ # TODO: add these to data_cfg.py
122
+ self.dataset_transforms = CompositeAudioDatasetTransform.from_config_dict(
123
+ self.cfg.get_dataset_transforms(split, is_train_split)
124
+ )
125
+
126
+ # check proper usage of transforms
127
+ if self.feature_transforms and self.cfg.use_audio_input:
128
+ logger.warning(
129
+ "Feature transforms will not be applied. To use feature transforms, "
130
+ "set use_audio_input as False in config."
131
+ )
132
+
133
+ self.pre_tokenizer = pre_tokenizer
134
+ self.bpe_tokenizer = bpe_tokenizer
135
+ self.n_frames_per_step = n_frames_per_step
136
+ self.speaker_to_id = speaker_to_id
137
+
138
+ self.tgt_lens = self.get_tgt_lens_and_check_oov()
139
+ self.append_eos = append_eos
140
+
141
+ logger.info(self.__repr__())
142
+
143
+ def get_tgt_lens_and_check_oov(self):
144
+ if self.tgt_texts is None:
145
+ return [0 for _ in range(self.n_samples)]
146
+ tgt_lens = []
147
+ n_tokens, n_oov_tokens = 0, 0
148
+ for i in range(self.n_samples):
149
+ tokenized = self.get_tokenized_tgt_text(i).split(" ")
150
+ oov_tokens = [
151
+ t
152
+ for t in tokenized
153
+ if self.tgt_dict.index(t) == self.tgt_dict.unk_index
154
+ ]
155
+ n_tokens += len(tokenized)
156
+ n_oov_tokens += len(oov_tokens)
157
+ tgt_lens.append(len(tokenized))
158
+ logger.info(f"'{self.split}' has {n_oov_tokens / n_tokens * 100:.2f}% OOV")
159
+ return tgt_lens
160
+
161
+ def __repr__(self):
162
+ return (
163
+ self.__class__.__name__
164
+ + f'(split="{self.split}", n_samples={self.n_samples:_}, '
165
+ f"prepend_tgt_lang_tag={self.cfg.prepend_tgt_lang_tag}, "
166
+ f"n_frames_per_step={self.n_frames_per_step}, "
167
+ f"shuffle={self.shuffle}, "
168
+ f"feature_transforms={self.feature_transforms}, "
169
+ f"waveform_transforms={self.waveform_transforms}, "
170
+ f"dataset_transforms={self.dataset_transforms})"
171
+ )
172
+
173
+ @classmethod
174
+ def is_lang_tag(cls, token):
175
+ pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)")
176
+ return re.match(pattern, token)
177
+
178
+ def check_tgt_lang_tag(self):
179
+ if self.cfg.prepend_tgt_lang_tag:
180
+ assert self.tgt_langs is not None and self.tgt_dict is not None
181
+ tgt_lang_tags = [
182
+ self.LANG_TAG_TEMPLATE.format(t) for t in set(self.tgt_langs)
183
+ ]
184
+ assert all(t in self.tgt_dict for t in tgt_lang_tags)
185
+
186
+ @classmethod
187
+ def tokenize(cls, tokenizer, text: str):
188
+ return text if tokenizer is None else tokenizer.encode(text)
189
+
190
+ def get_tokenized_tgt_text(self, index: Union[int, List[int]]):
191
+ if _is_int_or_np_int(index):
192
+ text = self.tgt_texts[index]
193
+ else:
194
+ text = " ".join([self.tgt_texts[i] for i in index])
195
+
196
+ text = self.tokenize(self.pre_tokenizer, text)
197
+ text = self.tokenize(self.bpe_tokenizer, text)
198
+ return text
199
+
200
+ def pack_frames(self, feature: torch.Tensor):
201
+ if self.n_frames_per_step == 1:
202
+ return feature
203
+ n_packed_frames = feature.shape[0] // self.n_frames_per_step
204
+ feature = feature[: self.n_frames_per_step * n_packed_frames]
205
+ return feature.reshape(n_packed_frames, -1)
206
+
207
+ @classmethod
208
+ def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
209
+ lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
210
+ assert lang_tag_idx != dictionary.unk()
211
+ return lang_tag_idx
212
+
213
+ def _get_source_audio(self, index: Union[int, List[int]]) -> torch.Tensor:
214
+ """
215
+ Gives source audio for given index with any relevant transforms
216
+ applied. For ConcatAug, source audios for given indices are
217
+ concatenated in given order.
218
+ Args:
219
+ index (int or List[int]): index—or in the case of ConcatAug,
220
+ indices—to pull the source audio for
221
+ Returns:
222
+ source audios concatenated for given indices with
223
+ relevant transforms applied
224
+ """
225
+ if _is_int_or_np_int(index):
226
+ source = get_features_or_waveform(
227
+ self.audio_paths[index],
228
+ need_waveform=self.cfg.use_audio_input,
229
+ use_sample_rate=self.cfg.use_sample_rate,
230
+ waveform_transforms=self.waveform_transforms,
231
+ )
232
+ else:
233
+ source = np.concatenate(
234
+ [
235
+ get_features_or_waveform(
236
+ self.audio_paths[i],
237
+ need_waveform=self.cfg.use_audio_input,
238
+ use_sample_rate=self.cfg.use_sample_rate,
239
+ waveform_transforms=self.waveform_transforms,
240
+ )
241
+ for i in index
242
+ ]
243
+ )
244
+ if self.cfg.use_audio_input:
245
+ source = torch.from_numpy(source).float()
246
+ if self.cfg.standardize_audio:
247
+ with torch.no_grad():
248
+ source = F.layer_norm(source, source.shape)
249
+ else:
250
+ if self.feature_transforms is not None:
251
+ source = self.feature_transforms(source)
252
+ source = torch.from_numpy(source).float()
253
+ return source
254
+
255
+ def __getitem__(self, index: int) -> SpeechToTextDatasetItem:
256
+ has_concat = self.dataset_transforms.has_transform(ConcatAugment)
257
+ if has_concat:
258
+ concat = self.dataset_transforms.get_transform(ConcatAugment)
259
+ indices = concat.find_indices(index, self.n_frames, self.n_samples)
260
+
261
+ source = self._get_source_audio(indices if has_concat else index)
262
+ source = self.pack_frames(source)
263
+
264
+ target = None
265
+ if self.tgt_texts is not None:
266
+ tokenized = self.get_tokenized_tgt_text(indices if has_concat else index)
267
+ target = self.tgt_dict.encode_line(
268
+ tokenized, add_if_not_exist=False, append_eos=self.append_eos
269
+ ).long()
270
+ if self.cfg.prepend_tgt_lang_tag:
271
+ lang_tag_idx = self.get_lang_tag_idx(
272
+ self.tgt_langs[index], self.tgt_dict
273
+ )
274
+ target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0)
275
+
276
+ if self.cfg.prepend_bos_and_append_tgt_lang_tag:
277
+ bos = torch.LongTensor([self.tgt_dict.bos()])
278
+ lang_tag_idx = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
279
+ assert lang_tag_idx != self.tgt_dict.unk()
280
+ lang_tag_idx = torch.LongTensor([lang_tag_idx])
281
+ target = torch.cat((bos, target, lang_tag_idx), 0)
282
+
283
+ speaker_id = None
284
+ if self.speaker_to_id is not None:
285
+ speaker_id = self.speaker_to_id[self.speakers[index]]
286
+ return SpeechToTextDatasetItem(
287
+ index=index, source=source, target=target, speaker_id=speaker_id
288
+ )
289
+
290
+ def __len__(self):
291
+ return self.n_samples
292
+
293
+ def collater(
294
+ self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
295
+ ) -> Dict:
296
+ if len(samples) == 0:
297
+ return {}
298
+ indices = torch.tensor([x.index for x in samples], dtype=torch.long)
299
+
300
+ sources = [x.source for x in samples]
301
+ has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment)
302
+ if has_NOAug and self.cfg.use_audio_input:
303
+ NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment)
304
+ sources = NOAug(sources)
305
+
306
+ frames = _collate_frames(sources, self.cfg.use_audio_input)
307
+ # sort samples by descending number of frames
308
+ n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long)
309
+ n_frames, order = n_frames.sort(descending=True)
310
+ indices = indices.index_select(0, order)
311
+ frames = frames.index_select(0, order)
312
+
313
+ target, target_lengths = None, None
314
+ prev_output_tokens = None
315
+ ntokens = None
316
+ if self.tgt_texts is not None:
317
+ target = fairseq_data_utils.collate_tokens(
318
+ [x.target for x in samples],
319
+ self.tgt_dict.pad(),
320
+ self.tgt_dict.eos(),
321
+ left_pad=False,
322
+ move_eos_to_beginning=False,
323
+ )
324
+ target = target.index_select(0, order)
325
+ target_lengths = torch.tensor(
326
+ [x.target.size(0) for x in samples], dtype=torch.long
327
+ ).index_select(0, order)
328
+ prev_output_tokens = fairseq_data_utils.collate_tokens(
329
+ [x.target for x in samples],
330
+ self.tgt_dict.pad(),
331
+ eos_idx=None,
332
+ left_pad=False,
333
+ move_eos_to_beginning=True,
334
+ )
335
+ prev_output_tokens = prev_output_tokens.index_select(0, order)
336
+ ntokens = sum(x.target.size(0) for x in samples)
337
+
338
+ speaker = None
339
+ if self.speaker_to_id is not None:
340
+ speaker = (
341
+ torch.tensor([s.speaker_id for s in samples], dtype=torch.long)
342
+ .index_select(0, order)
343
+ .view(-1, 1)
344
+ )
345
+
346
+ net_input = {
347
+ "src_tokens": frames,
348
+ "src_lengths": n_frames,
349
+ "prev_output_tokens": prev_output_tokens,
350
+ }
351
+ out = {
352
+ "id": indices,
353
+ "net_input": net_input,
354
+ "speaker": speaker,
355
+ "target": target,
356
+ "target_lengths": target_lengths,
357
+ "ntokens": ntokens,
358
+ "nsentences": len(samples),
359
+ }
360
+ if return_order:
361
+ out["order"] = order
362
+ return out
363
+
364
+ def num_tokens(self, index):
365
+ return self.n_frames[index]
366
+
367
+ def size(self, index):
368
+ return self.n_frames[index], self.tgt_lens[index]
369
+
370
+ @property
371
+ def sizes(self):
372
+ return np.array(self.n_frames)
373
+
374
+ @property
375
+ def can_reuse_epoch_itr_across_epochs(self):
376
+ return True
377
+
378
+ def ordered_indices(self):
379
+ if self.shuffle:
380
+ order = [np.random.permutation(len(self))]
381
+ else:
382
+ order = [np.arange(len(self))]
383
+ # first by descending order of # of frames then by original/random order
384
+ order.append([-n for n in self.n_frames])
385
+ return np.lexsort(order)
386
+
387
+ def prefetch(self, indices):
388
+ raise NotImplementedError  # prefetching is not supported for this dataset
389
+
390
+
391
+ class TextTargetMultitaskData(object):
392
+ # mandatory columns
393
+ KEY_ID, KEY_TEXT = "id", "tgt_text"
394
+ LANG_TAG_TEMPLATE = "<lang:{}>"
395
+
396
+ def __init__(self, args, split, tgt_dict):
397
+ samples = SpeechToTextDatasetCreator._load_samples_from_tsv(args.data, split)
398
+ self.data = {s[self.KEY_ID]: s[self.KEY_TEXT] for s in samples}
399
+ self.dict = tgt_dict
400
+ self.append_eos = args.decoder_type != "ctc"
401
+ self.pre_tokenizer = self.build_tokenizer(args)
402
+ self.bpe_tokenizer = self.build_bpe(args)
403
+ self.prepend_bos_and_append_tgt_lang_tag = (
404
+ args.prepend_bos_and_append_tgt_lang_tag
405
+ )
406
+ self.eos_token = args.eos_token
407
+ self.lang_tag_mapping = args.get_lang_tag_mapping
408
+
409
+ @classmethod
410
+ def is_lang_tag(cls, token):
411
+ pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)")
412
+ return re.match(pattern, token)
413
+
414
+ @classmethod
415
+ def tokenize(cls, tokenizer, text: str):
416
+ return text if tokenizer is None else tokenizer.encode(text)
417
+
418
+ def get_tokenized_tgt_text(self, index: int):
419
+ text = self.tokenize(self.pre_tokenizer, self.data[index])
420
+ text = self.tokenize(self.bpe_tokenizer, text)
421
+ return text
422
+
423
+ def get_lang_tag_idx(self, lang: str, dictionary: Dictionary):
424
+ lang_tag = self.LANG_TAG_TEMPLATE.format(lang)
425
+ lang_tag = self.lang_tag_mapping.get(lang_tag, lang_tag)
426
+ lang_tag_idx = dictionary.index(lang_tag)
427
+ assert lang_tag_idx != dictionary.unk(), (lang, lang_tag)
428
+ return lang_tag_idx
429
+
430
+ def build_tokenizer(self, args):
431
+ pre_tokenizer = args.config.get("pre_tokenizer")
432
+ if pre_tokenizer is not None:
433
+ logger.info(f"pre-tokenizer: {pre_tokenizer}")
434
+ return encoders.build_tokenizer(Namespace(**pre_tokenizer))
435
+ else:
436
+ return None
437
+
438
+ def build_bpe(self, args):
439
+ bpe_tokenizer = args.config.get("bpe_tokenizer")
440
+ if bpe_tokenizer is not None:
441
+ logger.info(f"tokenizer: {bpe_tokenizer}")
442
+ return encoders.build_bpe(Namespace(**bpe_tokenizer))
443
+ else:
444
+ return None
445
+
446
+ def get(self, sample_id, tgt_lang=None):
447
+ if sample_id in self.data:
448
+ tokenized = self.get_tokenized_tgt_text(sample_id)
449
+ target = self.dict.encode_line(
450
+ tokenized,
451
+ add_if_not_exist=False,
452
+ append_eos=self.append_eos,
453
+ )
454
+ if self.prepend_bos_and_append_tgt_lang_tag:
455
+ bos = torch.LongTensor([self.dict.bos()])
456
+ lang_tag_idx = self.get_lang_tag_idx(tgt_lang, self.dict)
457
+ assert lang_tag_idx != self.dict.unk()
458
+ lang_tag_idx = torch.LongTensor([lang_tag_idx])
459
+ target = torch.cat((bos, target, lang_tag_idx), 0)
460
+ return target
461
+ else:
462
+ logger.warning(f"no target for {sample_id}")
463
+ return torch.IntTensor([])
464
+
465
+ def collater(self, samples: List[torch.Tensor]) -> torch.Tensor:
466
+ out = fairseq_data_utils.collate_tokens(
467
+ samples,
468
+ self.dict.pad(),
469
+ eos_idx=None,
470
+ left_pad=False,
471
+ move_eos_to_beginning=False,
472
+ ).long()
473
+
474
+ prev_out = fairseq_data_utils.collate_tokens(
475
+ samples,
476
+ self.dict.pad(),
477
+ eos_idx=None,
478
+ left_pad=False,
479
+ move_eos_to_beginning=True,
480
+ ).long()
481
+
482
+ target_lengths = torch.tensor([t.size(0) for t in samples], dtype=torch.long)
483
+ ntokens = sum(t.size(0) for t in samples)
484
+
485
+ output = {
486
+ "prev_output_tokens": prev_out,
487
+ "target": out,
488
+ "target_lengths": target_lengths,
489
+ "ntokens": ntokens,
490
+ }
491
+
492
+ return output
493
+
494
+
495
+ class SpeechToTextMultitaskDataset(SpeechToTextDataset):
496
+ def __init__(self, **kwargs):
497
+ super().__init__(**kwargs)
498
+ self.multitask_data = {}
499
+
500
+ def add_multitask_dataset(self, task_name, task_data):
501
+ self.multitask_data[task_name] = task_data
502
+
503
+ def __getitem__(
504
+ self, index: int
505
+ ) -> Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]:
506
+ s2t_data = super().__getitem__(index)
507
+
508
+ multitask_target = {}
509
+ sample_id = self.ids[index]
510
+ tgt_lang = self.tgt_langs[index]
511
+ for task_name, task_dataset in self.multitask_data.items():
512
+ multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang)
513
+
514
+ return s2t_data, multitask_target
515
+
516
+ def collater(
517
+ self, samples: List[Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]]
518
+ ) -> Dict:
519
+ if len(samples) == 0:
520
+ return {}
521
+
522
+ out = super().collater([s for s, _ in samples], return_order=True)
523
+ order = out["order"]
524
+ del out["order"]
525
+
526
+ for task_name, task_dataset in self.multitask_data.items():
527
+ if "multitask" not in out:
528
+ out["multitask"] = {}
529
+ d = [s[task_name] for _, s in samples]
530
+ task_target = task_dataset.collater(d)
531
+ out["multitask"][task_name] = {
532
+ "target": task_target["target"].index_select(0, order),
533
+ "target_lengths": task_target["target_lengths"].index_select(0, order),
534
+ "ntokens": task_target["ntokens"],
535
+ }
536
+ out["multitask"][task_name]["net_input"] = {
537
+ "prev_output_tokens": task_target["prev_output_tokens"].index_select(
538
+ 0, order
539
+ ),
540
+ }
541
+
542
+ return out
543
+
544
+
545
+ class SpeechToTextDatasetCreator(object):
546
+ # mandatory columns
547
+ KEY_ID, KEY_AUDIO, KEY_N_FRAMES = "id", "audio", "n_frames"
548
+ KEY_TGT_TEXT = "tgt_text"
549
+ # optional columns
550
+ KEY_SPEAKER, KEY_SRC_TEXT = "speaker", "src_text"
551
+ KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang"
552
+ # default values
553
+ DEFAULT_SPEAKER = DEFAULT_SRC_TEXT = DEFAULT_LANG = ""
554
+
555
+ @classmethod
556
+ def _from_list(
557
+ cls,
558
+ split_name: str,
559
+ is_train_split,
560
+ samples: List[Dict],
561
+ cfg: S2TDataConfig,
562
+ tgt_dict,
563
+ pre_tokenizer,
564
+ bpe_tokenizer,
565
+ n_frames_per_step,
566
+ speaker_to_id,
567
+ multitask: Optional[Dict] = None,
568
+ ) -> SpeechToTextDataset:
569
+ audio_root = Path(cfg.audio_root)
570
+ ids = [s[cls.KEY_ID] for s in samples]
571
+ audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
572
+ n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
573
+ tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
574
+ src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
575
+ speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
576
+ src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
577
+ tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
578
+
579
+ has_multitask = multitask is not None and len(multitask.keys()) > 0
580
+ dataset_cls = (
581
+ SpeechToTextMultitaskDataset if has_multitask else SpeechToTextDataset
582
+ )
583
+
584
+ ds = dataset_cls(
585
+ split=split_name,
586
+ is_train_split=is_train_split,
587
+ cfg=cfg,
588
+ audio_paths=audio_paths,
589
+ n_frames=n_frames,
590
+ src_texts=src_texts,
591
+ tgt_texts=tgt_texts,
592
+ speakers=speakers,
593
+ src_langs=src_langs,
594
+ tgt_langs=tgt_langs,
595
+ ids=ids,
596
+ tgt_dict=tgt_dict,
597
+ pre_tokenizer=pre_tokenizer,
598
+ bpe_tokenizer=bpe_tokenizer,
599
+ n_frames_per_step=n_frames_per_step,
600
+ speaker_to_id=speaker_to_id,
601
+ )
602
+
603
+ if has_multitask:
604
+ for task_name, task_obj in multitask.items():
605
+ task_data = TextTargetMultitaskData(
606
+ task_obj.args, split_name, task_obj.target_dictionary
607
+ )
608
+ ds.add_multitask_dataset(task_name, task_data)
609
+ return ds
610
+
611
+ @classmethod
612
+ def get_size_ratios(
613
+ cls, datasets: List[SpeechToTextDataset], alpha: float = 1.0
614
+ ) -> List[float]:
615
+ """Size ratios for temperature-based sampling
616
+ (https://arxiv.org/abs/1907.05019)"""
617
+
618
+ id_to_lp, lp_to_sz = {}, defaultdict(int)
619
+ for ds in datasets:
620
+ lang_pairs = {f"{s}->{t}" for s, t in zip(ds.src_langs, ds.tgt_langs)}
621
+ assert len(lang_pairs) == 1
622
+ lang_pair = list(lang_pairs)[0]
623
+ id_to_lp[ds.split] = lang_pair
624
+ lp_to_sz[lang_pair] += sum(ds.n_frames)
625
+
626
+ sz_sum = sum(v for v in lp_to_sz.values())
627
+ lp_to_prob = {k: v / sz_sum for k, v in lp_to_sz.items()}
628
+ lp_to_tgt_prob = {k: v**alpha for k, v in lp_to_prob.items()}
629
+ prob_sum = sum(v for v in lp_to_tgt_prob.values())
630
+ lp_to_tgt_prob = {k: v / prob_sum for k, v in lp_to_tgt_prob.items()}
631
+ lp_to_sz_ratio = {
632
+ k: (lp_to_tgt_prob[k] * sz_sum) / v for k, v in lp_to_sz.items()
633
+ }
634
+ size_ratio = [lp_to_sz_ratio[id_to_lp[ds.split]] for ds in datasets]
635
+
636
+ p_formatted = {
637
+ k: f"{lp_to_prob[k]:.3f}->{lp_to_tgt_prob[k]:.3f}" for k in lp_to_sz
638
+ }
639
+ logger.info(f"sampling probability balancing: {p_formatted}")
640
+ sr_formatted = {ds.split: f"{r:.3f}" for ds, r in zip(datasets, size_ratio)}
641
+ logger.info(f"balanced sampling size ratio: {sr_formatted}")
642
+ return size_ratio
643
+
644
+ @classmethod
645
+ def _load_samples_from_tsv(cls, root: str, split: str):
646
+ tsv_path = Path(root) / f"{split}.tsv"
647
+ if not tsv_path.is_file():
648
+ raise FileNotFoundError(f"Dataset not found: {tsv_path}")
649
+ with open(tsv_path) as f:
650
+ reader = csv.DictReader(
651
+ f,
652
+ delimiter="\t",
653
+ quotechar=None,
654
+ doublequote=False,
655
+ lineterminator="\n",
656
+ quoting=csv.QUOTE_NONE,
657
+ )
658
+ samples = [dict(e) for e in reader]
659
+ if len(samples) == 0:
660
+ raise ValueError(f"Empty manifest: {tsv_path}")
661
+ return samples
662
+
663
+ @classmethod
664
+ def _from_tsv(
665
+ cls,
666
+ root: str,
667
+ cfg: S2TDataConfig,
668
+ split: str,
669
+ tgt_dict,
670
+ is_train_split: bool,
671
+ pre_tokenizer,
672
+ bpe_tokenizer,
673
+ n_frames_per_step,
674
+ speaker_to_id,
675
+ multitask: Optional[Dict] = None,
676
+ ) -> SpeechToTextDataset:
677
+ samples = cls._load_samples_from_tsv(root, split)
678
+ return cls._from_list(
679
+ split,
680
+ is_train_split,
681
+ samples,
682
+ cfg,
683
+ tgt_dict,
684
+ pre_tokenizer,
685
+ bpe_tokenizer,
686
+ n_frames_per_step,
687
+ speaker_to_id,
688
+ multitask,
689
+ )
690
+
691
+ @classmethod
692
+ def from_tsv(
693
+ cls,
694
+ root: str,
695
+ cfg: S2TDataConfig,
696
+ splits: str,
697
+ tgt_dict,
698
+ pre_tokenizer,
699
+ bpe_tokenizer,
700
+ is_train_split: bool,
701
+ epoch: int,
702
+ seed: int,
703
+ n_frames_per_step: int = 1,
704
+ speaker_to_id=None,
705
+ multitask: Optional[Dict] = None,
706
+ ) -> SpeechToTextDataset:
707
+ datasets = [
708
+ cls._from_tsv(
709
+ root=root,
710
+ cfg=cfg,
711
+ split=split,
712
+ tgt_dict=tgt_dict,
713
+ is_train_split=is_train_split,
714
+ pre_tokenizer=pre_tokenizer,
715
+ bpe_tokenizer=bpe_tokenizer,
716
+ n_frames_per_step=n_frames_per_step,
717
+ speaker_to_id=speaker_to_id,
718
+ multitask=multitask,
719
+ )
720
+ for split in splits.split(",")
721
+ ]
722
+
723
+ if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:
724
+ # temperature-based sampling
725
+ size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
726
+ datasets = [
727
+ ResamplingDataset(
728
+ d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)
729
+ )
730
+ for r, d in zip(size_ratios, datasets)
731
+ ]
732
+
733
+ return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
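
For orientation, a minimal usage sketch of the dataset creator added above (not part of the uploaded file). The data root, split name and file names are placeholders; it assumes the vendored `fairseq` package is importable under that name, as the module's own imports expect, and that `S2TDataConfig` takes the path to the YAML config as in the bundled `data_cfg.py`.

```python
# Hedged usage sketch for SpeechToTextDatasetCreator.from_tsv (placeholder paths).
from pathlib import Path

from fairseq.data import Dictionary
from fairseq.data.audio.data_cfg import S2TDataConfig
from fairseq.data.audio.speech_to_text_dataset import SpeechToTextDatasetCreator

DATA_ROOT = "/path/to/s2t_data"  # must contain config.yaml, dict.txt and dev.tsv

cfg = S2TDataConfig(Path(DATA_ROOT) / "config.yaml")
tgt_dict = Dictionary.load(str(Path(DATA_ROOT) / cfg.vocab_filename))

# dev.tsv needs the mandatory columns defined above: id, audio, n_frames, tgt_text.
dataset = SpeechToTextDatasetCreator.from_tsv(
    root=DATA_ROOT,
    cfg=cfg,
    splits="dev",            # comma-separated list of {split}.tsv manifests
    tgt_dict=tgt_dict,
    pre_tokenizer=None,      # optional fairseq encoders; None means raw text
    bpe_tokenizer=None,
    is_train_split=False,
    epoch=1,
    seed=1,
)

batch = dataset.collater([dataset[i] for i in range(2)])
print(batch["net_input"]["src_tokens"].shape, batch["target_lengths"])
```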
modules/voice_conversion/fairseq/data/audio/speech_to_text_joint_dataset.py ADDED
@@ -0,0 +1,359 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import Dict, List, NamedTuple, Optional
9
+
10
+ import torch
11
+
12
+ from fairseq.data import ConcatDataset, Dictionary, ResamplingDataset
13
+ from fairseq.data import data_utils as fairseq_data_utils
14
+ from fairseq.data.audio.speech_to_text_dataset import (
15
+ S2TDataConfig,
16
+ SpeechToTextDataset,
17
+ SpeechToTextDatasetCreator,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class S2TJointDataConfig(S2TDataConfig):
24
+ """Wrapper class for data config YAML"""
25
+
26
+ @property
27
+ def src_vocab_filename(self):
28
+ """fairseq vocabulary file under data root"""
29
+ return self.config.get("src_vocab_filename", "src_dict.txt")
30
+
31
+ @property
32
+ def src_pre_tokenizer(self) -> Dict:
33
+ """Pre-tokenizer to apply before subword tokenization. Returning
34
+ a dictionary with `tokenizer` providing the tokenizer name and
35
+ the other items providing the tokenizer-specific arguments.
36
+ Tokenizers are defined in `fairseq.data.encoders.*`"""
37
+ return self.config.get("src_pre_tokenizer", {"tokenizer": None})
38
+
39
+ @property
40
+ def src_bpe_tokenizer(self) -> Dict:
41
+ """Subword tokenizer to apply on source text after pre-tokenization.
42
+ Returning a dictionary with `bpe` providing the tokenizer name and
43
+ the other items providing the tokenizer-specific arguments.
44
+ Tokenizers are defined in `fairseq.data.encoders.*`"""
45
+ return self.config.get("src_bpe_tokenizer", {"bpe": None})
46
+
47
+ @property
48
+ def prepend_tgt_lang_tag_no_change(self) -> bool:
49
+ """Prepend target lang ID token as the prev_output_tokens BOS (e.g. for
50
+ to-many multilingual setting). No change needed during inference.
51
+ This option is deprecated and replaced by prepend_tgt_lang_tag_as_bos.
52
+ """
53
+ value = self.config.get("prepend_tgt_lang_tag_no_change", None)
54
+ if value is None:
55
+ return self.config.get("prepend_tgt_lang_tag_as_bos", False)
56
+ return value
57
+
58
+ @property
59
+ def sampling_text_alpha(self):
60
+ """Hyper-parameter alpha = 1/T for temperature-based resampling. (text
61
+ input only) (alpha = 1 for no resampling)"""
62
+ return self.config.get("sampling_text_alpha", 1.0)
63
+
64
+
65
+ class SpeechToTextJointDatasetItem(NamedTuple):
66
+ index: int
67
+ source: torch.Tensor
68
+ target: Optional[torch.Tensor] = None
69
+ src_txt_tokens: Optional[torch.Tensor] = None
70
+ tgt_lang_tag: Optional[int] = None
71
+ src_lang_tag: Optional[int] = None
72
+ tgt_alignment: Optional[torch.Tensor] = None
73
+
74
+
75
+ # use_src_lang_id:
76
+ # 0: don't use src_lang_id
77
+ # 1: attach src_lang_id to the src_txt_tokens as eos
78
+ class SpeechToTextJointDataset(SpeechToTextDataset):
79
+ def __init__(
80
+ self,
81
+ split: str,
82
+ is_train_split: bool,
83
+ cfg: S2TJointDataConfig,
84
+ audio_paths: List[str],
85
+ n_frames: List[int],
86
+ src_texts: Optional[List[str]] = None,
87
+ tgt_texts: Optional[List[str]] = None,
88
+ speakers: Optional[List[str]] = None,
89
+ src_langs: Optional[List[str]] = None,
90
+ tgt_langs: Optional[List[str]] = None,
91
+ ids: Optional[List[str]] = None,
92
+ tgt_dict: Optional[Dictionary] = None,
93
+ src_dict: Optional[Dictionary] = None,
94
+ pre_tokenizer=None,
95
+ bpe_tokenizer=None,
96
+ src_pre_tokenizer=None,
97
+ src_bpe_tokenizer=None,
98
+ append_eos: Optional[bool] = True,
99
+ alignment: Optional[List[str]] = None,
100
+ use_src_lang_id: Optional[int] = 0,
101
+ ):
102
+ super().__init__(
103
+ split,
104
+ is_train_split,
105
+ cfg,
106
+ audio_paths,
107
+ n_frames,
108
+ src_texts=src_texts,
109
+ tgt_texts=tgt_texts,
110
+ speakers=speakers,
111
+ src_langs=src_langs,
112
+ tgt_langs=tgt_langs,
113
+ ids=ids,
114
+ tgt_dict=tgt_dict,
115
+ pre_tokenizer=pre_tokenizer,
116
+ bpe_tokenizer=bpe_tokenizer,
117
+ append_eos=append_eos,
118
+ )
119
+
120
+ self.src_dict = src_dict
121
+ self.src_pre_tokenizer = src_pre_tokenizer
122
+ self.src_bpe_tokenizer = src_bpe_tokenizer
123
+ self.alignment = None
124
+ self.use_src_lang_id = use_src_lang_id
125
+ if alignment is not None:
126
+ self.alignment = [
127
+ [float(s) for s in sample.split()] for sample in alignment
128
+ ]
129
+
130
+ def get_tokenized_src_text(self, index: int):
131
+ text = self.tokenize(self.src_pre_tokenizer, self.src_texts[index])
132
+ text = self.tokenize(self.src_bpe_tokenizer, text)
133
+ return text
134
+
135
+ def __getitem__(self, index: int) -> SpeechToTextJointDatasetItem:
136
+ s2t_dataset_item = super().__getitem__(index)
137
+ src_tokens = None
138
+ src_lang_tag = None
139
+ if self.src_texts is not None and self.src_dict is not None:
140
+ src_tokens = self.get_tokenized_src_text(index)
141
+ src_tokens = self.src_dict.encode_line(
142
+ src_tokens, add_if_not_exist=False, append_eos=True
143
+ ).long()
144
+ if self.use_src_lang_id > 0:
145
+ src_lang_tag = self.get_lang_tag_idx(
146
+ self.src_langs[index], self.src_dict
147
+ )
148
+ tgt_lang_tag = None
149
+ if self.cfg.prepend_tgt_lang_tag_no_change:
150
+ # prepend_tgt_lang_tag_no_change: modify prev_output_tokens instead
151
+ tgt_lang_tag = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
152
+ ali = None
153
+ if self.alignment is not None:
154
+ ali = torch.Tensor(self.alignment[index]).float()
155
+
156
+ return SpeechToTextJointDatasetItem(
157
+ index=index,
158
+ source=s2t_dataset_item.source,
159
+ target=s2t_dataset_item.target,
160
+ src_txt_tokens=src_tokens,
161
+ tgt_lang_tag=tgt_lang_tag,
162
+ src_lang_tag=src_lang_tag,
163
+ tgt_alignment=ali,
164
+ )
165
+
166
+ def __len__(self):
167
+ return self.n_samples
168
+
169
+ def collater(self, samples: List[SpeechToTextJointDatasetItem]) -> Dict:
170
+ s2t_out = super().collater(samples, return_order=True)
171
+ if s2t_out == {}:
172
+ return s2t_out
173
+ net_input, order = s2t_out["net_input"], s2t_out["order"]
174
+
175
+ if self.src_texts is not None and self.src_dict is not None:
176
+ src_txt_tokens = fairseq_data_utils.collate_tokens(
177
+ [x.src_txt_tokens for x in samples],
178
+ self.src_dict.pad(),
179
+ self.src_dict.eos(),
180
+ left_pad=False,
181
+ move_eos_to_beginning=False,
182
+ )
183
+ src_txt_lengths = torch.tensor(
184
+ [x.src_txt_tokens.size()[0] for x in samples], dtype=torch.long
185
+ )
186
+ if self.use_src_lang_id > 0:
187
+ src_lang_idxs = torch.tensor(
188
+ [s.src_lang_tag for s in samples], dtype=src_txt_tokens.dtype
189
+ )
190
+ if self.use_src_lang_id == 1: # replace eos with lang_id
191
+ eos_idx = src_txt_lengths - 1
192
+ src_txt_tokens.scatter_(
193
+ 1, eos_idx.view(-1, 1), src_lang_idxs.view(-1, 1)
194
+ )
195
+ else:
196
+ raise NotImplementedError("Implementation is required")
197
+
198
+ src_txt_tokens = src_txt_tokens.index_select(0, order)
199
+ src_txt_lengths = src_txt_lengths.index_select(0, order)
200
+ net_input["src_txt_tokens"] = src_txt_tokens
201
+ net_input["src_txt_lengths"] = src_txt_lengths
202
+
203
+ net_input["alignment"] = None
204
+ if self.alignment is not None:
205
+ max_len = max([s.tgt_alignment.size(0) for s in samples])
206
+ alignment = torch.ones(len(samples), max_len).float()
207
+ for i, s in enumerate(samples):
208
+ cur_len = s.tgt_alignment.size(0)
209
+ alignment[i][:cur_len].copy_(s.tgt_alignment)
210
+ net_input["alignment"] = alignment.index_select(0, order)
211
+
212
+ if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
213
+ for i in range(len(samples)):
214
+ net_input["prev_output_tokens"][i][0] = samples[order[i]].tgt_lang_tag
215
+
216
+ out = {
217
+ "id": s2t_out["id"],
218
+ "net_input": net_input,
219
+ "target": s2t_out["target"],
220
+ "target_lengths": s2t_out["target_lengths"],
221
+ "ntokens": s2t_out["ntokens"],
222
+ "nsentences": len(samples),
223
+ }
224
+ return out
225
+
226
+
227
+ class SpeechToTextJointDatasetCreator(SpeechToTextDatasetCreator):
228
+ KEY_ALIGN = "align"
229
+
230
+ @classmethod
231
+ def _from_list(
232
+ cls,
233
+ split_name: str,
234
+ is_train_split,
235
+ samples: List[Dict],
236
+ cfg: S2TJointDataConfig,
237
+ tgt_dict,
238
+ src_dict,
239
+ pre_tokenizer,
240
+ bpe_tokenizer,
241
+ src_pre_tokenizer,
242
+ src_bpe_tokenizer,
243
+ append_eos,
244
+ use_src_lang_id,
245
+ ) -> SpeechToTextJointDataset:
246
+ audio_root = Path(cfg.audio_root)
247
+ ids = [s[cls.KEY_ID] for s in samples]
248
+ audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
249
+ n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
250
+ tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
251
+ src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
252
+ speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
253
+ src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
254
+ tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
255
+ tgt_alignment = None
256
+ if cls.KEY_ALIGN in samples[0].keys():
257
+ tgt_alignment = [s[cls.KEY_ALIGN] for s in samples]
258
+ return SpeechToTextJointDataset(
259
+ split_name,
260
+ is_train_split,
261
+ cfg,
262
+ audio_paths,
263
+ n_frames,
264
+ src_texts=src_texts,
265
+ tgt_texts=tgt_texts,
266
+ speakers=speakers,
267
+ src_langs=src_langs,
268
+ tgt_langs=tgt_langs,
269
+ ids=ids,
270
+ tgt_dict=tgt_dict,
271
+ src_dict=src_dict,
272
+ pre_tokenizer=pre_tokenizer,
273
+ bpe_tokenizer=bpe_tokenizer,
274
+ src_pre_tokenizer=src_pre_tokenizer,
275
+ src_bpe_tokenizer=src_bpe_tokenizer,
276
+ append_eos=append_eos,
277
+ alignment=tgt_alignment,
278
+ use_src_lang_id=use_src_lang_id,
279
+ )
280
+
281
+ @classmethod
282
+ def _from_tsv(
283
+ cls,
284
+ root: str,
285
+ cfg: S2TJointDataConfig,
286
+ split: str,
287
+ tgt_dict,
288
+ src_dict,
289
+ is_train_split: bool,
290
+ pre_tokenizer,
291
+ bpe_tokenizer,
292
+ src_pre_tokenizer,
293
+ src_bpe_tokenizer,
294
+ append_eos: bool,
295
+ use_src_lang_id: int,
296
+ ) -> SpeechToTextJointDataset:
297
+ samples = cls._load_samples_from_tsv(root, split)
298
+ return cls._from_list(
299
+ split,
300
+ is_train_split,
301
+ samples,
302
+ cfg,
303
+ tgt_dict,
304
+ src_dict,
305
+ pre_tokenizer,
306
+ bpe_tokenizer,
307
+ src_pre_tokenizer,
308
+ src_bpe_tokenizer,
309
+ append_eos,
310
+ use_src_lang_id,
311
+ )
312
+
313
+ @classmethod
314
+ def from_tsv(
315
+ cls,
316
+ root: str,
317
+ cfg: S2TJointDataConfig,
318
+ splits: str,
319
+ tgt_dict,
320
+ src_dict,
321
+ pre_tokenizer,
322
+ bpe_tokenizer,
323
+ src_pre_tokenizer,
324
+ src_bpe_tokenizer,
325
+ is_train_split: bool,
326
+ epoch: int,
327
+ seed: int,
328
+ append_eos: Optional[bool] = True,
329
+ use_src_lang_id: Optional[int] = 0,
330
+ ) -> SpeechToTextJointDataset:
331
+ datasets = [
332
+ cls._from_tsv(
333
+ root,
334
+ cfg,
335
+ split,
336
+ tgt_dict,
337
+ src_dict,
338
+ is_train_split,
339
+ pre_tokenizer,
340
+ bpe_tokenizer,
341
+ src_pre_tokenizer,
342
+ src_bpe_tokenizer,
343
+ append_eos=append_eos,
344
+ use_src_lang_id=use_src_lang_id,
345
+ )
346
+ for split in splits.split(",")
347
+ ]
348
+
349
+ if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:
350
+ # temperature-based sampling
351
+ size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
352
+ datasets = [
353
+ ResamplingDataset(
354
+ d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)
355
+ )
356
+ for r, d in zip(size_ratios, datasets)
357
+ ]
358
+
359
+ return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
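
The extra keys read by `S2TJointDataConfig` are easiest to see next to a concrete config. Below is a small sketch (not part of the uploaded file) that writes an example YAML and reads it back through the properties defined above; the key names mirror those properties, the values are illustrative only, and it assumes the base `S2TDataConfig` constructor takes the YAML path as in the bundled `data_cfg.py`.

```python
# Hedged sketch: example values for the S2TJointDataConfig keys documented above.
import tempfile
from pathlib import Path

import yaml

from fairseq.data.audio.speech_to_text_joint_dataset import S2TJointDataConfig

example = {
    "src_vocab_filename": "src_dict.txt",
    "src_pre_tokenizer": {"tokenizer": "moses", "source_lang": "en"},
    "src_bpe_tokenizer": {"bpe": "sentencepiece", "sentencepiece_model": "spm.model"},
    "prepend_tgt_lang_tag_no_change": True,
    "sampling_text_alpha": 0.5,
}

with tempfile.TemporaryDirectory() as tmp:
    cfg_path = Path(tmp) / "config.yaml"
    cfg_path.write_text(yaml.safe_dump(example))
    cfg = S2TJointDataConfig(cfg_path)
    print(cfg.src_vocab_filename, cfg.src_bpe_tokenizer, cfg.sampling_text_alpha)
```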
modules/voice_conversion/fairseq/data/audio/text_to_speech_dataset.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright (c) 2017-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the LICENSE file in
5
+ # the root directory of this source tree. An additional grant of patent rights
6
+ # can be found in the PATENTS file in the same directory.
7
+
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+ from fairseq.data import Dictionary
16
+ from fairseq.data import data_utils as fairseq_data_utils
17
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
18
+ from fairseq.data.audio.speech_to_text_dataset import (
19
+ S2TDataConfig,
20
+ SpeechToTextDataset,
21
+ SpeechToTextDatasetCreator,
22
+ _collate_frames,
23
+ )
24
+
25
+
26
+ @dataclass
27
+ class TextToSpeechDatasetItem(object):
28
+ index: int
29
+ source: torch.Tensor
30
+ target: Optional[torch.Tensor] = None
31
+ speaker_id: Optional[int] = None
32
+ duration: Optional[torch.Tensor] = None
33
+ pitch: Optional[torch.Tensor] = None
34
+ energy: Optional[torch.Tensor] = None
35
+
36
+
37
+ class TextToSpeechDataset(SpeechToTextDataset):
38
+ def __init__(
39
+ self,
40
+ split: str,
41
+ is_train_split: bool,
42
+ cfg: S2TDataConfig,
43
+ audio_paths: List[str],
44
+ n_frames: List[int],
45
+ src_texts: Optional[List[str]] = None,
46
+ tgt_texts: Optional[List[str]] = None,
47
+ speakers: Optional[List[str]] = None,
48
+ src_langs: Optional[List[str]] = None,
49
+ tgt_langs: Optional[List[str]] = None,
50
+ ids: Optional[List[str]] = None,
51
+ tgt_dict: Optional[Dictionary] = None,
52
+ pre_tokenizer=None,
53
+ bpe_tokenizer=None,
54
+ n_frames_per_step=1,
55
+ speaker_to_id=None,
56
+ durations: Optional[List[List[int]]] = None,
57
+ pitches: Optional[List[str]] = None,
58
+ energies: Optional[List[str]] = None,
59
+ ):
60
+ super(TextToSpeechDataset, self).__init__(
61
+ split,
62
+ is_train_split,
63
+ cfg,
64
+ audio_paths,
65
+ n_frames,
66
+ src_texts=src_texts,
67
+ tgt_texts=tgt_texts,
68
+ speakers=speakers,
69
+ src_langs=src_langs,
70
+ tgt_langs=tgt_langs,
71
+ ids=ids,
72
+ tgt_dict=tgt_dict,
73
+ pre_tokenizer=pre_tokenizer,
74
+ bpe_tokenizer=bpe_tokenizer,
75
+ n_frames_per_step=n_frames_per_step,
76
+ speaker_to_id=speaker_to_id,
77
+ )
78
+ self.durations = durations
79
+ self.pitches = pitches
80
+ self.energies = energies
81
+
82
+ def __getitem__(self, index: int) -> TextToSpeechDatasetItem:
83
+ s2t_item = super().__getitem__(index)
84
+
85
+ duration, pitch, energy = None, None, None
86
+ if self.durations is not None:
87
+ duration = torch.tensor(
88
+ self.durations[index] + [0], dtype=torch.long # pad 0 for EOS
89
+ )
90
+ if self.pitches is not None:
91
+ pitch = get_features_or_waveform(self.pitches[index])
92
+ pitch = torch.from_numpy(
93
+ np.concatenate((pitch, [0])) # pad 0 for EOS
94
+ ).float()
95
+ if self.energies is not None:
96
+ energy = get_features_or_waveform(self.energies[index])
97
+ energy = torch.from_numpy(
98
+ np.concatenate((energy, [0])) # pad 0 for EOS
99
+ ).float()
100
+ return TextToSpeechDatasetItem(
101
+ index=index,
102
+ source=s2t_item.source,
103
+ target=s2t_item.target,
104
+ speaker_id=s2t_item.speaker_id,
105
+ duration=duration,
106
+ pitch=pitch,
107
+ energy=energy,
108
+ )
109
+
110
+ def collater(self, samples: List[TextToSpeechDatasetItem]) -> Dict[str, Any]:
111
+ if len(samples) == 0:
112
+ return {}
113
+
114
+ src_lengths, order = torch.tensor(
115
+ [s.target.shape[0] for s in samples], dtype=torch.long
116
+ ).sort(descending=True)
117
+ id_ = torch.tensor([s.index for s in samples], dtype=torch.long).index_select(
118
+ 0, order
119
+ )
120
+ feat = _collate_frames(
121
+ [s.source for s in samples], self.cfg.use_audio_input
122
+ ).index_select(0, order)
123
+ target_lengths = torch.tensor(
124
+ [s.source.shape[0] for s in samples], dtype=torch.long
125
+ ).index_select(0, order)
126
+
127
+ src_tokens = fairseq_data_utils.collate_tokens(
128
+ [s.target for s in samples],
129
+ self.tgt_dict.pad(),
130
+ self.tgt_dict.eos(),
131
+ left_pad=False,
132
+ move_eos_to_beginning=False,
133
+ ).index_select(0, order)
134
+
135
+ speaker = None
136
+ if self.speaker_to_id is not None:
137
+ speaker = (
138
+ torch.tensor([s.speaker_id for s in samples], dtype=torch.long)
139
+ .index_select(0, order)
140
+ .view(-1, 1)
141
+ )
142
+
143
+ bsz, _, d = feat.size()
144
+ prev_output_tokens = torch.cat(
145
+ (feat.new_zeros((bsz, 1, d)), feat[:, :-1, :]), dim=1
146
+ )
147
+
148
+ durations, pitches, energies = None, None, None
149
+ if self.durations is not None:
150
+ durations = fairseq_data_utils.collate_tokens(
151
+ [s.duration for s in samples], 0
152
+ ).index_select(0, order)
153
+ assert src_tokens.shape[1] == durations.shape[1]
154
+ if self.pitches is not None:
155
+ pitches = _collate_frames([s.pitch for s in samples], True)
156
+ pitches = pitches.index_select(0, order)
157
+ assert src_tokens.shape[1] == pitches.shape[1]
158
+ if self.energies is not None:
159
+ energies = _collate_frames([s.energy for s in samples], True)
160
+ energies = energies.index_select(0, order)
161
+ assert src_tokens.shape[1] == energies.shape[1]
162
+ src_texts = [self.tgt_dict.string(samples[i].target) for i in order]
163
+
164
+ return {
165
+ "id": id_,
166
+ "net_input": {
167
+ "src_tokens": src_tokens,
168
+ "src_lengths": src_lengths,
169
+ "prev_output_tokens": prev_output_tokens,
170
+ },
171
+ "speaker": speaker,
172
+ "target": feat,
173
+ "durations": durations,
174
+ "pitches": pitches,
175
+ "energies": energies,
176
+ "target_lengths": target_lengths,
177
+ "ntokens": sum(target_lengths).item(),
178
+ "nsentences": len(samples),
179
+ "src_texts": src_texts,
180
+ }
181
+
182
+
183
+ class TextToSpeechDatasetCreator(SpeechToTextDatasetCreator):
184
+ KEY_DURATION = "duration"
185
+ KEY_PITCH = "pitch"
186
+ KEY_ENERGY = "energy"
187
+
188
+ @classmethod
189
+ def _from_list(
190
+ cls,
191
+ split_name: str,
192
+ is_train_split,
193
+ samples: List[Dict],
194
+ cfg: S2TDataConfig,
195
+ tgt_dict,
196
+ pre_tokenizer,
197
+ bpe_tokenizer,
198
+ n_frames_per_step,
199
+ speaker_to_id,
200
+ multitask=None,
201
+ ) -> TextToSpeechDataset:
202
+ audio_root = Path(cfg.audio_root)
203
+ ids = [s[cls.KEY_ID] for s in samples]
204
+ audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
205
+ n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
206
+ tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
207
+ src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
208
+ speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
209
+ src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
210
+ tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
211
+
212
+ durations = [s.get(cls.KEY_DURATION, None) for s in samples]
213
+ durations = [
214
+ None if dd is None else [int(d) for d in dd.split(" ")] for dd in durations
215
+ ]
216
+ durations = None if any(dd is None for dd in durations) else durations
217
+
218
+ pitches = [s.get(cls.KEY_PITCH, None) for s in samples]
219
+ pitches = [
220
+ None if pp is None else (audio_root / pp).as_posix() for pp in pitches
221
+ ]
222
+ pitches = None if any(pp is None for pp in pitches) else pitches
223
+
224
+ energies = [s.get(cls.KEY_ENERGY, None) for s in samples]
225
+ energies = [
226
+ None if ee is None else (audio_root / ee).as_posix() for ee in energies
227
+ ]
228
+ energies = None if any(ee is None for ee in energies) else energies
229
+
230
+ return TextToSpeechDataset(
231
+ split_name,
232
+ is_train_split,
233
+ cfg,
234
+ audio_paths,
235
+ n_frames,
236
+ src_texts,
237
+ tgt_texts,
238
+ speakers,
239
+ src_langs,
240
+ tgt_langs,
241
+ ids,
242
+ tgt_dict,
243
+ pre_tokenizer,
244
+ bpe_tokenizer,
245
+ n_frames_per_step,
246
+ speaker_to_id,
247
+ durations,
248
+ pitches,
249
+ energies,
250
+ )
modules/voice_conversion/fairseq/data/audio/waveform_transforms/__init__.py ADDED
@@ -0,0 +1,48 @@
1
+ import os
2
+ from fairseq.data.audio import (
3
+ AudioTransform,
4
+ CompositeAudioTransform,
5
+ import_transforms,
6
+ register_audio_transform,
7
+ )
8
+
9
+
10
+ class AudioWaveformTransform(AudioTransform):
11
+ pass
12
+
13
+
14
+ AUDIO_WAVEFORM_TRANSFORM_REGISTRY = {}
15
+ AUDIO_WAVEFORM_TRANSFORM_CLASS_NAMES = set()
16
+
17
+
18
+ def get_audio_waveform_transform(name):
19
+ return AUDIO_WAVEFORM_TRANSFORM_REGISTRY[name]
20
+
21
+
22
+ def register_audio_waveform_transform(name):
23
+ return register_audio_transform(
24
+ name,
25
+ AudioWaveformTransform,
26
+ AUDIO_WAVEFORM_TRANSFORM_REGISTRY,
27
+ AUDIO_WAVEFORM_TRANSFORM_CLASS_NAMES,
28
+ )
29
+
30
+
31
+ import_transforms(os.path.dirname(__file__), "waveform")
32
+
33
+
34
+ class CompositeAudioWaveformTransform(CompositeAudioTransform):
35
+ @classmethod
36
+ def from_config_dict(cls, config=None):
37
+ return super()._from_config_dict(
38
+ cls,
39
+ "waveform",
40
+ get_audio_waveform_transform,
41
+ CompositeAudioWaveformTransform,
42
+ config,
43
+ )
44
+
45
+ def __call__(self, x, sample_rate):
46
+ for t in self.transforms:
47
+ x, sample_rate = t(x, sample_rate)
48
+ return x, sample_rate
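
New waveform transforms plug into the registry set up above in the same way as the noise augmentations in the following `noiseaugment.py` diff. Here is a small sketch (not part of the uploaded file) of a trivial custom transform; the transform name `gainaugment` and the `gain_db` config key are made up for illustration.

```python
# Hedged sketch: registering a custom waveform transform via the decorator above.
import numpy as np

from fairseq.data.audio.waveform_transforms import (
    AudioWaveformTransform,
    register_audio_waveform_transform,
)


@register_audio_waveform_transform("gainaugment")
class GainAugmentTransform(AudioWaveformTransform):
    @classmethod
    def from_config_dict(cls, config=None):
        _config = {} if config is None else config
        return cls(_config.get("gain_db", 0.0))

    def __init__(self, gain_db: float = 0.0):
        self.scale = 10 ** (gain_db / 20.0)  # dB -> linear amplitude

    def __call__(self, x, sample_rate):
        # Same (waveform, sample_rate) contract as CompositeAudioWaveformTransform.
        return np.asarray(x) * self.scale, sample_rate


if __name__ == "__main__":
    t = GainAugmentTransform.from_config_dict({"gain_db": 6.0})
    wav, sr = t(np.ones(16_000, dtype=np.float32), 16_000)
    print(wav.max(), sr)
```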
modules/voice_conversion/fairseq/data/audio/waveform_transforms/noiseaugment.py ADDED
@@ -0,0 +1,201 @@
1
+ from pathlib import Path
2
+ import numpy as np
3
+ from math import ceil
4
+
5
+ from fairseq.data.audio import rand_uniform
6
+ from fairseq.data.audio.waveform_transforms import (
7
+ AudioWaveformTransform,
8
+ register_audio_waveform_transform,
9
+ )
10
+
11
+ SNR_MIN = 5.0
12
+ SNR_MAX = 15.0
13
+ RATE = 0.25
14
+
15
+ NOISE_RATE = 1.0
16
+ NOISE_LEN_MEAN = 0.2
17
+ NOISE_LEN_STD = 0.05
18
+
19
+
20
+ class NoiseAugmentTransform(AudioWaveformTransform):
21
+ @classmethod
22
+ def from_config_dict(cls, config=None):
23
+ _config = {} if config is None else config
24
+ return cls(
25
+ _config.get("samples_path", None),
26
+ _config.get("snr_min", SNR_MIN),
27
+ _config.get("snr_max", SNR_MAX),
28
+ _config.get("rate", RATE),
29
+ )
30
+
31
+ def __init__(
32
+ self,
33
+ samples_path: str,
34
+ snr_min: float = SNR_MIN,
35
+ snr_max: float = SNR_MAX,
36
+ rate: float = RATE,
37
+ ):
38
+ # Sanity checks
39
+ assert (
40
+ samples_path
41
+ ), "need to provide path to audio samples for noise augmentation"
42
+ assert snr_max >= snr_min, f"empty signal-to-noise range ({snr_min}, {snr_max})"
43
+ assert rate >= 0 and rate <= 1, "rate should be a float between 0 and 1"
44
+
45
+ self.paths = list(Path(samples_path).glob("**/*.wav")) # load music
46
+ self.n_samples = len(self.paths)
47
+ assert self.n_samples > 0, f"no audio files found in {samples_path}"
48
+
49
+ self.snr_min = snr_min
50
+ self.snr_max = snr_max
51
+ self.rate = rate
52
+
53
+ def __repr__(self):
54
+ return (
55
+ self.__class__.__name__
56
+ + "("
57
+ + ", ".join(
58
+ [
59
+ f"n_samples={self.n_samples}",
60
+ f"snr={self.snr_min}-{self.snr_max}dB",
61
+ f"rate={self.rate}",
62
+ ]
63
+ )
64
+ + ")"
65
+ )
66
+
67
+ def pick_sample(self, goal_shape, always_2d=False, use_sample_rate=None):
68
+ from fairseq.data.audio.audio_utils import get_waveform
69
+
70
+ path = self.paths[np.random.randint(0, self.n_samples)]
71
+ sample = get_waveform(
72
+ path, always_2d=always_2d, output_sample_rate=use_sample_rate
73
+ )[0]
74
+
75
+ # Check dimensions match, else silently skip adding noise to sample
76
+ # NOTE: SHOULD THIS QUIT WITH AN ERROR?
77
+ is_2d = len(goal_shape) == 2
78
+ if len(goal_shape) != sample.ndim or (
79
+ is_2d and goal_shape[0] != sample.shape[0]
80
+ ):
81
+ return np.zeros(goal_shape)
82
+
83
+ # Cut/repeat sample to size
84
+ len_dim = len(goal_shape) - 1
85
+ n_repeat = ceil(goal_shape[len_dim] / sample.shape[len_dim])
86
+ repeated = np.tile(sample, [1, n_repeat] if is_2d else n_repeat)
87
+ start = np.random.randint(0, repeated.shape[len_dim] - goal_shape[len_dim] + 1)
88
+ return (
89
+ repeated[:, start : start + goal_shape[len_dim]]
90
+ if is_2d
91
+ else repeated[start : start + goal_shape[len_dim]]
92
+ )
93
+
94
+ def _mix(self, source, noise, snr):
95
+ get_power = lambda x: np.mean(x**2)
96
+ if get_power(noise):
97
+ scl = np.sqrt(
98
+ get_power(source) / (np.power(10, snr / 10) * get_power(noise))
99
+ )
100
+ else:
101
+ scl = 0
102
+ return 1 * source + scl * noise
103
+
104
+ def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):
105
+ return self.pick_sample(goal_shape, always_2d, use_sample_rate)
106
+
107
+ def __call__(self, source, sample_rate):
108
+ if np.random.random() > self.rate:
109
+ return source, sample_rate
110
+
111
+ noise = self._get_noise(
112
+ source.shape, always_2d=True, use_sample_rate=sample_rate
113
+ )
114
+
115
+ return (
116
+ self._mix(source, noise, rand_uniform(self.snr_min, self.snr_max)),
117
+ sample_rate,
118
+ )
119
+
120
+
121
+ @register_audio_waveform_transform("musicaugment")
122
+ class MusicAugmentTransform(NoiseAugmentTransform):
123
+ pass
124
+
125
+
126
+ @register_audio_waveform_transform("backgroundnoiseaugment")
127
+ class BackgroundNoiseAugmentTransform(NoiseAugmentTransform):
128
+ pass
129
+
130
+
131
+ @register_audio_waveform_transform("babbleaugment")
132
+ class BabbleAugmentTransform(NoiseAugmentTransform):
133
+ def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):
134
+ for i in range(np.random.randint(3, 8)):
135
+ speech = self.pick_sample(goal_shape, always_2d, use_sample_rate)
136
+ if i == 0:
137
+ agg_noise = speech
138
+ else: # SNR scaled by i (how many noise signals already in agg_noise)
139
+ agg_noise = self._mix(agg_noise, speech, i)
140
+ return agg_noise
141
+
142
+
143
+ @register_audio_waveform_transform("sporadicnoiseaugment")
144
+ class SporadicNoiseAugmentTransform(NoiseAugmentTransform):
145
+ @classmethod
146
+ def from_config_dict(cls, config=None):
147
+ _config = {} if config is None else config
148
+ return cls(
149
+ _config.get("samples_path", None),
150
+ _config.get("snr_min", SNR_MIN),
151
+ _config.get("snr_max", SNR_MAX),
152
+ _config.get("rate", RATE),
153
+ _config.get("noise_rate", NOISE_RATE),
154
+ _config.get("noise_len_mean", NOISE_LEN_MEAN),
155
+ _config.get("noise_len_std", NOISE_LEN_STD),
156
+ )
157
+
158
+ def __init__(
159
+ self,
160
+ samples_path: str,
161
+ snr_min: float = SNR_MIN,
162
+ snr_max: float = SNR_MAX,
163
+ rate: float = RATE,
164
+ noise_rate: float = NOISE_RATE, # noises per second
165
+ noise_len_mean: float = NOISE_LEN_MEAN, # length of noises in seconds
166
+ noise_len_std: float = NOISE_LEN_STD,
167
+ ):
168
+ super().__init__(samples_path, snr_min, snr_max, rate)
169
+ self.noise_rate = noise_rate
170
+ self.noise_len_mean = noise_len_mean
171
+ self.noise_len_std = noise_len_std
172
+
173
+ def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):
174
+ agg_noise = np.zeros(goal_shape)
175
+ len_dim = len(goal_shape) - 1
176
+ is_2d = len(goal_shape) == 2
177
+
178
+ n_noises = round(self.noise_rate * goal_shape[len_dim] / use_sample_rate)
179
+ start_pointers = [
180
+ round(rand_uniform(0, goal_shape[len_dim])) for _ in range(n_noises)
181
+ ]
182
+
183
+ for start_pointer in start_pointers:
184
+ noise_shape = list(goal_shape)
185
+ len_seconds = np.random.normal(self.noise_len_mean, self.noise_len_std)
186
+ noise_shape[len_dim] = round(max(0, len_seconds) * use_sample_rate)
187
+ end_pointer = start_pointer + noise_shape[len_dim]
188
+ if end_pointer >= goal_shape[len_dim]:
189
+ continue
190
+
191
+ noise = self.pick_sample(noise_shape, always_2d, use_sample_rate)
192
+ if is_2d:
193
+ agg_noise[:, start_pointer:end_pointer] = (
194
+ agg_noise[:, start_pointer:end_pointer] + noise
195
+ )
196
+ else:
197
+ agg_noise[start_pointer:end_pointer] = (
198
+ agg_noise[start_pointer:end_pointer] + noise
199
+ )
200
+
201
+ return agg_noise
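
A short sketch (not part of the uploaded file) of driving one of these transforms directly from a config dict, using the keys read by `from_config_dict` above. The samples directory is a placeholder and must contain at least one mono `.wav` file.

```python
# Hedged sketch: applying MusicAugmentTransform to a dummy waveform.
import numpy as np

from fairseq.data.audio.waveform_transforms.noiseaugment import MusicAugmentTransform

transform = MusicAugmentTransform.from_config_dict(
    {
        "samples_path": "/path/to/background_music",  # placeholder, needs .wav files
        "snr_min": 10.0,
        "snr_max": 20.0,
        "rate": 0.5,  # augment roughly half of the calls
    }
)

waveform = (0.1 * np.random.randn(1, 16_000)).astype(np.float32)  # 1 s @ 16 kHz
augmented, sr = transform(waveform, 16_000)
print(augmented.shape, sr)
```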
modules/voice_conversion/fairseq/data/backtranslation_dataset.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from fairseq import utils
8
+
9
+ from . import FairseqDataset
10
+
11
+
12
+ def backtranslate_samples(samples, collate_fn, generate_fn, cuda=True):
13
+ """Backtranslate a list of samples.
14
+
15
+ Given an input (*samples*) of the form:
16
+
17
+ [{'id': 1, 'source': 'hallo welt'}]
18
+
19
+ this will return:
20
+
21
+ [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}]
22
+
23
+ Args:
24
+ samples (List[dict]): samples to backtranslate. Individual samples are
25
+ expected to have a 'source' key, which will become the 'target'
26
+ after backtranslation.
27
+ collate_fn (callable): function to collate samples into a mini-batch
28
+ generate_fn (callable): function to generate backtranslations
29
+ cuda (bool): use GPU for generation (default: ``True``)
30
+
31
+ Returns:
32
+ List[dict]: an updated list of samples with a backtranslated source
33
+ """
34
+ collated_samples = collate_fn(samples)
35
+ s = utils.move_to_cuda(collated_samples) if cuda else collated_samples
36
+ generated_sources = generate_fn(s)
37
+
38
+ id_to_src = {sample["id"]: sample["source"] for sample in samples}
39
+
40
+ # Go through each tgt sentence in batch and its corresponding best
41
+ # generated hypothesis and create a backtranslation data pair
42
+ # {id: id, source: generated backtranslation, target: original tgt}
43
+ return [
44
+ {
45
+ "id": id.item(),
46
+ "target": id_to_src[id.item()],
47
+ "source": hypos[0]["tokens"].cpu(),
48
+ }
49
+ for id, hypos in zip(collated_samples["id"], generated_sources)
50
+ ]
51
+
52
+
53
+ class BacktranslationDataset(FairseqDataset):
54
+ """
55
+ Sets up a backtranslation dataset which takes a tgt batch, generates
56
+ a src using a tgt-src backtranslation function (*backtranslation_fn*),
57
+ and returns the corresponding `{generated src, input tgt}` batch.
58
+
59
+ Args:
60
+ tgt_dataset (~fairseq.data.FairseqDataset): the dataset to be
61
+ backtranslated. Only the source side of this dataset will be used.
62
+ After backtranslation, the source sentences in this dataset will be
63
+ returned as the targets.
64
+ src_dict (~fairseq.data.Dictionary): the dictionary of backtranslated
65
+ sentences.
66
+ tgt_dict (~fairseq.data.Dictionary, optional): the dictionary of
67
+ sentences to be backtranslated.
68
+ backtranslation_fn (callable, optional): function to call to generate
69
+ backtranslations. This is typically the `generate` method of a
70
+ :class:`~fairseq.sequence_generator.SequenceGenerator` object.
71
+ Pass in None when it is not available at initialization time, and
72
+ use set_backtranslation_fn function to set it when available.
73
+ output_collater (callable, optional): function to call on the
74
+ backtranslated samples to create the final batch
75
+ (default: ``tgt_dataset.collater``).
76
+ cuda: use GPU for generation
77
+ """
78
+
79
+ def __init__(
80
+ self,
81
+ tgt_dataset,
82
+ src_dict,
83
+ tgt_dict=None,
84
+ backtranslation_fn=None,
85
+ output_collater=None,
86
+ cuda=True,
87
+ **kwargs
88
+ ):
89
+ self.tgt_dataset = tgt_dataset
90
+ self.backtranslation_fn = backtranslation_fn
91
+ self.output_collater = (
92
+ output_collater if output_collater is not None else tgt_dataset.collater
93
+ )
94
+ self.cuda = cuda if torch.cuda.is_available() else False
95
+ self.src_dict = src_dict
96
+ self.tgt_dict = tgt_dict
97
+
98
+ def __getitem__(self, index):
99
+ """
100
+ Returns a single sample from *tgt_dataset*. Note that backtranslation is
101
+ not applied in this step; use :func:`collater` instead to backtranslate
102
+ a batch of samples.
103
+ """
104
+ return self.tgt_dataset[index]
105
+
106
+ def __len__(self):
107
+ return len(self.tgt_dataset)
108
+
109
+ def set_backtranslation_fn(self, backtranslation_fn):
110
+ self.backtranslation_fn = backtranslation_fn
111
+
112
+ def collater(self, samples):
113
+ """Merge and backtranslate a list of samples to form a mini-batch.
114
+
115
+ Using the samples from *tgt_dataset*, load a collated target sample to
116
+ feed to the backtranslation model. Then take the backtranslation with
117
+ the best score as the source and the original input as the target.
118
+
119
+ Note: we expect *tgt_dataset* to provide a function `collater()` that
120
+ will collate samples into the format expected by *backtranslation_fn*.
121
+ After backtranslation, we will feed the new list of samples (i.e., the
122
+ `(backtranslated source, original source)` pairs) to *output_collater*
123
+ and return the result.
124
+
125
+ Args:
126
+ samples (List[dict]): samples to backtranslate and collate
127
+
128
+ Returns:
129
+ dict: a mini-batch with keys coming from *output_collater*
130
+ """
131
+ if samples[0].get("is_dummy", False):
132
+ return samples
133
+ samples = backtranslate_samples(
134
+ samples=samples,
135
+ collate_fn=self.tgt_dataset.collater,
136
+ generate_fn=(lambda net_input: self.backtranslation_fn(net_input)),
137
+ cuda=self.cuda,
138
+ )
139
+ return self.output_collater(samples)
140
+
141
+ def num_tokens(self, index):
142
+ """Just use the tgt dataset num_tokens"""
143
+ return self.tgt_dataset.num_tokens(index)
144
+
145
+ def ordered_indices(self):
146
+ """Just use the tgt dataset ordered_indices"""
147
+ return self.tgt_dataset.ordered_indices()
148
+
149
+ def size(self, index):
150
+ """Return an example's size as a float or tuple. This value is used
151
+ when filtering a dataset with ``--max-positions``.
152
+
153
+ Note: we use *tgt_dataset* to approximate the length of the source
154
+ sentence, since we do not know the actual length until after
155
+ backtranslation.
156
+ """
157
+ tgt_size = self.tgt_dataset.size(index)[0]
158
+ return (tgt_size, tgt_size)
159
+
160
+ @property
161
+ def supports_prefetch(self):
162
+ return getattr(self.tgt_dataset, "supports_prefetch", False)
163
+
164
+ def prefetch(self, indices):
165
+ return self.tgt_dataset.prefetch(indices)
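
For orientation, a minimal usage sketch of the class above. `tgt_monolingual_dataset`, `src_dict`, `tgt_dict`, `backward_model`, and `generator` are hypothetical placeholders for objects built elsewhere in a training setup (the generator would typically be a fairseq SequenceGenerator, per the docstring), the import assumes the stock fairseq package layout, and the exact generate() call is a placeholder rather than a verified recipe.

# Hedged sketch: wiring BacktranslationDataset to a backward-model generator.
from fairseq.data import BacktranslationDataset

bt_dataset = BacktranslationDataset(
    tgt_dataset=tgt_monolingual_dataset,  # only its source side is used
    src_dict=src_dict,
    tgt_dict=tgt_dict,
    backtranslation_fn=None,  # the generator may not exist yet at init time
    cuda=True,
)

# Once the backward model and its sequence generator are available:
bt_dataset.set_backtranslation_fn(
    lambda sample: generator.generate([backward_model], sample)
)

# collater() both collates and backtranslates a list of target-side samples,
# returning a {generated source, original target} mini-batch.
batch = bt_dataset.collater([bt_dataset[i] for i in range(8)])
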
modules/voice_conversion/fairseq/data/base_wrapper_dataset.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from torch.utils.data.dataloader import default_collate
7
+
8
+ from . import FairseqDataset
9
+
10
+
11
+ class BaseWrapperDataset(FairseqDataset):
12
+ def __init__(self, dataset):
13
+ super().__init__()
14
+ self.dataset = dataset
15
+
16
+ def __getitem__(self, index):
17
+ return self.dataset[index]
18
+
19
+ def __len__(self):
20
+ return len(self.dataset)
21
+
22
+ def collater(self, samples):
23
+ if hasattr(self.dataset, "collater"):
24
+ return self.dataset.collater(samples)
25
+ else:
26
+ return default_collate(samples)
27
+
28
+ @property
29
+ def sizes(self):
30
+ return self.dataset.sizes
31
+
32
+ def num_tokens(self, index):
33
+ return self.dataset.num_tokens(index)
34
+
35
+ def size(self, index):
36
+ return self.dataset.size(index)
37
+
38
+ def ordered_indices(self):
39
+ return self.dataset.ordered_indices()
40
+
41
+ @property
42
+ def supports_prefetch(self):
43
+ return getattr(self.dataset, "supports_prefetch", False)
44
+
45
+ def attr(self, attr: str, index: int):
46
+ return self.dataset.attr(attr, index)
47
+
48
+ def prefetch(self, indices):
49
+ self.dataset.prefetch(indices)
50
+
51
+ def get_batch_shapes(self):
52
+ return self.dataset.get_batch_shapes()
53
+
54
+ def batch_by_size(
55
+ self,
56
+ indices,
57
+ max_tokens=None,
58
+ max_sentences=None,
59
+ required_batch_size_multiple=1,
60
+ ):
61
+ return self.dataset.batch_by_size(
62
+ indices,
63
+ max_tokens=max_tokens,
64
+ max_sentences=max_sentences,
65
+ required_batch_size_multiple=required_batch_size_multiple,
66
+ )
67
+
68
+ def filter_indices_by_size(self, indices, max_sizes):
69
+ return self.dataset.filter_indices_by_size(indices, max_sizes)
70
+
71
+ @property
72
+ def can_reuse_epoch_itr_across_epochs(self):
73
+ return self.dataset.can_reuse_epoch_itr_across_epochs
74
+
75
+ def set_epoch(self, epoch):
76
+ super().set_epoch(epoch)
77
+ if hasattr(self.dataset, "set_epoch"):
78
+ self.dataset.set_epoch(epoch)
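
BaseWrapperDataset is the delegation base that the wrapper datasets in this upload (including BucketPadLengthDataset below) build on: it forwards every FairseqDataset hook to the wrapped dataset, so a subclass only overrides what it changes. A minimal illustrative subclass, where `inner_dataset` is a hypothetical already-built FairseqDataset whose items are tensors:

# Hedged sketch: a wrapper that only transforms items, inheriting everything else.
from fairseq.data import BaseWrapperDataset

class AddOneDataset(BaseWrapperDataset):
    """Illustrative wrapper: returns each wrapped item with 1 added to its values."""
    def __getitem__(self, index):
        return self.dataset[index] + 1  # sizes, collater, prefetch, etc. stay delegated

wrapped = AddOneDataset(inner_dataset)
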
modules/voice_conversion/fairseq/data/bucket_pad_length_dataset.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
+ from fairseq.data import BaseWrapperDataset
9
+ from fairseq.data.data_utils import get_buckets, get_bucketed_sizes
10
+
11
+
12
+ class BucketPadLengthDataset(BaseWrapperDataset):
13
+ """
14
+ Bucket and pad item lengths to the nearest bucket size. This can be used to
15
+ reduce the number of unique batch shapes, which is important on TPUs since
16
+ each new batch shape requires a recompilation.
17
+
18
+ Args:
19
+ dataset (FairseqDataset): dataset to bucket
20
+ sizes (List[int]): all item sizes
21
+ num_buckets (int): number of buckets to create
22
+ pad_idx (int): padding symbol
23
+ left_pad (bool): if True, pad on the left; otherwise right pad
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ dataset,
29
+ sizes,
30
+ num_buckets,
31
+ pad_idx,
32
+ left_pad,
33
+ tensor_key=None,
34
+ ):
35
+ super().__init__(dataset)
36
+ self.pad_idx = pad_idx
37
+ self.left_pad = left_pad
38
+
39
+ assert num_buckets > 0
40
+ self.buckets = get_buckets(sizes, num_buckets)
41
+ self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)
42
+ self._tensor_key = tensor_key
43
+
44
+ def _set_tensor(self, item, val):
45
+ if self._tensor_key is None:
46
+ return val
47
+ item[self._tensor_key] = val
48
+ return item
49
+
50
+ def _get_tensor(self, item):
51
+ if self._tensor_key is None:
52
+ return item
53
+ return item[self._tensor_key]
54
+
55
+ def _pad(self, tensor, bucket_size, dim=-1):
56
+ num_pad = bucket_size - tensor.size(dim)
57
+ return F.pad(
58
+ tensor,
59
+ (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad),
60
+ value=self.pad_idx,
61
+ )
62
+
63
+ def __getitem__(self, index):
64
+ item = self.dataset[index]
65
+ bucket_size = self._bucketed_sizes[index]
66
+ tensor = self._get_tensor(item)
67
+ padded = self._pad(tensor, bucket_size)
68
+ return self._set_tensor(item, padded)
69
+
70
+ @property
71
+ def sizes(self):
72
+ return self._bucketed_sizes
73
+
74
+ def num_tokens(self, index):
75
+ return self._bucketed_sizes[index]
76
+
77
+ def size(self, index):
78
+ return self._bucketed_sizes[index]
modules/voice_conversion/fairseq/data/codedataset.py ADDED
@@ -0,0 +1,576 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ import random
11
+ from pathlib import Path
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torch.utils.data
16
+
17
+ from . import data_utils
18
+ from fairseq.data.fairseq_dataset import FairseqDataset
19
+
20
+ F0_FRAME_SPACE = 0.005 # sec
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class ExpressiveCodeDataConfig(object):
27
+ def __init__(self, json_path):
28
+ with open(json_path, "r") as f:
29
+ self.config = json.load(f)
30
+ self._manifests = self.config["manifests"]
31
+
32
+ @property
33
+ def manifests(self):
34
+ return self._manifests
35
+
36
+ @property
37
+ def n_units(self):
38
+ return self.config["n_units"]
39
+
40
+ @property
41
+ def sampling_rate(self):
42
+ return self.config["sampling_rate"]
43
+
44
+ @property
45
+ def code_hop_size(self):
46
+ return self.config["code_hop_size"]
47
+
48
+ @property
49
+ def f0_stats(self):
50
+ """pre-computed f0 statistics path"""
51
+ return self.config.get("f0_stats", None)
52
+
53
+ @property
54
+ def f0_vq_type(self):
55
+ """naive or precomp"""
56
+ return self.config["f0_vq_type"]
57
+
58
+ @property
59
+ def f0_vq_name(self):
60
+ return self.config["f0_vq_name"]
61
+
62
+ def get_f0_vq_naive_quantizer(self, log, norm_mean, norm_std):
63
+ key = "log" if log else "linear"
64
+ if norm_mean and norm_std:
65
+ key += "_mean_std_norm"
66
+ elif norm_mean:
67
+ key += "_mean_norm"
68
+ else:
69
+ key += "_none_norm"
70
+ return self.config["f0_vq_naive_quantizer"][key]
71
+
72
+ @property
73
+ def f0_vq_n_units(self):
74
+ return self.config["f0_vq_n_units"]
75
+
76
+ @property
77
+ def multispkr(self):
78
+ """how to parse speaker label from audio path"""
79
+ return self.config.get("multispkr", None)
80
+
81
+
82
+ def get_f0(audio, rate=16000):
83
+ try:
84
+ import amfm_decompy.basic_tools as basic
85
+ import amfm_decompy.pYAAPT as pYAAPT
86
+ from librosa.util import normalize
87
+ except ImportError:
88
+ raise "Please install amfm_decompy (`pip install AMFM-decompy`) and librosa (`pip install librosa`)."
89
+
90
+ assert audio.ndim == 1
91
+ frame_length = 20.0 # ms
92
+ to_pad = int(frame_length / 1000 * rate) // 2
93
+
94
+ audio = normalize(audio) * 0.95
95
+ audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
96
+ audio = basic.SignalObj(audio, rate)
97
+ pitch = pYAAPT.yaapt(
98
+ audio,
99
+ frame_length=frame_length,
100
+ frame_space=F0_FRAME_SPACE * 1000,
101
+ nccf_thresh1=0.25,
102
+ tda_frame_length=25.0,
103
+ )
104
+ f0 = pitch.samp_values
105
+ return f0
106
+
107
+
108
+ def interpolate_f0(f0):
109
+ try:
110
+ from scipy.interpolate import interp1d
111
+ except ImportError:
112
+ raise "Please install scipy (`pip install scipy`)"
113
+
114
+ orig_t = np.arange(f0.shape[0])
115
+ f0_interp = f0[:]
116
+ ii = f0_interp != 0
117
+ if ii.sum() > 1:
118
+ f0_interp = interp1d(
119
+ orig_t[ii], f0_interp[ii], bounds_error=False, kind="linear", fill_value=0
120
+ )(orig_t)
121
+ f0_interp = torch.Tensor(f0_interp).type_as(f0).to(f0.device)
122
+ return f0_interp
123
+
124
+
125
+ def naive_quantize(x, edges):
126
+ bin_idx = (x.view(-1, 1) > edges.view(1, -1)).long().sum(dim=1)
127
+ return bin_idx
128
+
129
+
130
+ def load_wav(full_path):
131
+ try:
132
+ import soundfile as sf
133
+ except ImportError:
134
+ raise "Please install soundfile (`pip install SoundFile`)"
135
+ data, sampling_rate = sf.read(full_path)
136
+ return data, sampling_rate
137
+
138
+
139
+ def parse_code(code_str, dictionary, append_eos):
140
+ code, duration = torch.unique_consecutive(
141
+ torch.ShortTensor(list(map(int, code_str.split()))), return_counts=True
142
+ )
143
+ code = " ".join(map(str, code.tolist()))
144
+ code = dictionary.encode_line(code, append_eos=append_eos).short()  # keyword arg: encode_line's second positional parameter is line_tokenizer
145
+
146
+ if append_eos:
147
+ duration = torch.cat((duration, duration.new_zeros((1,))), dim=0) # eos
148
+ duration = duration.short()
149
+ return code, duration
150
+
151
+
152
+ def parse_manifest(manifest, dictionary):
153
+ audio_files = []
154
+ codes = []
155
+ durations = []
156
+ speakers = []
157
+
158
+ with open(manifest) as info:
159
+ for line in info.readlines():
160
+ sample = eval(line.strip())
161
+ if "cpc_km100" in sample:
162
+ k = "cpc_km100"
163
+ elif "hubert_km100" in sample:
164
+ k = "hubert_km100"
165
+ elif "phone" in sample:
166
+ k = "phone"
167
+ else:
168
+ assert False, "unknown format"
169
+ code = sample[k]
170
+ code, duration = parse_code(code, dictionary, append_eos=True)
171
+
172
+ codes.append(code)
173
+ durations.append(duration)
174
+ audio_files.append(sample["audio"])
175
+ speakers.append(sample.get("speaker", None))
176
+
177
+ return audio_files, codes, durations, speakers
178
+
179
+
180
+ def parse_speaker(path, method):
181
+ if type(path) == str:
182
+ path = Path(path)
183
+
184
+ if method == "parent_name":
185
+ return path.parent.name
186
+ elif method == "parent_parent_name":
187
+ return path.parent.parent.name
188
+ elif method == "_":
189
+ return path.name.split("_")[0]
190
+ elif method == "single":
191
+ return "A"
192
+ elif callable(method):
193
+ return method(path)
194
+ else:
195
+ raise NotImplementedError()
196
+
197
+
198
+ def get_f0_by_filename(filename, tgt_sampling_rate):
199
+ audio, sampling_rate = load_wav(filename)
200
+ if sampling_rate != tgt_sampling_rate:
201
+ raise ValueError(
202
+ "{} SR doesn't match target {} SR".format(sampling_rate, tgt_sampling_rate)
203
+ )
204
+
205
+ # compute un-interpolated f0, and use Ann's interp in __getitem__ if set
206
+ f0 = get_f0(audio, rate=tgt_sampling_rate)
207
+ f0 = torch.from_numpy(f0.astype(np.float32))
208
+ return f0
209
+
210
+
211
+ def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1):
212
+ code_len = durations.sum()
213
+ targ_len = int(f0_code_ratio * code_len)
214
+ diff = f0.size(0) - targ_len
215
+ assert abs(diff) <= tol, (
216
+ f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|"
217
+ f" > {tol} (dur=\n{durations})"
218
+ )
219
+ if diff > 0:
220
+ f0 = f0[:targ_len]
221
+ elif diff < 0:
222
+ f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0)
223
+
224
+ f0_offset = 0.0
225
+ seg_f0s = []
226
+ for dur in durations:
227
+ f0_dur = dur.item() * f0_code_ratio
228
+ seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)]
229
+ seg_f0 = seg_f0[seg_f0 != 0]
230
+ if len(seg_f0) == 0:
231
+ seg_f0 = torch.tensor(0).type(seg_f0.type())
232
+ else:
233
+ seg_f0 = seg_f0.mean()
234
+ seg_f0s.append(seg_f0)
235
+ f0_offset += f0_dur
236
+
237
+ assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}"
238
+ return torch.tensor(seg_f0s)
239
+
240
+
241
+ class Paddings(object):
242
+ def __init__(self, code_val, dur_val=0, f0_val=-2.0):
243
+ self.code = code_val
244
+ self.dur = dur_val
245
+ self.f0 = f0_val
246
+
247
+
248
+ class Shifts(object):
249
+ def __init__(self, shifts_str, pads):
250
+ self._shifts = list(map(int, shifts_str.split(",")))
251
+ assert len(self._shifts) == 2, self._shifts
252
+ assert all(s >= 0 for s in self._shifts)
253
+ self.extra_length = max(s for s in self._shifts)
254
+ self.pads = pads
255
+
256
+ @property
257
+ def dur(self):
258
+ return self._shifts[0]
259
+
260
+ @property
261
+ def f0(self):
262
+ return self._shifts[1]
263
+
264
+ @staticmethod
265
+ def shift_one(seq, left_pad_num, right_pad_num, pad):
266
+ assert seq.ndim == 1
267
+ bos = seq.new_full((left_pad_num,), pad)
268
+ eos = seq.new_full((right_pad_num,), pad)
269
+ seq = torch.cat([bos, seq, eos])
270
+ mask = torch.ones_like(seq).bool()
271
+ mask[left_pad_num : len(seq) - right_pad_num] = 0
272
+ return seq, mask
273
+
274
+ def __call__(self, code, dur, f0):
275
+ if self.extra_length == 0:
276
+ code_mask = torch.zeros_like(code).bool()
277
+ dur_mask = torch.zeros_like(dur).bool()
278
+ f0_mask = torch.zeros_like(f0).bool()
279
+ return code, code_mask, dur, dur_mask, f0, f0_mask
280
+
281
+ code, code_mask = self.shift_one(code, 0, self.extra_length, self.pads.code)
282
+ dur, dur_mask = self.shift_one(
283
+ dur, self.dur, self.extra_length - self.dur, self.pads.dur
284
+ )
285
+ f0, f0_mask = self.shift_one(
286
+ f0, self.f0, self.extra_length - self.f0, self.pads.f0
287
+ )
288
+ return code, code_mask, dur, dur_mask, f0, f0_mask
289
+
290
+
291
+ class CodeDataset(FairseqDataset):
292
+ def __init__(
293
+ self,
294
+ manifest,
295
+ dictionary,
296
+ dur_dictionary,
297
+ f0_dictionary,
298
+ config,
299
+ discrete_dur,
300
+ discrete_f0,
301
+ log_f0,
302
+ normalize_f0_mean,
303
+ normalize_f0_std,
304
+ interpolate_f0,
305
+ return_filename=False,
306
+ strip_filename=True,
307
+ shifts="0,0",
308
+ return_continuous_f0=False,
309
+ ):
310
+ random.seed(1234)
311
+ self.dictionary = dictionary
312
+ self.dur_dictionary = dur_dictionary
313
+ self.f0_dictionary = f0_dictionary
314
+ self.config = config
315
+
316
+ # duration config
317
+ self.discrete_dur = discrete_dur
318
+
319
+ # pitch config
320
+ self.discrete_f0 = discrete_f0
321
+ self.log_f0 = log_f0
322
+ self.normalize_f0_mean = normalize_f0_mean
323
+ self.normalize_f0_std = normalize_f0_std
324
+ self.interpolate_f0 = interpolate_f0
325
+
326
+ self.return_filename = return_filename
327
+ self.strip_filename = strip_filename
328
+ self.f0_code_ratio = config.code_hop_size / (
329
+ config.sampling_rate * F0_FRAME_SPACE
330
+ )
331
+
332
+ # use lazy loading to avoid sharing file handlers across workers
333
+ self.manifest = manifest
334
+ self._codes = None
335
+ self._durs = None
336
+ self._f0s = None
337
+ with open(f"{manifest}.leng.txt", "r") as f:
338
+ lengs = [int(line.rstrip()) for line in f]
339
+ edges = np.cumsum([0] + lengs)
340
+ self.starts, self.ends = edges[:-1], edges[1:]
341
+ with open(f"{manifest}.path.txt", "r") as f:
342
+ self.file_names = [line.rstrip() for line in f]
343
+ logger.info(f"num entries: {len(self.starts)}")
344
+
345
+ if os.path.exists(f"{manifest}.f0_stat.pt"):
346
+ self.f0_stats = torch.load(f"{manifest}.f0_stat.pt")
347
+ elif config.f0_stats:
348
+ self.f0_stats = torch.load(config.f0_stats)
349
+
350
+ self.multispkr = config.multispkr
351
+ if config.multispkr:
352
+ with open(f"{manifest}.speaker.txt", "r") as f:
353
+ self.spkrs = [line.rstrip() for line in f]
354
+ self.id_to_spkr = sorted(self.spkrs)
355
+ self.spkr_to_id = {k: v for v, k in enumerate(self.id_to_spkr)}
356
+
357
+ self.pads = Paddings(
358
+ dictionary.pad(),
359
+ 0, # use 0 for duration padding
360
+ f0_dictionary.pad() if discrete_f0 else -5.0,
361
+ )
362
+ self.shifts = Shifts(shifts, pads=self.pads)
363
+ self.return_continuous_f0 = return_continuous_f0
364
+
365
+ def get_data_handlers(self):
366
+ logging.info(f"loading data for {self.manifest}")
367
+ self._codes = np.load(f"{self.manifest}.code.npy", mmap_mode="r")
368
+ self._durs = np.load(f"{self.manifest}.dur.npy", mmap_mode="r")
369
+
370
+ if self.discrete_f0:
371
+ if self.config.f0_vq_type == "precomp":
372
+ self._f0s = np.load(
373
+ f"{self.manifest}.{self.config.f0_vq_name}.npy", mmap_mode="r"
374
+ )
375
+ elif self.config.f0_vq_type == "naive":
376
+ self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
377
+ quantizers_path = self.config.get_f0_vq_naive_quantizer(
378
+ self.log_f0, self.normalize_f0_mean, self.normalize_f0_std
379
+ )
380
+ quantizers = torch.load(quantizers_path)
381
+ n_units = self.config.f0_vq_n_units
382
+ self._f0_quantizer = torch.from_numpy(quantizers[n_units])
383
+ else:
384
+ raise ValueError(f"f0_vq_type {self.config.f0_vq_type} not supported")
385
+ else:
386
+ self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
387
+
388
+ def preprocess_f0(self, f0, stats):
389
+ """
390
+ 1. interpolate
391
+ 2. log transform (keep unvoiced frame 0)
392
+ """
393
+ # TODO: change this to be dependent on config for naive quantizer
394
+ f0 = f0.clone()
395
+ if self.interpolate_f0:
396
+ f0 = interpolate_f0(f0)
397
+
398
+ mask = f0 != 0 # only process voiced frames
399
+ if self.log_f0:
400
+ f0[mask] = f0[mask].log()
401
+ if self.normalize_f0_mean:
402
+ mean = stats["logf0_mean"] if self.log_f0 else stats["f0_mean"]
403
+ f0[mask] = f0[mask] - mean
404
+ if self.normalize_f0_std:
405
+ std = stats["logf0_std"] if self.log_f0 else stats["f0_std"]
406
+ f0[mask] = f0[mask] / std
407
+ return f0
408
+
409
+ def _get_raw_item(self, index):
410
+ start, end = self.starts[index], self.ends[index]
411
+ if self._codes is None:
412
+ self.get_data_handlers()
413
+ code = torch.from_numpy(np.array(self._codes[start:end])).long()
414
+ dur = torch.from_numpy(np.array(self._durs[start:end]))
415
+ f0 = torch.from_numpy(np.array(self._f0s[start:end]))
416
+ return code, dur, f0
417
+
418
+ def __getitem__(self, index):
419
+ code, dur, f0 = self._get_raw_item(index)
420
+ code = torch.cat([code.new([self.dictionary.bos()]), code])
421
+
422
+ # use 0 for eos and bos
423
+ dur = torch.cat([dur.new([0]), dur])
424
+ if self.discrete_dur:
425
+ dur = self.dur_dictionary.encode_line(
426
+ " ".join(map(str, dur.tolist())), append_eos=False
427
+ ).long()
428
+ else:
429
+ dur = dur.float()
430
+
431
+ # TODO: find a more elegant approach
432
+ raw_f0 = None
433
+ if self.discrete_f0:
434
+ if self.config.f0_vq_type == "precomp":
435
+ f0 = self.f0_dictionary.encode_line(
436
+ " ".join(map(str, f0.tolist())), append_eos=False
437
+ ).long()
438
+ else:
439
+ f0 = f0.float()
440
+ f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]])
441
+ if self.return_continuous_f0:
442
+ raw_f0 = f0
443
+ raw_f0 = torch.cat([raw_f0.new([self.f0_dictionary.bos()]), raw_f0])
444
+ f0 = naive_quantize(f0, self._f0_quantizer)
445
+ f0 = torch.cat([f0.new([self.f0_dictionary.bos()]), f0])
446
+ else:
447
+ f0 = f0.float()
448
+ if self.multispkr:
449
+ f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]])
450
+ else:
451
+ f0 = self.preprocess_f0(f0, self.f0_stats)
452
+ f0 = torch.cat([f0.new([0]), f0])
453
+
454
+ if raw_f0 is not None:
455
+ *_, raw_f0, raw_f0_mask = self.shifts(code, dur, raw_f0)
456
+ else:
457
+ raw_f0_mask = None
458
+
459
+ code, code_mask, dur, dur_mask, f0, f0_mask = self.shifts(code, dur, f0)
460
+ if raw_f0_mask is not None:
461
+ assert (raw_f0_mask == f0_mask).all()
462
+
463
+ # is a padded frame if either input or output is padded
464
+ feats = {
465
+ "source": code[:-1],
466
+ "target": code[1:],
467
+ "mask": code_mask[1:].logical_or(code_mask[:-1]),
468
+ "dur_source": dur[:-1],
469
+ "dur_target": dur[1:],
470
+ "dur_mask": dur_mask[1:].logical_or(dur_mask[:-1]),
471
+ "f0_source": f0[:-1],
472
+ "f0_target": f0[1:],
473
+ "f0_mask": f0_mask[1:].logical_or(f0_mask[:-1]),
474
+ }
475
+
476
+ if raw_f0 is not None:
477
+ feats["raw_f0"] = raw_f0[1:]
478
+
479
+ if self.return_filename:
480
+ fname = self.file_names[index]
481
+ feats["filename"] = (
482
+ fname if not self.strip_filename else Path(fname).with_suffix("").name
483
+ )
484
+ return feats
485
+
486
+ def __len__(self):
487
+ return len(self.starts)
488
+
489
+ def size(self, index):
490
+ return self.ends[index] - self.starts[index] + self.shifts.extra_length
491
+
492
+ def num_tokens(self, index):
493
+ return self.size(index)
494
+
495
+ def collater(self, samples):
496
+ pad_idx, eos_idx = self.dictionary.pad(), self.dictionary.eos()
497
+ if len(samples) == 0:
498
+ return {}
499
+
500
+ src_tokens = data_utils.collate_tokens(
501
+ [s["source"] for s in samples], pad_idx, eos_idx, left_pad=False
502
+ )
503
+
504
+ tgt_tokens = data_utils.collate_tokens(
505
+ [s["target"] for s in samples],
506
+ pad_idx=pad_idx,
507
+ eos_idx=pad_idx, # appending padding, eos is there already
508
+ left_pad=False,
509
+ )
510
+
511
+ src_durs, tgt_durs = [
512
+ data_utils.collate_tokens(
513
+ [s[k] for s in samples],
514
+ pad_idx=self.pads.dur,
515
+ eos_idx=self.pads.dur,
516
+ left_pad=False,
517
+ )
518
+ for k in ["dur_source", "dur_target"]
519
+ ]
520
+
521
+ src_f0s, tgt_f0s = [
522
+ data_utils.collate_tokens(
523
+ [s[k] for s in samples],
524
+ pad_idx=self.pads.f0,
525
+ eos_idx=self.pads.f0,
526
+ left_pad=False,
527
+ )
528
+ for k in ["f0_source", "f0_target"]
529
+ ]
530
+
531
+ mask, dur_mask, f0_mask = [
532
+ data_utils.collate_tokens(
533
+ [s[k] for s in samples],
534
+ pad_idx=1,
535
+ eos_idx=1,
536
+ left_pad=False,
537
+ )
538
+ for k in ["mask", "dur_mask", "f0_mask"]
539
+ ]
540
+
541
+ src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
542
+ n_tokens = sum(len(s["source"]) for s in samples)
543
+
544
+ result = {
545
+ "nsentences": len(samples),
546
+ "ntokens": n_tokens,
547
+ "net_input": {
548
+ "src_tokens": src_tokens,
549
+ "src_lengths": src_lengths,
550
+ "dur_src": src_durs,
551
+ "f0_src": src_f0s,
552
+ },
553
+ "target": tgt_tokens,
554
+ "dur_target": tgt_durs,
555
+ "f0_target": tgt_f0s,
556
+ "mask": mask,
557
+ "dur_mask": dur_mask,
558
+ "f0_mask": f0_mask,
559
+ }
560
+
561
+ if "filename" in samples[0]:
562
+ result["filename"] = [s["filename"] for s in samples]
563
+
564
+ # TODO: remove this hack into the inference dataset
565
+ if "prefix" in samples[0]:
566
+ result["prefix"] = [s["prefix"] for s in samples]
567
+
568
+ if "raw_f0" in samples[0]:
569
+ raw_f0s = data_utils.collate_tokens(
570
+ [s["raw_f0"] for s in samples],
571
+ pad_idx=self.pads.f0,
572
+ eos_idx=self.pads.f0,
573
+ left_pad=False,
574
+ )
575
+ result["raw_f0"] = raw_f0s
576
+ return result
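
CodeDataset is driven entirely by the JSON file that ExpressiveCodeDataConfig parses. Below is a hedged sketch of that layout: every key mirrors an accessor defined above, while all concrete paths and values are purely illustrative.

# Hedged sketch: writing a config JSON in the shape ExpressiveCodeDataConfig expects.
import json

config = {
    "manifests": {"train": "/data/train", "valid": "/data/valid"},
    "n_units": 100,                    # unit vocabulary size
    "sampling_rate": 16000,
    "code_hop_size": 320,              # audio samples per code frame
    "f0_stats": "/data/f0_stats.pt",   # optional pre-computed F0 statistics
    "f0_vq_type": "naive",             # "naive" or "precomp"
    "f0_vq_name": "f0_vq_name",        # used when f0_vq_type == "precomp"
    "f0_vq_naive_quantizer": {         # keyed by log/linear plus normalization
        "log_mean_std_norm": "/data/f0_naive_quantizer.pt",
    },
    "f0_vq_n_units": 32,
    "multispkr": "parent_name",        # optional: how to parse speaker labels
}

with open("code_dataset_config.json", "w") as f:
    json.dump(config, f, indent=2)

Note that, per __init__ and get_data_handlers above, CodeDataset also expects sibling files next to each manifest: <manifest>.leng.txt, <manifest>.path.txt, <manifest>.code.npy, <manifest>.dur.npy, <manifest>.f0.npy, and optionally <manifest>.speaker.txt and <manifest>.f0_stat.pt.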